diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..cbc7ad947c8d2566e0450b4b31a516a84d5c9334 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +sample/lime/first_frame.jpg filter=lfs diff=lfs merge=lfs -text +sample/moving_ball/first_frame.jpg filter=lfs diff=lfs merge=lfs -text diff --git a/app.py b/app.py index 04cc31aa8d0e06aeaac3b59bb361ed71d831e43f..5704575130149acac43c8ccd328bb8b374485f3e 100644 --- a/app.py +++ b/app.py @@ -1,7 +1,281 @@ +""" +VOID – Video Object and Interaction Deletion +Gradio demo for Hugging Face Spaces (ZeroGPU) +""" + +import os +import sys +import tempfile + +import numpy as np +import torch +import torch.nn.functional as F +import imageio +import mediapy as media +import spaces import gradio as gr +from huggingface_hub import hf_hub_download +from safetensors.torch import load_file +from diffusers import DDIMScheduler +from PIL import Image + +# ── project imports ──────────────────────────────────────────────────────────── +sys.path.insert(0, os.path.dirname(__file__)) + +from videox_fun.models import ( + AutoencoderKLCogVideoX, + CogVideoXTransformer3DModel, + T5EncoderModel, + T5Tokenizer, +) +from videox_fun.pipeline import CogVideoXFunInpaintPipeline +from videox_fun.utils.fp8_optimization import convert_weight_dtype_wrapper +from videox_fun.utils.utils import temporal_padding + +# ── constants ────────────────────────────────────────────────────────────────── +# Set these env vars in your HF Space settings, or hardcode once weights are public. +BASE_MODEL_ID = os.environ.get("BASE_MODEL_ID", "alibaba-pai/CogVideoX-Fun-V1.5-5b-InP") +VOID_MODEL_ID = os.environ.get("VOID_MODEL_ID", "your-hf-username/VOID") +VOID_CKPT_FILE = "void_pass1.safetensors" + +SAMPLE_SIZE = (384, 672) # H × W +MAX_VID_LEN = 197 +TEMPORAL_WIN = 85 +FPS = 12 +WEIGHT_DTYPE = torch.bfloat16 +NEG_PROMPT = ( + "The video is not of a high quality, it has a low resolution. " + "Watermark present in each frame. The background is solid. " + "Strange body and strange trajectory. Distortion." 
+) + +# ── model loading (once at startup, lives in CPU RAM between GPU requests) ───── +print("Loading VOID pipeline …") + +transformer = CogVideoXTransformer3DModel.from_pretrained( + BASE_MODEL_ID, + subfolder="transformer", + low_cpu_mem_usage=True, + torch_dtype=torch.float8_e4m3fn, # qfloat8 to save VRAM + use_vae_mask=True, + stack_mask=False, +).to(WEIGHT_DTYPE) + +# Load VOID Pass-1 checkpoint +ckpt_path = hf_hub_download(repo_id=VOID_MODEL_ID, filename=VOID_CKPT_FILE) +state_dict = load_file(ckpt_path) +state_dict = state_dict.get("state_dict", state_dict) + +# Adapt patch_embed channels if they differ (mask-conditioning channels added) +param_name = "patch_embed.proj.weight" +if state_dict[param_name].size(1) != transformer.state_dict()[param_name].size(1): + feat_dim = 16 * 8 # latent_channels * feat_scale + new_weight = transformer.state_dict()[param_name].clone() + new_weight[:, :feat_dim] = state_dict[param_name][:, :feat_dim] + new_weight[:, -feat_dim:] = state_dict[param_name][:, -feat_dim:] + state_dict[param_name] = new_weight + +transformer.load_state_dict(state_dict, strict=False) + +vae = AutoencoderKLCogVideoX.from_pretrained( + BASE_MODEL_ID, subfolder="vae" +).to(WEIGHT_DTYPE) +tokenizer = T5Tokenizer.from_pretrained(BASE_MODEL_ID, subfolder="tokenizer") +text_encoder = T5EncoderModel.from_pretrained( + BASE_MODEL_ID, subfolder="text_encoder", torch_dtype=WEIGHT_DTYPE +) +scheduler = DDIMScheduler.from_pretrained(BASE_MODEL_ID, subfolder="scheduler") + +pipeline = CogVideoXFunInpaintPipeline( + vae=vae, + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + scheduler=scheduler, +) +convert_weight_dtype_wrapper(transformer, WEIGHT_DTYPE) +pipeline.enable_model_cpu_offload() + +print("VOID pipeline ready.") + + +# ── helpers ──────────────────────────────────────────────────────────────────── +def load_video_tensor(path: str) -> torch.Tensor: + """Return (1, C, T, H, W) float32 in [0, 1] resized to SAMPLE_SIZE.""" + frames = media.read_video(path) + t = torch.from_numpy(np.array(frames))[:MAX_VID_LEN] # (T, H, W, C) + t = t.permute(3, 0, 1, 2).float() / 255.0 # (C, T, H, W) + t = F.interpolate(t, SAMPLE_SIZE, mode="area").unsqueeze(0) + return t + + +def load_quadmask_tensor(path: str) -> torch.Tensor: + """ + Return (1, 1, T, H, W) float32 in [0, 1]. + + Quadmask pixel values: + 0 → primary object (to erase) + 63 → overlap / interaction zone + 127 → affected region (shadows, reflections …) + 255 → background (keep) + + After quantisation the mask is inverted so 255 = "erase", 0 = "keep", + matching the pipeline's internal convention. 
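+
+    Example (after quantisation, before inversion):
+        raw pixel  10 → 0   (primary object)
+        raw pixel  70 → 63  (interaction zone)
+        raw pixel 120 → 127 (affected region)
+        raw pixel 240 → 255 (background)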
+ """ + frames = media.read_video(path)[:MAX_VID_LEN] + if frames.ndim == 4: + frames = frames[..., 0] # take first channel, grayscale + m = torch.from_numpy(np.array(frames)).unsqueeze(0).float() # (1, T, H, W) + m = F.interpolate(m, SAMPLE_SIZE, mode="area").unsqueeze(0) # (1, 1, T, H, W) + + # Quantise to four canonical values + m = torch.where(m <= 31, torch.zeros_like(m), m) + m = torch.where((m > 31) & (m <= 95), torch.full_like(m, 63), m) + m = torch.where((m > 95) & (m <= 191), torch.full_like(m, 127), m) + m = torch.where(m > 191, torch.full_like(m, 255), m) + + m = 255.0 - m # invert + return m / 255.0 + + +def tensor_to_mp4(video: torch.Tensor) -> str: + """Save (1, C, T, H, W) in [0, 1] to a temp mp4 and return the path.""" + frames = video[0].permute(1, 2, 3, 0).cpu().float().numpy() # (T, H, W, C) + frames = (frames * 255).clip(0, 255).astype(np.uint8) + tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) + imageio.mimsave(tmp.name, frames, fps=FPS) + return tmp.name + + +# ── inference ────────────────────────────────────────────────────────────────── +@spaces.GPU(duration=300) +def run_inpaint( + input_video_path: str, + mask_video_path: str, + prompt: str, + num_steps: int, + guidance_scale: float, + seed: int, +) -> str: + if not input_video_path or not mask_video_path: + raise gr.Error("Please upload both an input video and a quadmask video.") + if not prompt.strip(): + raise gr.Error("Please enter a prompt describing the scene after removal.") + + generator = torch.Generator(device="cuda").manual_seed(int(seed)) + + input_video = load_video_tensor(input_video_path) + input_mask = load_quadmask_tensor(mask_video_path) + + input_video = temporal_padding(input_video, min_length=TEMPORAL_WIN, max_length=MAX_VID_LEN) + input_mask = temporal_padding(input_mask, min_length=TEMPORAL_WIN, max_length=MAX_VID_LEN) + + with torch.no_grad(): + result = pipeline( + prompt=prompt, + negative_prompt=NEG_PROMPT, + height=SAMPLE_SIZE[0], + width=SAMPLE_SIZE[1], + num_frames=TEMPORAL_WIN, + video=input_video, + mask_video=input_mask, + generator=generator, + guidance_scale=guidance_scale, + num_inference_steps=num_steps, + strength=1.0, + use_trimask=True, + use_vae_mask=True, + stack_mask=False, + zero_out_mask_region=False, + ).videos + + return tensor_to_mp4(result) + + +# ── Gradio UI ────────────────────────────────────────────────────────────────── +QUADMASK_EXPLAINER = """ +### Quadmask format + +The quadmask is a **grayscale video** where each pixel value encodes what role that region plays: + +| Pixel value | Meaning | +|-------------|---------| +| **0** (black) | Primary object to remove | +| **63** (dark grey) | Overlap / interaction zone | +| **127** (mid grey) | Affected region — shadows, reflections, secondary effects | +| **255** (white) | Background — keep as-is | + +Use the **VLM-Mask-Reasoner** pipeline included in the repo to generate quadmasks automatically. 
+""" + +SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "sample") +EXAMPLES = [ + [ + os.path.join(SAMPLE_DIR, "lime", "input_video.mp4"), + os.path.join(SAMPLE_DIR, "lime", "quadmask_0.mp4"), + "A lime falls on the table.", + 30, 1.0, 42, + ], + [ + os.path.join(SAMPLE_DIR, "moving_ball", "input_video.mp4"), + os.path.join(SAMPLE_DIR, "moving_ball", "quadmask_0.mp4"), + "A ball rolls off the table.", + 30, 1.0, 42, + ], + [ + os.path.join(SAMPLE_DIR, "pillow", "input_video.mp4"), + os.path.join(SAMPLE_DIR, "pillow", "quadmask_0.mp4"), + "Two pillows placed on the table.", + 30, 1.0, 42, + ], +] + +with gr.Blocks(title="VOID – Video Object & Interaction Deletion") as demo: + gr.Markdown( + """ +# VOID – Video Object and Interaction Deletion + +Upload a video and its **quadmask**, enter a prompt describing the scene *after* removal, +and VOID will erase the object along with its physical interactions (shadows, deformations, secondary motion). + +> Built on **CogVideoX-Fun-V1.5-5B** fine-tuned for interaction-aware video inpainting. + """ + ) + + with gr.Row(): + with gr.Column(): + input_video = gr.Video(label="Input video", sources=["upload"]) + mask_video = gr.Video(label="Quadmask video", sources=["upload"]) + prompt = gr.Textbox( + label="Prompt — describe the scene after removal", + placeholder="e.g. A wooden table with nothing on it.", + lines=2, + ) + with gr.Accordion("Advanced settings", open=False): + num_steps = gr.Slider(10, 50, value=30, step=1, label="Inference steps") + guidance_scale = gr.Slider(1.0, 10.0, value=1.0, step=0.5, label="Guidance scale") + seed = gr.Number(value=42, label="Seed", precision=0) + run_btn = gr.Button("Run VOID", variant="primary") + + with gr.Column(): + output_video = gr.Video(label="Inpainted output", interactive=False) + + gr.Markdown(QUADMASK_EXPLAINER) + + gr.Examples( + examples=EXAMPLES, + inputs=[input_video, mask_video, prompt, num_steps, guidance_scale, seed], + outputs=[output_video], + fn=run_inpaint, + cache_examples=True, + label="Sample sequences — click to load and run", + ) -def greet(name): - return "Hello " + name + "!!" 
+ run_btn.click( + fn=run_inpaint, + inputs=[input_video, mask_video, prompt, num_steps, guidance_scale, seed], + outputs=[output_video], + ) -demo = gr.Interface(fn=greet, inputs="text", outputs="text") -demo.launch() +if __name__ == "__main__": + demo.launch() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b31453511f0affdd6765a698572a43719af74188 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,61 @@ +# Core deep learning +torch==2.7.1 +torchvision==0.22.1 +torchdiffeq==0.2.5 +torchsde==0.2.6 + +# Diffusion / generation +diffusers==0.33.1 +accelerate==1.12.0 +transformers==4.57.1 +safetensors==0.6.2 +peft==0.17.1 + +# Training utilities +deepspeed==0.17.6 +came-pytorch==0.1.3 +tensorboard==2.20.0 + +# Vision / video +opencv-python==4.10.0.84 +scikit-image==0.25.2 +imageio==2.37.0 +imageio-ffmpeg==0.6.0 +mediapy==1.2.4 +decord==0.6.0 +kornia==0.8.1 +albumentations==2.0.8 +timm==1.0.19 +tomesd==0.1.3 +Pillow==11.3.0 + +# Data / ML utilities +numpy==1.26.4 +scipy==1.14.0 +scikit-learn==1.7.2 +datasets==4.0.0 +einops==0.8.0 + +# Config / logging +omegaconf==2.3.0 +ml_collections==1.1.0 +absl-py==2.3.1 +loguru==0.7.3 +tqdm==4.67.1 +matplotlib==3.10.6 + +# NLP +sentencepiece==0.2.1 +ftfy==6.1.1 +beautifulsoup4==4.13.5 + +# Misc +func-timeout==4.3.5 +requests==2.32.5 +packaging==25.0 + +# Optional: Gradio UI (only needed for app.py / demo) +# gradio>=3.41.2,<=3.48.0 + +# Note: SAM2 must be installed separately per the instructions at +# https://github.com/facebookresearch/sam2?tab=readme-ov-file#installation diff --git a/sample/lime/first_frame.jpg b/sample/lime/first_frame.jpg new file mode 100644 index 0000000000000000000000000000000000000000..714cafe960ef4cd0dcf48ac08c99b78f585d3e5f --- /dev/null +++ b/sample/lime/first_frame.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8e417430f5cc3ee15bffcd60c51b13461c567c3809dedeaeaca55efb567c06 +size 894122 diff --git a/sample/lime/input_video.mp4 b/sample/lime/input_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..29832285b5a6dce9a4b3702a66ec5b020892d2e8 --- /dev/null +++ b/sample/lime/input_video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0efabfbfc85bf29d11ac0f734eccf5dc824c511333c15953b73d3e357d7d9a87 +size 3892459 diff --git a/sample/lime/prompt.json b/sample/lime/prompt.json new file mode 100644 index 0000000000000000000000000000000000000000..7ce9b869a2db148c7319085cb4159e06389fa5ef --- /dev/null +++ b/sample/lime/prompt.json @@ -0,0 +1,3 @@ +{ + "bg": "A lime falls on the table." 
+} diff --git a/sample/lime/quadmask_0.mp4 b/sample/lime/quadmask_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..65caeedfcc2a797a5f1d4a007afacfbde5ed5812 --- /dev/null +++ b/sample/lime/quadmask_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00a01b7fb47107edcbfd5a036d6d7b1097ea8624df9c2440d184ddfa90a8bdd5 +size 1907329 diff --git a/sample/lime/segmentation_info.json b/sample/lime/segmentation_info.json new file mode 100644 index 0000000000000000000000000000000000000000..72483c3c9da4c6b3128129e9ad3b583d8a345222 --- /dev/null +++ b/sample/lime/segmentation_info.json @@ -0,0 +1,221 @@ +{ + "total_frames": 46, + "frame_width": 3840, + "frame_height": 2160, + "fps": 12.0, + "num_points": 25, + "points_by_frame": { + "0": [ + [ + 2126, + 1099 + ], + [ + 2366, + 1099 + ], + [ + 2683, + 1080 + ], + [ + 2784, + 1176 + ], + [ + 2640, + 1176 + ], + [ + 2539, + 1176 + ], + [ + 2318, + 1176 + ], + [ + 2116, + 1291 + ], + [ + 2496, + 1291 + ], + [ + 2654, + 1286 + ], + [ + 2654, + 1406 + ], + [ + 2342, + 1406 + ], + [ + 2342, + 1776 + ], + [ + 2620, + 1776 + ], + [ + 2539, + 1924 + ], + [ + 2304, + 1972 + ], + [ + 2217, + 1992 + ], + [ + 2385, + 2030 + ], + [ + 2596, + 2025 + ], + [ + 2673, + 1987 + ], + [ + 2217, + 1776 + ], + [ + 2198, + 1660 + ], + [ + 2452, + 1588 + ], + [ + 2294, + 1483 + ], + [ + 2270, + 1358 + ] + ] + }, + "video_path": "limecoke.mp4", + "instruction": "", + "primary_points_by_frame": { + "0": [ + [ + 2126, + 1099 + ], + [ + 2366, + 1099 + ], + [ + 2683, + 1080 + ], + [ + 2784, + 1176 + ], + [ + 2640, + 1176 + ], + [ + 2539, + 1176 + ], + [ + 2318, + 1176 + ], + [ + 2116, + 1291 + ], + [ + 2496, + 1291 + ], + [ + 2654, + 1286 + ], + [ + 2654, + 1406 + ], + [ + 2342, + 1406 + ], + [ + 2342, + 1776 + ], + [ + 2620, + 1776 + ], + [ + 2539, + 1924 + ], + [ + 2304, + 1972 + ], + [ + 2217, + 1992 + ], + [ + 2385, + 2030 + ], + [ + 2596, + 2025 + ], + [ + 2673, + 1987 + ], + [ + 2217, + 1776 + ], + [ + 2198, + 1660 + ], + [ + 2452, + 1588 + ], + [ + 2294, + 1483 + ], + [ + 2270, + 1358 + ] + ] + }, + "primary_frames": [ + 0 + ], + "first_appears_frame": 0 +} \ No newline at end of file diff --git a/sample/moving_ball/first_frame.jpg b/sample/moving_ball/first_frame.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1603f6539d338dc47b2c3a53376e8709cd7a9dd8 --- /dev/null +++ b/sample/moving_ball/first_frame.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32773994491b764d09cf357983abd4cbd89dd9601fa45a5c2ed1a340ab70df90 +size 652643 diff --git a/sample/moving_ball/input_video.mp4 b/sample/moving_ball/input_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..2c915c50b2e2fb17e9f6156aa3001744bd38c01a --- /dev/null +++ b/sample/moving_ball/input_video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e07906cc204ba26c0dd05eed545030cb7e79f2742e983ff0b04d2d9c3c762d29 +size 2014662 diff --git a/sample/moving_ball/prompt.json b/sample/moving_ball/prompt.json new file mode 100644 index 0000000000000000000000000000000000000000..5cc31f548ca25125b9479c6d6655d89a48053a51 --- /dev/null +++ b/sample/moving_ball/prompt.json @@ -0,0 +1,3 @@ +{ + "bg": "A ball rolls off the table." 
+} diff --git a/sample/moving_ball/quadmask_0.mp4 b/sample/moving_ball/quadmask_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..077edbf1cbc902ccb490c88327913f65238ce58f --- /dev/null +++ b/sample/moving_ball/quadmask_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5904642de05a65f210bd49e3c24b7d0657ef57ff40eb9baafd562962c9dd9189 +size 2485881 diff --git a/sample/pillow/input_video.mp4 b/sample/pillow/input_video.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c54b4144070eb33a41ef21496ba35d53bf32f054 --- /dev/null +++ b/sample/pillow/input_video.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ca3e6b666497e053491772e8f0317e22520c63ebaa8896b8378757d016e0f75 +size 2960087 diff --git a/sample/pillow/prompt.json b/sample/pillow/prompt.json new file mode 100644 index 0000000000000000000000000000000000000000..bf292ebce03a1136ce4141b2d9704e2e1624aeb5 --- /dev/null +++ b/sample/pillow/prompt.json @@ -0,0 +1,3 @@ +{ + "bg": "Two pillows placed on the table." +} diff --git a/sample/pillow/quadmask_0.mp4 b/sample/pillow/quadmask_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..424c3d9095502234a87cca94421e7f496307ee01 --- /dev/null +++ b/sample/pillow/quadmask_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eb70257593da06f682a3ddda54a9d260d4fc514f645237f5ca74b08f8da61a6 +size 2 diff --git a/sample/pillow/segmentation_info.json b/sample/pillow/segmentation_info.json new file mode 100644 index 0000000000000000000000000000000000000000..f3c3de8cf51947db3a47a2701922e261c838545f --- /dev/null +++ b/sample/pillow/segmentation_info.json @@ -0,0 +1,85 @@ +{ + "total_frames": 62, + "frame_width": 3840, + "frame_height": 2160, + "fps": 12.0, + "num_points": 8, + "points_by_frame": { + "0": [ + [ + 1507, + 724 + ], + [ + 1363, + 638 + ], + [ + 1190, + 475 + ], + [ + 1276, + 187 + ], + [ + 1545, + 168 + ], + [ + 1660, + 259 + ], + [ + 1684, + 393 + ], + [ + 1579, + 825 + ] + ] + }, + "video_path": "teaser3/weight_on_pillow.mp4", + "instruction": "segment the weight", + "primary_points_by_frame": { + "0": [ + [ + 1507, + 724 + ], + [ + 1363, + 638 + ], + [ + 1190, + 475 + ], + [ + 1276, + 187 + ], + [ + 1545, + 168 + ], + [ + 1660, + 259 + ], + [ + 1684, + 393 + ], + [ + 1579, + 825 + ] + ] + }, + "primary_frames": [ + 0 + ], + "first_appears_frame": 0 +} \ No newline at end of file diff --git a/videox_fun/__init__.py b/videox_fun/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/videox_fun/api/api.py b/videox_fun/api/api.py new file mode 100644 index 0000000000000000000000000000000000000000..e525fda46611b6921b5fd930f582ab359b7ef507 --- /dev/null +++ b/videox_fun/api/api.py @@ -0,0 +1,213 @@ +import base64 +import gc +import hashlib +import io +import os +import tempfile +from io import BytesIO + +import gradio as gr +import requests +import torch +from fastapi import FastAPI +from PIL import Image + + +# Function to encode a file to Base64 +def encode_file_to_base64(file_path): + with open(file_path, "rb") as file: + # Encode the data to Base64 + file_base64 = base64.b64encode(file.read()) + return file_base64 + +def update_edition_api(_: gr.Blocks, app: FastAPI, controller): + @app.post("/videox_fun/update_edition") + def _update_edition_api( + datas: dict, + ): + edition = datas.get('edition', 'v2') + + try: + controller.update_edition( + edition + ) + comment = 
"Success" + except Exception as e: + torch.cuda.empty_cache() + comment = f"Error. error information is {str(e)}" + + return {"message": comment} + +def update_diffusion_transformer_api(_: gr.Blocks, app: FastAPI, controller): + @app.post("/videox_fun/update_diffusion_transformer") + def _update_diffusion_transformer_api( + datas: dict, + ): + diffusion_transformer_path = datas.get('diffusion_transformer_path', 'none') + + try: + controller.update_diffusion_transformer( + diffusion_transformer_path + ) + comment = "Success" + except Exception as e: + torch.cuda.empty_cache() + comment = f"Error. error information is {str(e)}" + + return {"message": comment} + +def download_from_url(url, timeout=10): + try: + response = requests.get(url, timeout=timeout) + response.raise_for_status() # 检查请求是否成功 + return response.content + except requests.exceptions.RequestException as e: + print(f"Error downloading from {url}: {e}") + return None + +def save_base64_video(base64_string): + video_data = base64.b64decode(base64_string) + + md5_hash = hashlib.md5(video_data).hexdigest() + filename = f"{md5_hash}.mp4" + + temp_dir = tempfile.gettempdir() + file_path = os.path.join(temp_dir, filename) + + with open(file_path, 'wb') as video_file: + video_file.write(video_data) + + return file_path + +def save_base64_image(base64_string): + video_data = base64.b64decode(base64_string) + + md5_hash = hashlib.md5(video_data).hexdigest() + filename = f"{md5_hash}.jpg" + + temp_dir = tempfile.gettempdir() + file_path = os.path.join(temp_dir, filename) + + with open(file_path, 'wb') as video_file: + video_file.write(video_data) + + return file_path + +def save_url_video(url): + video_data = download_from_url(url) + if video_data: + return save_base64_video(base64.b64encode(video_data)) + return None + +def save_url_image(url): + image_data = download_from_url(url) + if image_data: + return save_base64_image(base64.b64encode(image_data)) + return None + +def infer_forward_api(_: gr.Blocks, app: FastAPI, controller): + @app.post("/videox_fun/infer_forward") + def _infer_forward_api( + datas: dict, + ): + base_model_path = datas.get('base_model_path', 'none') + lora_model_path = datas.get('lora_model_path', 'none') + lora_alpha_slider = datas.get('lora_alpha_slider', 0.55) + prompt_textbox = datas.get('prompt_textbox', None) + negative_prompt_textbox = datas.get('negative_prompt_textbox', 'The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion. 
') + sampler_dropdown = datas.get('sampler_dropdown', 'Euler') + sample_step_slider = datas.get('sample_step_slider', 30) + resize_method = datas.get('resize_method', "Generate by") + width_slider = datas.get('width_slider', 672) + height_slider = datas.get('height_slider', 384) + base_resolution = datas.get('base_resolution', 512) + is_image = datas.get('is_image', False) + generation_method = datas.get('generation_method', False) + length_slider = datas.get('length_slider', 49) + overlap_video_length = datas.get('overlap_video_length', 4) + partial_video_length = datas.get('partial_video_length', 72) + cfg_scale_slider = datas.get('cfg_scale_slider', 6) + start_image = datas.get('start_image', None) + end_image = datas.get('end_image', None) + validation_video = datas.get('validation_video', None) + validation_video_mask = datas.get('validation_video_mask', None) + control_video = datas.get('control_video', None) + denoise_strength = datas.get('denoise_strength', 0.70) + seed_textbox = datas.get("seed_textbox", 43) + + generation_method = "Image Generation" if is_image else generation_method + + if start_image is not None: + if start_image.startswith('http'): + start_image = save_url_image(start_image) + start_image = [Image.open(start_image)] + else: + start_image = base64.b64decode(start_image) + start_image = [Image.open(BytesIO(start_image))] + + if end_image is not None: + if end_image.startswith('http'): + end_image = save_url_image(end_image) + end_image = [Image.open(end_image)] + else: + end_image = base64.b64decode(end_image) + end_image = [Image.open(BytesIO(end_image))] + + if validation_video is not None: + if validation_video.startswith('http'): + validation_video = save_url_video(validation_video) + else: + validation_video = save_base64_video(validation_video) + + if validation_video_mask is not None: + if validation_video_mask.startswith('http'): + validation_video_mask = save_url_image(validation_video_mask) + else: + validation_video_mask = save_base64_image(validation_video_mask) + + if control_video is not None: + if control_video.startswith('http'): + control_video = save_url_video(control_video) + else: + control_video = save_base64_video(control_video) + + try: + save_sample_path, comment = controller.generate( + "", + base_model_path, + lora_model_path, + lora_alpha_slider, + prompt_textbox, + negative_prompt_textbox, + sampler_dropdown, + sample_step_slider, + resize_method, + width_slider, + height_slider, + base_resolution, + generation_method, + length_slider, + overlap_video_length, + partial_video_length, + cfg_scale_slider, + start_image, + end_image, + validation_video, + validation_video_mask, + control_video, + denoise_strength, + seed_textbox, + is_api = True, + ) + except Exception as e: + gc.collect() + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + save_sample_path = "" + comment = f"Error. 
error information is {str(e)}" + return {"message": comment} + + if save_sample_path != "": + return {"message": comment, "save_sample_path": save_sample_path, "base64_encoding": encode_file_to_base64(save_sample_path)} + else: + return {"message": comment, "save_sample_path": save_sample_path} \ No newline at end of file diff --git a/videox_fun/api/api_multi_nodes.py b/videox_fun/api/api_multi_nodes.py new file mode 100644 index 0000000000000000000000000000000000000000..66e8d5941a0766359d96c78289aa4ee6397b8de1 --- /dev/null +++ b/videox_fun/api/api_multi_nodes.py @@ -0,0 +1,215 @@ +# This file is modified from https://github.com/xdit-project/xDiT/blob/main/entrypoints/launch.py +import base64 +import gc +import os +from io import BytesIO + +import gradio as gr +import torch +from fastapi import FastAPI, HTTPException +from PIL import Image + +from .api import (encode_file_to_base64, save_base64_image, save_base64_video, + save_url_image, save_url_video) + +try: + import ray +except: + print("Ray is not installed. If you want to use multi gpus api. Please install it by running 'pip install ray'.") + ray = None + +if ray is not None: + @ray.remote(num_gpus=1) + class MultiNodesGenerator: + def __init__( + self, rank: int, world_size: int, Controller, + GPU_memory_mode, scheduler_dict, model_name=None, model_type="Inpaint", + config_path=None, ulysses_degree=1, ring_degree=1, + enable_teacache=None, teacache_threshold=None, + num_skip_start_steps=None, teacache_offload=None, weight_dtype=None, + savedir_sample=None, + ): + # Set PyTorch distributed environment variables + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = "29500" + + self.rank = rank + self.controller = Controller( + GPU_memory_mode, scheduler_dict, model_name=model_name, model_type=model_type, config_path=config_path, + ulysses_degree=ulysses_degree, ring_degree=ring_degree, enable_teacache=enable_teacache, teacache_threshold=teacache_threshold, num_skip_start_steps=num_skip_start_steps, + teacache_offload=teacache_offload, weight_dtype=weight_dtype, savedir_sample=savedir_sample, + ) + + def generate(self, datas): + try: + base_model_path = datas.get('base_model_path', 'none') + lora_model_path = datas.get('lora_model_path', 'none') + lora_alpha_slider = datas.get('lora_alpha_slider', 0.55) + prompt_textbox = datas.get('prompt_textbox', None) + negative_prompt_textbox = datas.get('negative_prompt_textbox', 'The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion. 
') + sampler_dropdown = datas.get('sampler_dropdown', 'Euler') + sample_step_slider = datas.get('sample_step_slider', 30) + resize_method = datas.get('resize_method', "Generate by") + width_slider = datas.get('width_slider', 672) + height_slider = datas.get('height_slider', 384) + base_resolution = datas.get('base_resolution', 512) + is_image = datas.get('is_image', False) + generation_method = datas.get('generation_method', False) + length_slider = datas.get('length_slider', 49) + overlap_video_length = datas.get('overlap_video_length', 4) + partial_video_length = datas.get('partial_video_length', 72) + cfg_scale_slider = datas.get('cfg_scale_slider', 6) + start_image = datas.get('start_image', None) + end_image = datas.get('end_image', None) + validation_video = datas.get('validation_video', None) + validation_video_mask = datas.get('validation_video_mask', None) + control_video = datas.get('control_video', None) + denoise_strength = datas.get('denoise_strength', 0.70) + seed_textbox = datas.get("seed_textbox", 43) + + generation_method = "Image Generation" if is_image else generation_method + + if start_image is not None: + if start_image.startswith('http'): + start_image = save_url_image(start_image) + start_image = [Image.open(start_image)] + else: + start_image = base64.b64decode(start_image) + start_image = [Image.open(BytesIO(start_image))] + + if end_image is not None: + if end_image.startswith('http'): + end_image = save_url_image(end_image) + end_image = [Image.open(end_image)] + else: + end_image = base64.b64decode(end_image) + end_image = [Image.open(BytesIO(end_image))] + + if validation_video is not None: + if validation_video.startswith('http'): + validation_video = save_url_video(validation_video) + else: + validation_video = save_base64_video(validation_video) + + if validation_video_mask is not None: + if validation_video_mask.startswith('http'): + validation_video_mask = save_url_image(validation_video_mask) + else: + validation_video_mask = save_base64_image(validation_video_mask) + + if control_video is not None: + if control_video.startswith('http'): + control_video = save_url_video(control_video) + else: + control_video = save_base64_video(control_video) + + try: + save_sample_path, comment = self.controller.generate( + "", + base_model_path, + lora_model_path, + lora_alpha_slider, + prompt_textbox, + negative_prompt_textbox, + sampler_dropdown, + sample_step_slider, + resize_method, + width_slider, + height_slider, + base_resolution, + generation_method, + length_slider, + overlap_video_length, + partial_video_length, + cfg_scale_slider, + start_image, + end_image, + validation_video, + validation_video_mask, + control_video, + denoise_strength, + seed_textbox, + is_api = True, + ) + except Exception as e: + gc.collect() + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + save_sample_path = "" + comment = f"Error. 
error information is {str(e)}" + return {"message": comment} + + import torch.distributed as dist + if dist.get_rank() == 0: + if save_sample_path != "": + return {"message": comment, "save_sample_path": save_sample_path, "base64_encoding": encode_file_to_base64(save_sample_path)} + else: + return {"message": comment, "save_sample_path": save_sample_path} + return None + + except Exception as e: + self.logger.error(f"Error generating image: {str(e)}") + raise HTTPException(status_code=500, detail=str(e)) + + class MultiNodesEngine: + def __init__( + self, + world_size, + Controller, + GPU_memory_mode, + scheduler_dict, + model_name, + model_type, + config_path, + ulysses_degree, + ring_degree, + enable_teacache, + teacache_threshold, + num_skip_start_steps, + teacache_offload, + weight_dtype, + savedir_sample + ): + # Ensure Ray is initialized + if not ray.is_initialized(): + ray.init() + + num_workers = world_size + self.workers = [ + MultiNodesGenerator.remote( + rank, world_size, Controller, + GPU_memory_mode, scheduler_dict, model_name=model_name, model_type=model_type, config_path=config_path, + ulysses_degree=ulysses_degree, ring_degree=ring_degree, enable_teacache=enable_teacache, teacache_threshold=teacache_threshold, num_skip_start_steps=num_skip_start_steps, + teacache_offload=teacache_offload, weight_dtype=weight_dtype, savedir_sample=savedir_sample, + ) + for rank in range(num_workers) + ] + print("Update workers done") + + async def generate(self, data): + results = ray.get([ + worker.generate.remote(data) + for worker in self.workers + ]) + + return next(path for path in results if path is not None) + + def multi_nodes_infer_forward_api(_: gr.Blocks, app: FastAPI, engine): + + @app.post("/videox_fun/infer_forward") + async def _multi_nodes_infer_forward_api( + datas: dict, + ): + try: + result = await engine.generate(datas) + return result + except Exception as e: + if isinstance(e, HTTPException): + raise e + raise HTTPException(status_code=500, detail=str(e)) +else: + MultiNodesEngine = None + MultiNodesGenerator = None + multi_nodes_infer_forward_api = None \ No newline at end of file diff --git a/videox_fun/data/bucket_sampler.py b/videox_fun/data/bucket_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..dfecf7f3b80c402716ac12f0f48b05383e1e84cf --- /dev/null +++ b/videox_fun/data/bucket_sampler.py @@ -0,0 +1,390 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
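+# Aspect-ratio bucketing utilities: the samplers below group images / videos whose
+# aspect ratio (height / width) falls into the same predefined bucket, so every batch
+# shares one target resolution. For example, a 540x960 frame (ratio 0.5625) maps to
+# the '0.57' bucket of ASPECT_RATIO_512 and is batched at 384x672.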
+import os +import glob +from typing import (Generic, Iterable, Iterator, List, Optional, Sequence, + Sized, TypeVar, Union) + +import cv2 +import numpy as np +import torch +from PIL import Image +from torch.utils.data import BatchSampler, Dataset, Sampler + +ASPECT_RATIO_512 = { + '0.25': [256.0, 1024.0], '0.26': [256.0, 992.0], '0.27': [256.0, 960.0], '0.28': [256.0, 928.0], + '0.32': [288.0, 896.0], '0.33': [288.0, 864.0], '0.35': [288.0, 832.0], '0.4': [320.0, 800.0], + '0.42': [320.0, 768.0], '0.48': [352.0, 736.0], '0.5': [352.0, 704.0], '0.52': [352.0, 672.0], + '0.57': [384.0, 672.0], '0.6': [384.0, 640.0], '0.68': [416.0, 608.0], '0.72': [416.0, 576.0], + '0.78': [448.0, 576.0], '0.82': [448.0, 544.0], '0.88': [480.0, 544.0], '0.94': [480.0, 512.0], + '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], '1.13': [544.0, 480.0], '1.21': [544.0, 448.0], + '1.29': [576.0, 448.0], '1.38': [576.0, 416.0], '1.46': [608.0, 416.0], '1.67': [640.0, 384.0], + '1.75': [672.0, 384.0], '2.0': [704.0, 352.0], '2.09': [736.0, 352.0], '2.4': [768.0, 320.0], + '2.5': [800.0, 320.0], '2.89': [832.0, 288.0], '3.0': [864.0, 288.0], '3.11': [896.0, 288.0], + '3.62': [928.0, 256.0], '3.75': [960.0, 256.0], '3.88': [992.0, 256.0], '4.0': [1024.0, 256.0] +} +ASPECT_RATIO_RANDOM_CROP_512 = { + '0.42': [320.0, 768.0], '0.5': [352.0, 704.0], + '0.57': [384.0, 672.0], '0.68': [416.0, 608.0], '0.78': [448.0, 576.0], '0.88': [480.0, 544.0], + '0.94': [480.0, 512.0], '1.0': [512.0, 512.0], '1.07': [512.0, 480.0], + '1.13': [544.0, 480.0], '1.29': [576.0, 448.0], '1.46': [608.0, 416.0], '1.75': [672.0, 384.0], + '2.0': [704.0, 352.0], '2.4': [768.0, 320.0] +} +ASPECT_RATIO_RANDOM_CROP_PROB = [ + 1, 2, + 4, 4, 4, 4, + 8, 8, 8, + 4, 4, 4, 4, + 2, 1 +] +ASPECT_RATIO_RANDOM_CROP_PROB = np.array(ASPECT_RATIO_RANDOM_CROP_PROB) / sum(ASPECT_RATIO_RANDOM_CROP_PROB) + +def get_closest_ratio(height: float, width: float, ratios: dict = ASPECT_RATIO_512): + aspect_ratio = height / width + closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - aspect_ratio)) + return ratios[closest_ratio], float(closest_ratio) + +def get_image_size_without_loading(path): + with Image.open(path) as img: + return img.size # (width, height) + +class RandomSampler(Sampler[int]): + r"""Samples elements randomly. If without replacement, then sample from a shuffled dataset. + + If with replacement, then user can specify :attr:`num_samples` to draw. + + Args: + data_source (Dataset): dataset to sample from + replacement (bool): samples are drawn on-demand with replacement if ``True``, default=``False`` + num_samples (int): number of samples to draw, default=`len(dataset)`. + generator (Generator): Generator used in sampling. 
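+
+    Note: unlike torch.utils.data.RandomSampler, this variant keeps a _pos_start
+    cursor so that sampling without replacement can resume part-way through a
+    shuffled pass, e.g. after a training restart.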
+ """ + + data_source: Sized + replacement: bool + + def __init__(self, data_source: Sized, replacement: bool = False, + num_samples: Optional[int] = None, generator=None) -> None: + self.data_source = data_source + self.replacement = replacement + self._num_samples = num_samples + self.generator = generator + self._pos_start = 0 + + if not isinstance(self.replacement, bool): + raise TypeError(f"replacement should be a boolean value, but got replacement={self.replacement}") + + if not isinstance(self.num_samples, int) or self.num_samples <= 0: + raise ValueError(f"num_samples should be a positive integer value, but got num_samples={self.num_samples}") + + @property + def num_samples(self) -> int: + # dataset size might change at runtime + if self._num_samples is None: + return len(self.data_source) + return self._num_samples + + def __iter__(self) -> Iterator[int]: + n = len(self.data_source) + if self.generator is None: + seed = int(torch.empty((), dtype=torch.int64).random_().item()) + generator = torch.Generator() + generator.manual_seed(seed) + else: + generator = self.generator + + if self.replacement: + for _ in range(self.num_samples // 32): + yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=generator).tolist() + yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, generator=generator).tolist() + else: + for _ in range(self.num_samples // n): + xx = torch.randperm(n, generator=generator).tolist() + if self._pos_start >= n: + self._pos_start = 0 + print("xx top 10", xx[:10], self._pos_start) + for idx in range(self._pos_start, n): + yield xx[idx] + self._pos_start = (self._pos_start + 1) % n + self._pos_start = 0 + yield from torch.randperm(n, generator=generator).tolist()[:self.num_samples % n] + + def __len__(self) -> int: + return self.num_samples + +class AspectRatioBatchImageSampler(BatchSampler): + """A sampler wrapper for grouping images with similar aspect ratio into a same batch. + + Args: + sampler (Sampler): Base sampler. + dataset (Dataset): Dataset providing data information. + batch_size (int): Size of mini-batch. + drop_last (bool): If ``True``, the sampler will drop the last batch if + its size would be less than ``batch_size``. + aspect_ratios (dict): The predefined aspect ratios. 
+ """ + def __init__( + self, + sampler: Sampler, + dataset: Dataset, + batch_size: int, + train_folder: str = None, + aspect_ratios: dict = ASPECT_RATIO_512, + drop_last: bool = False, + config=None, + **kwargs + ) -> None: + if not isinstance(sampler, Sampler): + raise TypeError('sampler should be an instance of ``Sampler``, ' + f'but got {sampler}') + if not isinstance(batch_size, int) or batch_size <= 0: + raise ValueError('batch_size should be a positive integer value, ' + f'but got batch_size={batch_size}') + self.sampler = sampler + self.dataset = dataset + self.train_folder = train_folder + self.batch_size = batch_size + self.aspect_ratios = aspect_ratios + self.drop_last = drop_last + self.config = config + # buckets for each aspect ratio + self._aspect_ratio_buckets = {ratio: [] for ratio in aspect_ratios} + # [str(k) for k, v in aspect_ratios] + self.current_available_bucket_keys = list(aspect_ratios.keys()) + + def __iter__(self): + for idx in self.sampler: + try: + image_dict = self.dataset[idx] + + width, height = image_dict.get("width", None), image_dict.get("height", None) + if width is None or height is None: + image_id, name = image_dict['file_path'], image_dict['text'] + if self.train_folder is None: + image_dir = image_id + else: + image_dir = os.path.join(self.train_folder, image_id) + + width, height = get_image_size_without_loading(image_dir) + + ratio = height / width # self.dataset[idx] + else: + height = int(height) + width = int(width) + ratio = height / width # self.dataset[idx] + except Exception as e: + print(e) + continue + # find the closest aspect ratio + closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio)) + if closest_ratio not in self.current_available_bucket_keys: + continue + bucket = self._aspect_ratio_buckets[closest_ratio] + bucket.append(idx) + # yield a batch of indices in the same aspect ratio group + if len(bucket) == self.batch_size: + yield bucket[:] + del bucket[:] + +class AspectRatioBatchSampler(BatchSampler): + """A sampler wrapper for grouping images with similar aspect ratio into a same batch. + + Args: + sampler (Sampler): Base sampler. + dataset (Dataset): Dataset providing data information. + batch_size (int): Size of mini-batch. + drop_last (bool): If ``True``, the sampler will drop the last batch if + its size would be less than ``batch_size``. + aspect_ratios (dict): The predefined aspect ratios. 
+ """ + def __init__( + self, + sampler: Sampler, + dataset: Dataset, + batch_size: int, + video_folder: str = None, + train_data_format: str = "webvid", + aspect_ratios: dict = ASPECT_RATIO_512, + drop_last: bool = False, + config=None, + **kwargs + ) -> None: + if not isinstance(sampler, Sampler): + raise TypeError('sampler should be an instance of ``Sampler``, ' + f'but got {sampler}') + if not isinstance(batch_size, int) or batch_size <= 0: + raise ValueError('batch_size should be a positive integer value, ' + f'but got batch_size={batch_size}') + self.sampler = sampler + self.dataset = dataset + self.video_folder = video_folder + self.train_data_format = train_data_format + self.batch_size = batch_size + self.aspect_ratios = aspect_ratios + self.drop_last = drop_last + self.config = config + # buckets for each aspect ratio + self._aspect_ratio_buckets = {ratio: [] for ratio in aspect_ratios} + # [str(k) for k, v in aspect_ratios] + self.current_available_bucket_keys = list(aspect_ratios.keys()) + + def __iter__(self): + for idx in self.sampler: + try: + video_dict = self.dataset[idx] + width, more = video_dict.get("width", None), video_dict.get("height", None) + + if width is None or height is None: + if self.train_data_format == "normal": + video_id, name = video_dict['file_path'], video_dict['text'] + if self.video_folder is None: + video_dir = video_id + else: + video_dir = os.path.join(self.video_folder, video_id) + else: + videoid, name, page_dir = video_dict['videoid'], video_dict['name'], video_dict['page_dir'] + video_dir = os.path.join(self.video_folder, f"{videoid}.mp4") + cap = cv2.VideoCapture(video_dir) + + # 获取视频尺寸 + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) # 浮点数转换为整数 + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) # 浮点数转换为整数 + + ratio = height / width # self.dataset[idx] + else: + height = int(height) + width = int(width) + ratio = height / width # self.dataset[idx] + except Exception as e: + print(e, self.dataset[idx], "This item is error, please check it.") + continue + # find the closest aspect ratio + closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio)) + if closest_ratio not in self.current_available_bucket_keys: + continue + bucket = self._aspect_ratio_buckets[closest_ratio] + bucket.append(idx) + # yield a batch of indices in the same aspect ratio group + if len(bucket) == self.batch_size: + yield bucket[:] + del bucket[:] + +class AspectRatioBatchImageVideoSampler(BatchSampler): + """A sampler wrapper for grouping images with similar aspect ratio into a same batch. + + Args: + sampler (Sampler): Base sampler. + dataset (Dataset): Dataset providing data information. + batch_size (int): Size of mini-batch. + drop_last (bool): If ``True``, the sampler will drop the last batch if + its size would be less than ``batch_size``. + aspect_ratios (dict): The predefined aspect ratios. 
+ """ + + def __init__(self, + sampler: Sampler, + dataset: Dataset, + batch_size: int, + train_folder: str = None, + aspect_ratios: dict = ASPECT_RATIO_512, + drop_last: bool = False + ) -> None: + if not isinstance(sampler, Sampler): + raise TypeError('sampler should be an instance of ``Sampler``, ' + f'but got {sampler}') + if not isinstance(batch_size, int) or batch_size <= 0: + raise ValueError('batch_size should be a positive integer value, ' + f'but got batch_size={batch_size}') + self.sampler = sampler + self.dataset = dataset + self.train_folder = train_folder + self.batch_size = batch_size + self.aspect_ratios = aspect_ratios + self.drop_last = drop_last + + # buckets for each aspect ratio + self.current_available_bucket_keys = list(aspect_ratios.keys()) + self.bucket = { + 'image':{ratio: [] for ratio in aspect_ratios}, + 'video':{ratio: [] for ratio in aspect_ratios} + } + + def __iter__(self): + for idx in self.sampler: + content_type = self.dataset[idx].get('type', 'image') + if content_type == 'image': + try: + image_dict = self.dataset[idx] + + width, height = image_dict.get("width", None), image_dict.get("height", None) + if width is None or height is None: + image_id, name = image_dict['file_path'], image_dict['text'] + if self.train_folder is None: + image_dir = image_id + else: + image_dir = os.path.join(self.train_folder, image_id) + + width, height = get_image_size_without_loading(image_dir) + + ratio = height / width # self.dataset[idx] + else: + height = int(height) + width = int(width) + ratio = height / width # self.dataset[idx] + except Exception as e: + print(e, self.dataset[idx], "This item is error, please check it.") + continue + # find the closest aspect ratio + closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio)) + if closest_ratio not in self.current_available_bucket_keys: + continue + bucket = self.bucket['image'][closest_ratio] + bucket.append(idx) + # yield a batch of indices in the same aspect ratio group + if len(bucket) == self.batch_size: + yield bucket[:] + del bucket[:] + else: + try: + video_dict = self.dataset[idx] + width, height = video_dict.get("width", None), video_dict.get("height", None) + + if width is None or height is None: + if video_dict['type'] == 'video_mask_tuple': + video_dir = video_dict['file_path'] + if os.path.isdir(os.path.join(video_dir, 'input')): + sample_path = list(glob.glob(os.path.join(video_dir, 'input', '*.png')))[0] + width, height = get_image_size_without_loading(sample_path) + else: + sample_path = os.path.join(video_dir, 'rgb_full.mp4') + cap = cv2.VideoCapture(sample_path) + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + else: + video_id, name = video_dict['file_path'], video_dict['text'] + if self.train_folder is None: + video_dir = video_id + else: + video_dir = os.path.join(self.train_folder, video_id) + cap = cv2.VideoCapture(video_dir) + + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + + ratio = height / width # self.dataset[idx] + else: + height = int(height) + width = int(width) + ratio = height / width # self.dataset[idx] + except Exception as e: + print(e, self.dataset[idx], "This item is error, please check it.") + continue + # find the closest aspect ratio + closest_ratio = min(self.aspect_ratios.keys(), key=lambda r: abs(float(r) - ratio)) + if closest_ratio not in self.current_available_bucket_keys: + continue + bucket = self.bucket['video'][closest_ratio] + 
bucket.append(idx) + # yield a batch of indices in the same aspect ratio group + if len(bucket) == self.batch_size: + yield bucket[:] + del bucket[:] \ No newline at end of file diff --git a/videox_fun/data/dataset_image.py b/videox_fun/data/dataset_image.py new file mode 100644 index 0000000000000000000000000000000000000000..098d49a4044f8daa351cd01b4cb1ec5415412e80 --- /dev/null +++ b/videox_fun/data/dataset_image.py @@ -0,0 +1,76 @@ +import json +import os +import random + +import numpy as np +import torch +import torchvision.transforms as transforms +from PIL import Image +from torch.utils.data.dataset import Dataset + + +class CC15M(Dataset): + def __init__( + self, + json_path, + video_folder=None, + resolution=512, + enable_bucket=False, + ): + print(f"loading annotations from {json_path} ...") + self.dataset = json.load(open(json_path, 'r')) + self.length = len(self.dataset) + print(f"data scale: {self.length}") + + self.enable_bucket = enable_bucket + self.video_folder = video_folder + + resolution = tuple(resolution) if not isinstance(resolution, int) else (resolution, resolution) + self.pixel_transforms = transforms.Compose([ + transforms.Resize(resolution[0]), + transforms.CenterCrop(resolution), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True), + ]) + + def get_batch(self, idx): + video_dict = self.dataset[idx] + video_id, name = video_dict['file_path'], video_dict['text'] + + if self.video_folder is None: + video_dir = video_id + else: + video_dir = os.path.join(self.video_folder, video_id) + + pixel_values = Image.open(video_dir).convert("RGB") + return pixel_values, name + + def __len__(self): + return self.length + + def __getitem__(self, idx): + while True: + try: + pixel_values, name = self.get_batch(idx) + break + except Exception as e: + print(e) + idx = random.randint(0, self.length-1) + + if not self.enable_bucket: + pixel_values = self.pixel_transforms(pixel_values) + else: + pixel_values = np.array(pixel_values) + + sample = dict(pixel_values=pixel_values, text=name) + return sample + +if __name__ == "__main__": + dataset = CC15M( + csv_path="/mnt_wg/zhoumo.xjq/CCUtils/cc15m_add_index.json", + resolution=512, + ) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=0,) + for idx, batch in enumerate(dataloader): + print(batch["pixel_values"].shape, len(batch["text"])) \ No newline at end of file diff --git a/videox_fun/data/dataset_image_video.py b/videox_fun/data/dataset_image_video.py new file mode 100644 index 0000000000000000000000000000000000000000..ca3f8cc601db1865643e678793e1d3f2469c8bcd --- /dev/null +++ b/videox_fun/data/dataset_image_video.py @@ -0,0 +1,1067 @@ +import csv +import io +import json +import math +import os +import glob +import random +from threading import Thread +import mediapy as media +import time + +import albumentations +import cv2 +import gc +import numpy as np +import torch +import torchvision.transforms as transforms +from scipy.special import binom + +from func_timeout import func_timeout, FunctionTimedOut +from decord import VideoReader +from PIL import Image +from torch.utils.data import BatchSampler, Sampler +from torch.utils.data.dataset import Dataset +from contextlib import contextmanager + +VIDEO_READER_TIMEOUT = 20 + +bernstein = lambda n, k, t: binom(n,k)* t**k * (1.-t)**(n-k) + +# codes from https://stackoverflow.com/questions/50731785/create-random-shape-contour-using-matplotlib +def bezier(points, num=200): + N = len(points) + t = 
np.linspace(0, 1, num=num) + curve = np.zeros((num, 2)) + for i in range(N): + curve += np.outer(bernstein(N - 1, i, t), points[i]) + return curve + +class Segment(): + def __init__(self, p1, p2, angle1, angle2, **kw): + self.p1 = p1 + self.p2 = p2 + self.angle1 = angle1 + self.angle2 = angle2 + self.numpoints = kw.get("numpoints", 100) + r = kw.get("r", 0.3) + d = np.sqrt(np.sum((self.p2-self.p1)**2)) + self.r = r*d + self.p = np.zeros((4,2)) + self.p[0,:] = self.p1[:] + self.p[3,:] = self.p2[:] + self.calc_intermediate_points(self.r) + + def calc_intermediate_points(self,r): + self.p[1,:] = self.p1 + np.array( + [self.r*np.cos(self.angle1), self.r*np.sin(self.angle1)]) + self.p[2,:] = self.p2 + np.array( + [self.r*np.cos(self.angle2+np.pi), self.r*np.sin(self.angle2+np.pi)]) + self.curve = bezier(self.p,self.numpoints) + + +def get_curve(points, **kw): + segments = [] + for i in range(len(points)-1): + seg = Segment(points[i,:2], points[i+1,:2], points[i,2],points[i+1,2],**kw) + segments.append(seg) + curve = np.concatenate([s.curve for s in segments]) + return segments, curve + + +def ccw_sort(p): + d = p-np.mean(p,axis=0) + s = np.arctan2(d[:,0], d[:,1]) + return p[np.argsort(s),:] + + +def get_bezier_curve(a, rad=0.2, edgy=0): + """ given an array of points *a*, create a curve through + those points. + *rad* is a number between 0 and 1 to steer the distance of + control points. + *edgy* is a parameter which controls how "edgy" the curve is, + edgy=0 is smoothest.""" + p = np.arctan(edgy)/np.pi+.5 + a = ccw_sort(a) + a = np.append(a, np.atleast_2d(a[0,:]), axis=0) + d = np.diff(a, axis=0) + ang = np.arctan2(d[:,1],d[:,0]) + f = lambda ang : (ang>=0)*ang + (ang<0)*(ang+2*np.pi) + ang = f(ang) + ang1 = ang + ang2 = np.roll(ang,1) + ang = p*ang1 + (1-p)*ang2 + (np.abs(ang2-ang1) > np.pi )*np.pi + ang = np.append(ang, [ang[0]]) + a = np.append(a, np.atleast_2d(ang).T, axis=1) + s, c = get_curve(a, r=rad, method="var") + x,y = c.T + return x,y, a + + +def get_random_points(n=5, scale=0.8, mindst=None, rec=0): + """ create n random points in the unit square, which are *mindst* + apart, then scale them.""" + mindst = mindst or .7/n + a = np.random.rand(n,2) + d = np.sqrt(np.sum(np.diff(ccw_sort(a), axis=0), axis=1)**2) + if np.all(d >= mindst) or rec>=200: + return a*scale + else: + return get_random_points(n=n, scale=scale, mindst=mindst, rec=rec+1) + + +def fill_mask(shape, x, y, fill_val=255): + _, _, h, w = shape + mask = np.zeros((h, w), dtype=np.uint8) + mask = cv2.fillPoly(mask, [np.array([x, y], np.int32).T], fill_val) + return mask + + +def random_shift(x, y, scale_range = [0.2, 0.7], trans_perturb_range=[-0.2, 0.2]): + w_scale = np.random.uniform(scale_range[0], scale_range[1]) + h_scale = np.random.uniform(scale_range[0], scale_range[1]) + x_trans = np.random.uniform(0., 1. - w_scale) + y_trans = np.random.uniform(0., 1. 
- h_scale) + x_shifted = x * w_scale + x_trans + np.random.uniform(trans_perturb_range[0], trans_perturb_range[1]) + y_shifted = y * h_scale + y_trans + np.random.uniform(trans_perturb_range[0], trans_perturb_range[1]) + return x_shifted, y_shifted + + +def get_random_shape_mask( + shape, n_pts_range=[3, 10], rad_range=[0.0, 1.0], edgy_range=[0.0, 0.1], n_keyframes_range=[2, 25], + random_drop_range=[0.0, 0.2], + ): + f, _, h, w = shape + + n_pts = np.random.randint(n_pts_range[0], n_pts_range[1]) + n_keyframes = np.random.randint(n_keyframes_range[0], n_keyframes_range[1]) + keyframe_interval = f // (n_keyframes - 1) + keyframe_indices = list(range(0, f, keyframe_interval)) + if len(keyframe_indices) == n_keyframes: + keyframe_indices[-1] = f - 1 + else: + keyframe_indices.append(f - 1) + x_all_frames, y_all_frames = [], [] + for i, keyframe_index in enumerate(keyframe_indices): + rad = np.random.uniform(rad_range[0], rad_range[1]) + edgy = np.random.uniform(edgy_range[0], edgy_range[1]) + x_kf, y_kf, _ = get_bezier_curve(get_random_points(n=n_pts), rad=rad, edgy=edgy) + x_kf, y_kf = random_shift(x_kf, y_kf) + if i == 0: + x_all_frames.append(x_kf[None]) + y_all_frames.append(y_kf[None]) + else: + x_interval = np.linspace(x_all_frames[-1][-1], x_kf, keyframe_index - keyframe_indices[i - 1] + 1) + y_interval = np.linspace(y_all_frames[-1][-1], y_kf, keyframe_index - keyframe_indices[i - 1] + 1) + x_all_frames.append(x_interval[1:]) + y_all_frames.append(y_interval[1:]) + x_all_frames = np.concatenate(x_all_frames, axis=0) + y_all_frames = np.concatenate(y_all_frames, axis=0) + + masks = [] + for x, y in zip(x_all_frames, y_all_frames): + x = np.round(x * w).astype(np.int32) + y = np.round(y * h).astype(np.int32) + mask = fill_mask(shape, x, y) + masks.append(mask) + masks = np.stack(masks, axis=0).astype(float) / 255. 
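+    # Randomly zero out a contiguous run of frames so the synthetic mask can
+    # disappear for part of the clip.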
+ + n_frames_random_drop = int(np.random.uniform(random_drop_range[0], random_drop_range[1]) * f) + drop_index = np.random.randint(0, f - n_frames_random_drop) + masks[drop_index:drop_index + n_frames_random_drop] = 0 + + return masks # (f, h, w), [0, 1] + + +def get_random_mask(shape, mask_type_probs=[0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.8]): + f, c, h, w = shape + + if f != 1: + mask_index = np.random.choice([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], p=mask_type_probs) + else: + mask_index = np.random.choice([0, 1], p = [0.2, 0.8]) + mask = torch.zeros((f, 1, h, w), dtype=torch.uint8) + + if mask_index == 0: + center_x = torch.randint(0, w, (1,)).item() + center_y = torch.randint(0, h, (1,)).item() + block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item() + block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item() + + start_x = max(center_x - block_size_x // 2, 0) + end_x = min(center_x + block_size_x // 2, w) + start_y = max(center_y - block_size_y // 2, 0) + end_y = min(center_y + block_size_y // 2, h) + mask[:, :, start_y:end_y, start_x:end_x] = 1 + elif mask_index == 1: + mask[:, :, :, :] = 1 + elif mask_index == 2: + mask_frame_index = np.random.randint(1, 5) + mask[mask_frame_index:, :, :, :] = 1 + elif mask_index == 3: + mask_frame_index = np.random.randint(1, 5) + mask[mask_frame_index:-mask_frame_index, :, :, :] = 1 + elif mask_index == 4: + center_x = torch.randint(0, w, (1,)).item() + center_y = torch.randint(0, h, (1,)).item() + block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item() + block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item() + + start_x = max(center_x - block_size_x // 2, 0) + end_x = min(center_x + block_size_x // 2, w) + start_y = max(center_y - block_size_y // 2, 0) + end_y = min(center_y + block_size_y // 2, h) + + mask_frame_before = np.random.randint(0, f // 2) + mask_frame_after = np.random.randint(f // 2, f) + mask[mask_frame_before:mask_frame_after, :, start_y:end_y, start_x:end_x] = 1 + elif mask_index == 5: + mask = torch.randint(0, 2, (f, 1, h, w), dtype=torch.uint8) + elif mask_index == 6: + num_frames_to_mask = random.randint(1, max(f // 2, 1)) + frames_to_mask = random.sample(range(f), num_frames_to_mask) + + for i in frames_to_mask: + block_height = random.randint(1, h // 4) + block_width = random.randint(1, w // 4) + top_left_y = random.randint(0, h - block_height) + top_left_x = random.randint(0, w - block_width) + mask[i, 0, top_left_y:top_left_y + block_height, top_left_x:top_left_x + block_width] = 1 + elif mask_index == 7: + center_x = torch.randint(0, w, (1,)).item() + center_y = torch.randint(0, h, (1,)).item() + a = torch.randint(min(w, h) // 8, min(w, h) // 4, (1,)).item() + b = torch.randint(min(h, w) // 8, min(h, w) // 4, (1,)).item() + + for i in range(h): + for j in range(w): + if ((i - center_y) ** 2) / (b ** 2) + ((j - center_x) ** 2) / (a ** 2) < 1: + mask[:, :, i, j] = 1 + elif mask_index == 8: + center_x = torch.randint(0, w, (1,)).item() + center_y = torch.randint(0, h, (1,)).item() + radius = torch.randint(min(h, w) // 8, min(h, w) // 4, (1,)).item() + for i in range(h): + for j in range(w): + if (i - center_y) ** 2 + (j - center_x) ** 2 < radius ** 2: + mask[:, :, i, j] = 1 + elif mask_index == 9: + for idx in range(f): + if np.random.rand() > 0.5: + mask[idx, :, :, :] = 1 + else: + num_objs = np.random.randint(1, 4) + mask_npy = get_random_shape_mask(shape) + for i in range(num_objs - 1): + mask_npy += get_random_shape_mask(shape).clip(0, 1) + + mask = 
torch.from_numpy(mask_npy).unsqueeze(1) + + return mask.float() + + +def get_random_mask_multi(shape, mask_type_probs, range_num_masks=[1, 7]): + num_masks = np.random.randint(range_num_masks[0], range_num_masks[1]) + masks = None + for _ in range(num_masks): + mask = get_random_mask(shape, mask_type_probs) + if masks is None: + masks = mask + else: + masks = (masks + mask).clip(0, 1) + return masks + + +class ImageVideoSampler(BatchSampler): + """A sampler wrapper for grouping images with similar aspect ratio into a same batch. + + Args: + sampler (Sampler): Base sampler. + dataset (Dataset): Dataset providing data information. + batch_size (int): Size of mini-batch. + drop_last (bool): If ``True``, the sampler will drop the last batch if + its size would be less than ``batch_size``. + aspect_ratios (dict): The predefined aspect ratios. + """ + + def __init__(self, + sampler: Sampler, + dataset: Dataset, + batch_size: int, + drop_last: bool = False + ) -> None: + if not isinstance(sampler, Sampler): + raise TypeError('sampler should be an instance of ``Sampler``, ' + f'but got {sampler}') + if not isinstance(batch_size, int) or batch_size <= 0: + raise ValueError('batch_size should be a positive integer value, ' + f'but got batch_size={batch_size}') + self.sampler = sampler + self.dataset = dataset + self.batch_size = batch_size + self.drop_last = drop_last + + # buckets for each aspect ratio + self.bucket = {'image':[], 'video':[], 'video_mask_tuple':[]} + + def __iter__(self): + for idx in self.sampler: + content_type = self.dataset.dataset[idx].get('type', 'image') + self.bucket[content_type].append(idx) + + # yield a batch of indices in the same aspect ratio group + if len(self.bucket['video']) == self.batch_size: + bucket = self.bucket['video'] + yield bucket[:] + del bucket[:] + elif len(self.bucket['video_mask_tuple']) == self.batch_size: + bucket = self.bucket['video_mask_tuple'] + yield bucket[:] + del bucket[:] + elif len(self.bucket['image']) == self.batch_size: + bucket = self.bucket['image'] + yield bucket[:] + del bucket[:] + + +@contextmanager +def VideoReader_contextmanager(*args, **kwargs): + vr = VideoReader(*args, **kwargs) + try: + yield vr + finally: + del vr + gc.collect() + + +def get_video_reader_batch(video_reader, batch_index): + frames = video_reader.get_batch(batch_index).asnumpy() + return frames + + +def _read_video_from_dir(video_dir): + frames = [] + frame_paths = sorted(list(glob.glob(os.path.join(video_dir, '*.png')))) + + if not frame_paths: + raise ValueError(f"No PNG files found in directory: {video_dir}") + + for frame_path in frame_paths: + frame = media.read_image(frame_path) + frames.append(frame) + + if not frames: + raise ValueError(f"Failed to read any frames from directory: {video_dir}") + + return np.stack(frames, axis=0) + + +def resize_frame(frame, target_short_side): + h, w, _ = frame.shape + if h < w: + if target_short_side > h: + return frame + new_h = target_short_side + new_w = int(target_short_side * w / h) + else: + if target_short_side > w: + return frame + new_w = target_short_side + new_h = int(target_short_side * h / w) + + resized_frame = cv2.resize(frame, (new_w, new_h)) + return resized_frame + + +class ImageVideoDataset(Dataset): + def __init__( + self, + ann_path, data_root=None, + video_sample_size=512, video_sample_stride=4, video_sample_n_frames=16, + image_sample_size=512, + video_repeat=0, + text_drop_ratio=0.1, + enable_bucket=False, + video_length_drop_start=0.0, + video_length_drop_end=1.0, + enable_inpaint=False, + 
trimask_zeroout_removal=False, + use_quadmask=False, + ablation_binary_mask=False, + ): + # Loading annotations from files + print(f"loading annotations from {ann_path} ...") + if ann_path.endswith('.csv'): + with open(ann_path, 'r') as csvfile: + dataset = list(csv.DictReader(csvfile)) + elif ann_path.endswith('.json'): + dataset = json.load(open(ann_path)) + else: + raise ValueError(f"Unsupported annotation file format: {ann_path}. Only .csv and .json files are supported.") + + self.data_root = data_root + + # It's used to balance num of images and videos. + self.dataset = [] + for data in dataset: + if data.get('type', 'image') != 'video': + self.dataset.append(data) + if video_repeat > 0: + for _ in range(video_repeat): + for data in dataset: + if data.get('type', 'image') == 'video': + self.dataset.append(data) + del dataset + + self.length = len(self.dataset) + print(f"data scale: {self.length}") + # TODO: enable bucket training + self.enable_bucket = enable_bucket + self.text_drop_ratio = text_drop_ratio + self.enable_inpaint = enable_inpaint + self.trimask_zeroout_removal = trimask_zeroout_removal + self.use_quadmask = use_quadmask + self.ablation_binary_mask = ablation_binary_mask + + self.video_length_drop_start = video_length_drop_start + self.video_length_drop_end = video_length_drop_end + + if self.use_quadmask: + print(f"[QUADMASK MODE] Using 4-value quadmask: [0, 63, 127, 255]") + if self.ablation_binary_mask: + print(f"[ABLATION BINARY MASK] Remapping quadmask to binary: [0,63]→0, [127,255]→127") + else: + print(f"[TRIMASK MODE] Using 3-value trimask: [0, 127, 255]") + + # Video params + self.video_sample_stride = video_sample_stride + self.video_sample_n_frames = video_sample_n_frames + self.video_sample_size = tuple(video_sample_size) if not isinstance(video_sample_size, int) else (video_sample_size, video_sample_size) + self.video_transforms = transforms.Compose( + [ + transforms.Resize(min(self.video_sample_size)), + transforms.CenterCrop(self.video_sample_size), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True), + ] + ) + + # Image params + self.image_sample_size = tuple(image_sample_size) if not isinstance(image_sample_size, int) else (image_sample_size, image_sample_size) + self.image_transforms = transforms.Compose([ + transforms.Resize(min(self.image_sample_size)), + transforms.CenterCrop(self.image_sample_size), + transforms.ToTensor(), + transforms.Normalize([0.5, 0.5, 0.5],[0.5, 0.5, 0.5]) + ]) + + self.larger_side_of_image_and_video = max(min(self.image_sample_size), min(self.video_sample_size)) + + def get_batch(self, idx): + data_info = self.dataset[idx % len(self.dataset)] + + if data_info.get('type', 'image') == 'video' and data_info.get('mask_path', None) is None: + video_id, text = data_info['file_path'], data_info['text'] + + if self.data_root is None: + video_dir = video_id + else: + video_dir = os.path.join(self.data_root, video_id) + + with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader: + min_sample_n_frames = min( + self.video_sample_n_frames, + int(len(video_reader) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride) + ) + if min_sample_n_frames == 0: + raise ValueError(f"No Frames in video.") + + video_length = int(self.video_length_drop_end * len(video_reader)) + clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1) + start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) 
if video_length != clip_length else 0 + batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int) + + try: + sample_args = (video_reader, batch_index) + pixel_values = func_timeout( + VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args + ) + resized_frames = [] + for i in range(len(pixel_values)): + frame = pixel_values[i] + resized_frame = resize_frame(frame, self.larger_side_of_image_and_video) + resized_frames.append(resized_frame) + pixel_values = np.array(resized_frames) + except FunctionTimedOut: + raise ValueError(f"Read {idx} timeout.") + except Exception as e: + raise ValueError(f"Failed to extract frames from video. Error is {e}.") + + if not self.enable_bucket: + pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous() + pixel_values = pixel_values / 255. + del video_reader + else: + pixel_values = pixel_values + + if not self.enable_bucket: + pixel_values = self.video_transforms(pixel_values) + + # Random use no text generation + if random.random() < self.text_drop_ratio: + text = '' + return { + 'pixel_values': pixel_values, + 'text': text, + 'data_type': 'video', + } + elif data_info.get('type', 'image') == 'video' and data_info.get('mask_path', None) is not None: # video with known mask + video_path, text = data_info['file_path'], data_info['text'] + mask_video_path = video_path[:-4] + '_mask.mp4' + with VideoReader_contextmanager(video_path, num_threads=2) as video_reader: + min_sample_n_frames = min( + self.video_sample_n_frames, + int(len(video_reader) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride) + ) + if min_sample_n_frames == 0: + raise ValueError(f"No Frames in video.") + + video_length = int(self.video_length_drop_end * len(video_reader)) + clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1) + start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) if video_length != clip_length else 0 + batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int) + + try: + sample_args = (video_reader, batch_index) + pixel_values = func_timeout( + VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args + ) + resized_frames = [] + for i in range(len(pixel_values)): + frame = pixel_values[i] + resized_frame = resize_frame(frame, self.larger_side_of_image_and_video) + resized_frames.append(resized_frame) + input_video = np.array(resized_frames) + except FunctionTimedOut: + raise ValueError(f"Read {idx} timeout.") + except Exception as e: + raise ValueError(f"Failed to extract frames from video. Error is {e}.") + + with VideoReader_contextmanager(mask_video_path, num_threads=2) as video_reader: + try: + sample_args = (video_reader, batch_index) + mask_values = func_timeout( + VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args + ) + resized_frames = [] + for i in range(len(mask_values)): + frame = mask_values[i] + resized_frame = resize_frame(frame, self.larger_side_of_image_and_video) + resized_frames.append(resized_frame) + mask_video = np.array(resized_frames) + except FunctionTimedOut: + raise ValueError(f"Read {idx} timeout.") + except Exception as e: + raise ValueError(f"Failed to extract frames from video. 
Error is {e}.") + + if len(mask_video.shape) == 3: + mask_video = mask_video[..., None] + if mask_video.shape[-1] == 3: + mask_video = mask_video[..., :1] + if len(mask_video.shape) != 4: + raise ValueError(f"mask_video shape is {mask_video.shape}.") + + text = data_info['text'] + if not self.enable_bucket: + input_video = torch.from_numpy(input_video).permute(0, 3, 1, 2).contiguous() / 255. + mask_video = torch.from_numpy(mask_video).permute(0, 3, 1, 2).contiguous() / 255. + + pixel_values = torch.cat([input_video, mask_video], dim=1) + pixel_values = self.video_transforms(pixel_values) + input_video = pixel_values[:, :3] + mask_video = pixel_values[:, 3:] + + # Random use no text generation + if random.random() < self.text_drop_ratio: + text = '' + + return { + 'pixel_values': input_video, + 'mask': mask_video, + 'text': text, + 'data_type': 'video', + } + + elif data_info.get('type', 'image') == 'video_mask_tuple': # object effect removal + sample_dir = data_info['file_path'] + try: + if os.path.exists(os.path.join(sample_dir, 'rgb_full.mp4')): + input_video_path = os.path.join(sample_dir, 'rgb_full.mp4') + target_video_path = os.path.join(sample_dir, 'rgb_removed.mp4') + mask_video_path = os.path.join(sample_dir, 'mask.mp4') + depth_video_path = os.path.join(sample_dir, 'depth_removed.mp4') + + input_video = media.read_video(input_video_path) + target_video = media.read_video(target_video_path) + mask_video = media.read_video(mask_video_path) + + # Load depth map if it exists + depth_video = None + if os.path.exists(depth_video_path): + depth_video = media.read_video(depth_video_path) + + else: + input_video_path = os.path.join(sample_dir, 'input') + target_video_path = os.path.join(sample_dir, 'bg') + mask_video_path = os.path.join(sample_dir, 'trimask') + + input_video = _read_video_from_dir(input_video_path) + target_video = _read_video_from_dir(target_video_path) + mask_video = _read_video_from_dir(mask_video_path) + + # Initialize depth_video as None for this path + depth_video = None + except Exception as e: + print(f"Error loading video_mask_tuple from {sample_dir}: {e}") + import traceback + traceback.print_exc() + raise + + mask_video = 255 - mask_video # will be flipped again in when feeding to model + + if len(mask_video.shape) == 3: + mask_video = mask_video[..., None] + if mask_video.shape[-1] == 3: + mask_video = mask_video[..., :1] + min_sample_n_frames = min( + self.video_sample_n_frames, + int(len(input_video) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride) + ) + video_length = int(self.video_length_drop_end * len(input_video)) + clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1) + start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) if video_length != clip_length else 0 + batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int) + input_video = input_video[batch_index] + target_video = target_video[batch_index] + mask_video = mask_video[batch_index] + if depth_video is not None: + depth_video = depth_video[batch_index] + + resized_inputs = [] + resized_targets = [] + resized_masks = [] + resized_depths = [] + for i in range(len(input_video)): + resized_input = resize_frame(input_video[i], self.larger_side_of_image_and_video) + resized_target = resize_frame(target_video[i], self.larger_side_of_image_and_video) + resized_mask = resize_frame(mask_video[i], self.larger_side_of_image_and_video) + + # 
Apply mask quantization based on mode + if self.ablation_binary_mask: + # Ablation binary mask mode: remap [0, 63, 127, 255] to [0, 127] + # Map 0 and 63 → 0 + # Map 127 and 255 → 127 + resized_mask = np.where(resized_mask <= 95, 0, resized_mask) + resized_mask = np.where(resized_mask > 95, 127, resized_mask) + elif self.use_quadmask: + # Quadmask mode: preserve 4 values [0, 63, 127, 255] + # Quantize to nearest quadmask value for robustness + resized_mask = np.where(resized_mask <= 31, 0, resized_mask) + resized_mask = np.where(np.logical_and(resized_mask > 31, resized_mask <= 95), 63, resized_mask) + resized_mask = np.where(np.logical_and(resized_mask > 95, resized_mask <= 191), 127, resized_mask) + resized_mask = np.where(resized_mask > 191, 255, resized_mask) + else: + # Trimask mode: 3 values [0, 127, 255] + resized_mask = np.where(np.logical_and(resized_mask > 63, resized_mask < 192), 127, resized_mask) + resized_mask = np.where(resized_mask >= 192, 255, resized_mask) + resized_mask = np.where(resized_mask <= 63, 0, resized_mask) + + resized_inputs.append(resized_input) + resized_targets.append(resized_target) + resized_masks.append(resized_mask) + + if depth_video is not None: + resized_depth = resize_frame(depth_video[i], self.larger_side_of_image_and_video) + resized_depths.append(resized_depth) + + input_video = np.array(resized_inputs) + target_video = np.array(resized_targets) + mask_video = np.array(resized_masks) + if depth_video is not None: + depth_video = np.array(resized_depths) + + if len(mask_video.shape) == 3: + mask_video = mask_video[..., None] + if mask_video.shape[-1] == 3: + mask_video = mask_video[..., :1] + if len(mask_video.shape) != 4: + raise ValueError(f"mask_video shape is {mask_video.shape}.") + + text = data_info['text'] + print(f"DEBUG DATASET: Converting to tensors (enable_bucket={self.enable_bucket})...") + if not self.enable_bucket: + print(f"DEBUG DATASET: Converting input_video to tensor...") + input_video = torch.from_numpy(input_video).permute(0, 3, 1, 2).contiguous() / 255. + print(f"DEBUG DATASET: Converting target_video to tensor...") + target_video = torch.from_numpy(target_video).permute(0, 3, 1, 2).contiguous() / 255. + print(f"DEBUG DATASET: Converting mask_video to tensor...") + mask_video = torch.from_numpy(mask_video).permute(0, 3, 1, 2).contiguous() / 255. + + # Process depth video if available + if depth_video is not None: + print(f"DEBUG DATASET: Processing depth_video...") + # IMPORTANT: Copy depth_video to ensure it's not memory-mapped + # Memory-mapped files can cause bus errors on GPU transfer + print(f"DEBUG DATASET: Copying depth_video to ensure not memory-mapped...") + depth_video = np.array(depth_video, copy=True) + print(f"DEBUG DATASET: depth_video copied, shape={depth_video.shape}") + + # Ensure depth has correct shape + if len(depth_video.shape) == 3: + depth_video = depth_video[..., None] + if depth_video.shape[-1] == 3: + # Convert to grayscale if RGB + print(f"DEBUG DATASET: Converting depth to grayscale...") + depth_video = depth_video.mean(axis=-1, keepdims=True) + # Convert to tensor [F, 1, H, W] and normalize to [0, 1] + print(f"DEBUG DATASET: Converting depth to tensor...") + depth_video = torch.from_numpy(depth_video).permute(0, 3, 1, 2).contiguous().float() / 255. 
+ # Ensure tensor is contiguous and owned + print(f"DEBUG DATASET: Cloning depth tensor...") + depth_video = depth_video.clone().contiguous() + print(f"DEBUG DATASET: depth_video final shape: {depth_video.shape}, is_contiguous: {depth_video.is_contiguous()}") + + # Apply transforms to each video separately (they expect 3 channels) + print(f"DEBUG DATASET: Applying video transforms...") + input_video = self.video_transforms(input_video) + target_video = self.video_transforms(target_video) + # Don't normalize mask since it's single channel + print(f"DEBUG DATASET: Normalizing mask_video...") + mask_video = mask_video * 2.0 - 1.0 # Scale to [-1, 1] like other channels + print(f"DEBUG DATASET: All tensors ready (non-bucket mode)") + + else: + # For bucket mode, keep as numpy until collate + # Collate function expects [0, 255] range and will normalize + print(f"DEBUG DATASET: Bucket mode - keeping as numpy in [0, 255] range...") + print(f"DEBUG DATASET: All numpy arrays ready (bucket mode)") + + # Random use no text generation + if random.random() < self.text_drop_ratio: + text = '' + + if self.trimask_zeroout_removal: + input_video = input_video * np.where(mask_video > 200, 0, 1).astype(input_video.dtype) + + result = { + 'pixel_values': target_video, + 'input_condition': input_video, + 'mask': mask_video, + 'text': text, + 'data_type': 'video_mask_tuple', + } + + # Add depth maps if available + if depth_video is not None: + result['depth_maps'] = depth_video + + return result + + else: + image_path, text = data_info['file_path'], data_info['text'] + if self.data_root is not None: + image_path = os.path.join(self.data_root, image_path) + image = Image.open(image_path).convert('RGB') + if not self.enable_bucket: + image = self.image_transforms(image).unsqueeze(0) + else: + image = np.expand_dims(np.array(image), 0) + if random.random() < self.text_drop_ratio: + text = '' + return { + 'pixel_values': image, + 'text': text, + 'data_type': 'image', + } + + def __len__(self): + return self.length + + def __getitem__(self, idx): + data_info = self.dataset[idx % len(self.dataset)] + data_type = data_info.get('type', 'image') + while True: + sample = {} + try: + data_info_local = self.dataset[idx % len(self.dataset)] + data_type_local = data_info_local.get('type', 'image') + if data_type_local != data_type: + raise ValueError("data_type_local != data_type") + + sample = self.get_batch(idx) + sample["idx"] = idx + + if len(sample) > 0: + break + except Exception as e: + import traceback + print(f"Error loading sample at index {idx}:") + print(f"Data info: {self.dataset[idx % len(self.dataset)]}") + print(f"Error: {e}") + traceback.print_exc() + idx = random.randint(0, self.length-1) + + if self.enable_inpaint and not self.enable_bucket: + if "mask" not in sample: + mask = get_random_mask_multi(sample["pixel_values"].size()) + sample["mask"] = mask + else: + mask = sample["mask"] + + if "input_condition" in sample: + mask_pixel_values = sample["input_condition"] + else: + mask_pixel_values = sample["pixel_values"] + mask_pixel_values = mask_pixel_values * (1 - mask) + torch.ones_like(mask_pixel_values) * -1 * mask + + sample["mask_pixel_values"] = mask_pixel_values + + clip_pixel_values = sample["pixel_values"][0].permute(1, 2, 0).contiguous() + clip_pixel_values = (clip_pixel_values * 0.5 + 0.5) * 255 + sample["clip_pixel_values"] = clip_pixel_values + + ref_pixel_values = sample["pixel_values"][0].unsqueeze(0) + if (mask == 1).all(): + ref_pixel_values = torch.ones_like(ref_pixel_values) * -1 + 
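+            # If the mask covers the whole clip there is no clean reference
+            # content, so the reference frame falls back to a constant -1,
+            # the same fill value used for masked pixels above.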
sample["ref_pixel_values"] = ref_pixel_values + + return sample + + +class ImageVideoControlDataset(Dataset): + def __init__( + self, + ann_path, data_root=None, + video_sample_size=512, video_sample_stride=4, video_sample_n_frames=16, + image_sample_size=512, + video_repeat=0, + text_drop_ratio=0.1, + enable_bucket=False, + video_length_drop_start=0.0, + video_length_drop_end=1.0, + enable_inpaint=False, + ): + # Loading annotations from files + print(f"loading annotations from {ann_path} ...") + if ann_path.endswith('.csv'): + with open(ann_path, 'r') as csvfile: + dataset = list(csv.DictReader(csvfile)) + elif ann_path.endswith('.json'): + dataset = json.load(open(ann_path)) + else: + raise ValueError(f"Unsupported annotation file format: {ann_path}. Only .csv and .json files are supported.") + + self.data_root = data_root + + # It's used to balance num of images and videos. + self.dataset = [] + for data in dataset: + if data.get('type', 'image') != 'video': + self.dataset.append(data) + if video_repeat > 0: + for _ in range(video_repeat): + for data in dataset: + if data.get('type', 'image') == 'video': + self.dataset.append(data) + del dataset + + self.length = len(self.dataset) + print(f"data scale: {self.length}") + # TODO: enable bucket training + self.enable_bucket = enable_bucket + self.text_drop_ratio = text_drop_ratio + self.enable_inpaint = enable_inpaint + + self.video_length_drop_start = video_length_drop_start + self.video_length_drop_end = video_length_drop_end + + # Video params + self.video_sample_stride = video_sample_stride + self.video_sample_n_frames = video_sample_n_frames + self.video_sample_size = tuple(video_sample_size) if not isinstance(video_sample_size, int) else (video_sample_size, video_sample_size) + self.video_transforms = transforms.Compose( + [ + transforms.Resize(min(self.video_sample_size)), + transforms.CenterCrop(self.video_sample_size), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True), + ] + ) + + # Image params + self.image_sample_size = tuple(image_sample_size) if not isinstance(image_sample_size, int) else (image_sample_size, image_sample_size) + self.image_transforms = transforms.Compose([ + transforms.Resize(min(self.image_sample_size)), + transforms.CenterCrop(self.image_sample_size), + transforms.ToTensor(), + transforms.Normalize([0.5, 0.5, 0.5],[0.5, 0.5, 0.5]) + ]) + + self.larger_side_of_image_and_video = max(min(self.image_sample_size), min(self.video_sample_size)) + + def get_batch(self, idx): + data_info = self.dataset[idx % len(self.dataset)] + video_id, text = data_info['file_path'], data_info['text'] + + if data_info.get('type', 'image')=='video': + if self.data_root is None: + video_dir = video_id + else: + video_dir = os.path.join(self.data_root, video_id) + + with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader: + min_sample_n_frames = min( + self.video_sample_n_frames, + int(len(video_reader) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride) + ) + if min_sample_n_frames == 0: + raise ValueError(f"No Frames in video.") + + video_length = int(self.video_length_drop_end * len(video_reader)) + clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1) + start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) if video_length != clip_length else 0 + batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int) + + try: + 
sample_args = (video_reader, batch_index) + pixel_values = func_timeout( + VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args + ) + resized_frames = [] + for i in range(len(pixel_values)): + frame = pixel_values[i] + resized_frame = resize_frame(frame, self.larger_side_of_image_and_video) + resized_frames.append(resized_frame) + pixel_values = np.array(resized_frames) + except FunctionTimedOut: + raise ValueError(f"Read {idx} timeout.") + except Exception as e: + raise ValueError(f"Failed to extract frames from video. Error is {e}.") + + if not self.enable_bucket: + pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous() + pixel_values = pixel_values / 255. + del video_reader + else: + pixel_values = pixel_values + + if not self.enable_bucket: + pixel_values = self.video_transforms(pixel_values) + + # Random use no text generation + if random.random() < self.text_drop_ratio: + text = '' + + control_video_id = data_info['control_file_path'] + + if self.data_root is None: + control_video_id = control_video_id + else: + control_video_id = os.path.join(self.data_root, control_video_id) + + with VideoReader_contextmanager(control_video_id, num_threads=2) as control_video_reader: + try: + sample_args = (control_video_reader, batch_index) + control_pixel_values = func_timeout( + VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args + ) + resized_frames = [] + for i in range(len(control_pixel_values)): + frame = control_pixel_values[i] + resized_frame = resize_frame(frame, self.larger_side_of_image_and_video) + resized_frames.append(resized_frame) + control_pixel_values = np.array(resized_frames) + except FunctionTimedOut: + raise ValueError(f"Read {idx} timeout.") + except Exception as e: + raise ValueError(f"Failed to extract frames from video. Error is {e}.") + + if not self.enable_bucket: + control_pixel_values = torch.from_numpy(control_pixel_values).permute(0, 3, 1, 2).contiguous() + control_pixel_values = control_pixel_values / 255. 
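+                    # control_pixel_values was sampled with the same batch_index as
+                    # pixel_values above, so the two clips stay frame-aligned.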
+ del control_video_reader + else: + control_pixel_values = control_pixel_values + + if not self.enable_bucket: + control_pixel_values = self.video_transforms(control_pixel_values) + return pixel_values, control_pixel_values, text, "video" + else: + image_path, text = data_info['file_path'], data_info['text'] + if self.data_root is not None: + image_path = os.path.join(self.data_root, image_path) + image = Image.open(image_path).convert('RGB') + if not self.enable_bucket: + image = self.image_transforms(image).unsqueeze(0) + else: + image = np.expand_dims(np.array(image), 0) + + if random.random() < self.text_drop_ratio: + text = '' + + control_image_id = data_info['control_file_path'] + + if self.data_root is None: + control_image_id = control_image_id + else: + control_image_id = os.path.join(self.data_root, control_image_id) + + control_image = Image.open(control_image_id).convert('RGB') + if not self.enable_bucket: + control_image = self.image_transforms(control_image).unsqueeze(0) + else: + control_image = np.expand_dims(np.array(control_image), 0) + return image, control_image, text, 'image' + + def __len__(self): + return self.length + + def __getitem__(self, idx): + data_info = self.dataset[idx % len(self.dataset)] + data_type = data_info.get('type', 'image') + while True: + sample = {} + try: + data_info_local = self.dataset[idx % len(self.dataset)] + data_type_local = data_info_local.get('type', 'image') + if data_type_local != data_type: + raise ValueError("data_type_local != data_type") + + pixel_values, control_pixel_values, name, data_type = self.get_batch(idx) + sample["pixel_values"] = pixel_values + sample["control_pixel_values"] = control_pixel_values + sample["text"] = name + sample["data_type"] = data_type + sample["idx"] = idx + + if len(sample) > 0: + break + except Exception as e: + print(e, self.dataset[idx % len(self.dataset)]) + idx = random.randint(0, self.length-1) + + if self.enable_inpaint and not self.enable_bucket: + mask = get_random_mask(pixel_values.size()) + mask_pixel_values = pixel_values * (1 - mask) + torch.ones_like(pixel_values) * -1 * mask + sample["mask_pixel_values"] = mask_pixel_values + sample["mask"] = mask + + clip_pixel_values = sample["pixel_values"][0].permute(1, 2, 0).contiguous() + clip_pixel_values = (clip_pixel_values * 0.5 + 0.5) * 255 + sample["clip_pixel_values"] = clip_pixel_values + + ref_pixel_values = sample["pixel_values"][0].unsqueeze(0) + if (mask == 1).all(): + ref_pixel_values = torch.ones_like(ref_pixel_values) * -1 + sample["ref_pixel_values"] = ref_pixel_values + + return sample diff --git a/videox_fun/data/dataset_image_video_warped.py b/videox_fun/data/dataset_image_video_warped.py new file mode 100644 index 0000000000000000000000000000000000000000..0928dae13cfa8712926ef6546d3c477743bedec4 --- /dev/null +++ b/videox_fun/data/dataset_image_video_warped.py @@ -0,0 +1,1092 @@ +import csv +import io +import json +import math +import os +import glob +import random +from threading import Thread +import mediapy as media +import time + +import albumentations +import cv2 +import gc +import numpy as np +import torch +import torchvision.transforms as transforms +from scipy.special import binom + +from func_timeout import func_timeout, FunctionTimedOut +from decord import VideoReader +from PIL import Image +from torch.utils.data import BatchSampler, Sampler +from torch.utils.data.dataset import Dataset +from contextlib import contextmanager + +VIDEO_READER_TIMEOUT = 20 + +bernstein = lambda n, k, t: binom(n,k)* t**k * 
(1.-t)**(n-k) + +# codes from https://stackoverflow.com/questions/50731785/create-random-shape-contour-using-matplotlib +def bezier(points, num=200): + N = len(points) + t = np.linspace(0, 1, num=num) + curve = np.zeros((num, 2)) + for i in range(N): + curve += np.outer(bernstein(N - 1, i, t), points[i]) + return curve + +class Segment(): + def __init__(self, p1, p2, angle1, angle2, **kw): + self.p1 = p1 + self.p2 = p2 + self.angle1 = angle1 + self.angle2 = angle2 + self.numpoints = kw.get("numpoints", 100) + r = kw.get("r", 0.3) + d = np.sqrt(np.sum((self.p2-self.p1)**2)) + self.r = r*d + self.p = np.zeros((4,2)) + self.p[0,:] = self.p1[:] + self.p[3,:] = self.p2[:] + self.calc_intermediate_points(self.r) + + def calc_intermediate_points(self,r): + self.p[1,:] = self.p1 + np.array( + [self.r*np.cos(self.angle1), self.r*np.sin(self.angle1)]) + self.p[2,:] = self.p2 + np.array( + [self.r*np.cos(self.angle2+np.pi), self.r*np.sin(self.angle2+np.pi)]) + self.curve = bezier(self.p,self.numpoints) + + +def get_curve(points, **kw): + segments = [] + for i in range(len(points)-1): + seg = Segment(points[i,:2], points[i+1,:2], points[i,2],points[i+1,2],**kw) + segments.append(seg) + curve = np.concatenate([s.curve for s in segments]) + return segments, curve + + +def ccw_sort(p): + d = p-np.mean(p,axis=0) + s = np.arctan2(d[:,0], d[:,1]) + return p[np.argsort(s),:] + + +def get_bezier_curve(a, rad=0.2, edgy=0): + """ given an array of points *a*, create a curve through + those points. + *rad* is a number between 0 and 1 to steer the distance of + control points. + *edgy* is a parameter which controls how "edgy" the curve is, + edgy=0 is smoothest.""" + p = np.arctan(edgy)/np.pi+.5 + a = ccw_sort(a) + a = np.append(a, np.atleast_2d(a[0,:]), axis=0) + d = np.diff(a, axis=0) + ang = np.arctan2(d[:,1],d[:,0]) + f = lambda ang : (ang>=0)*ang + (ang<0)*(ang+2*np.pi) + ang = f(ang) + ang1 = ang + ang2 = np.roll(ang,1) + ang = p*ang1 + (1-p)*ang2 + (np.abs(ang2-ang1) > np.pi )*np.pi + ang = np.append(ang, [ang[0]]) + a = np.append(a, np.atleast_2d(ang).T, axis=1) + s, c = get_curve(a, r=rad, method="var") + x,y = c.T + return x,y, a + + +def get_random_points(n=5, scale=0.8, mindst=None, rec=0): + """ create n random points in the unit square, which are *mindst* + apart, then scale them.""" + mindst = mindst or .7/n + a = np.random.rand(n,2) + d = np.sqrt(np.sum(np.diff(ccw_sort(a), axis=0), axis=1)**2) + if np.all(d >= mindst) or rec>=200: + return a*scale + else: + return get_random_points(n=n, scale=scale, mindst=mindst, rec=rec+1) + + +def fill_mask(shape, x, y, fill_val=255): + _, _, h, w = shape + mask = np.zeros((h, w), dtype=np.uint8) + mask = cv2.fillPoly(mask, [np.array([x, y], np.int32).T], fill_val) + return mask + + +def random_shift(x, y, scale_range = [0.2, 0.7], trans_perturb_range=[-0.2, 0.2]): + w_scale = np.random.uniform(scale_range[0], scale_range[1]) + h_scale = np.random.uniform(scale_range[0], scale_range[1]) + x_trans = np.random.uniform(0., 1. - w_scale) + y_trans = np.random.uniform(0., 1. 
- h_scale) + x_shifted = x * w_scale + x_trans + np.random.uniform(trans_perturb_range[0], trans_perturb_range[1]) + y_shifted = y * h_scale + y_trans + np.random.uniform(trans_perturb_range[0], trans_perturb_range[1]) + return x_shifted, y_shifted + + +def get_random_shape_mask( + shape, n_pts_range=[3, 10], rad_range=[0.0, 1.0], edgy_range=[0.0, 0.1], n_keyframes_range=[2, 25], + random_drop_range=[0.0, 0.2], + ): + f, _, h, w = shape + + n_pts = np.random.randint(n_pts_range[0], n_pts_range[1]) + n_keyframes = np.random.randint(n_keyframes_range[0], n_keyframes_range[1]) + keyframe_interval = f // (n_keyframes - 1) + keyframe_indices = list(range(0, f, keyframe_interval)) + if len(keyframe_indices) == n_keyframes: + keyframe_indices[-1] = f - 1 + else: + keyframe_indices.append(f - 1) + x_all_frames, y_all_frames = [], [] + for i, keyframe_index in enumerate(keyframe_indices): + rad = np.random.uniform(rad_range[0], rad_range[1]) + edgy = np.random.uniform(edgy_range[0], edgy_range[1]) + x_kf, y_kf, _ = get_bezier_curve(get_random_points(n=n_pts), rad=rad, edgy=edgy) + x_kf, y_kf = random_shift(x_kf, y_kf) + if i == 0: + x_all_frames.append(x_kf[None]) + y_all_frames.append(y_kf[None]) + else: + x_interval = np.linspace(x_all_frames[-1][-1], x_kf, keyframe_index - keyframe_indices[i - 1] + 1) + y_interval = np.linspace(y_all_frames[-1][-1], y_kf, keyframe_index - keyframe_indices[i - 1] + 1) + x_all_frames.append(x_interval[1:]) + y_all_frames.append(y_interval[1:]) + x_all_frames = np.concatenate(x_all_frames, axis=0) + y_all_frames = np.concatenate(y_all_frames, axis=0) + + masks = [] + for x, y in zip(x_all_frames, y_all_frames): + x = np.round(x * w).astype(np.int32) + y = np.round(y * h).astype(np.int32) + mask = fill_mask(shape, x, y) + masks.append(mask) + masks = np.stack(masks, axis=0).astype(float) / 255. 
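+    # Shape convention, per the unpacking above: shape = (f, c, h, w); the return
+    # value is an (f, h, w) float array in [0, 1], e.g. a (16, 3, 256, 256) input
+    # yields a (16, 256, 256) mask sequence.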
+ + n_frames_random_drop = int(np.random.uniform(random_drop_range[0], random_drop_range[1]) * f) + drop_index = np.random.randint(0, f - n_frames_random_drop) + masks[drop_index:drop_index + n_frames_random_drop] = 0 + + return masks # (f, h, w), [0, 1] + + +def get_random_mask(shape, mask_type_probs=[0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.8]): + f, c, h, w = shape + + if f != 1: + mask_index = np.random.choice([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], p=mask_type_probs) + else: + mask_index = np.random.choice([0, 1], p = [0.2, 0.8]) + mask = torch.zeros((f, 1, h, w), dtype=torch.uint8) + + if mask_index == 0: + center_x = torch.randint(0, w, (1,)).item() + center_y = torch.randint(0, h, (1,)).item() + block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item() + block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item() + + start_x = max(center_x - block_size_x // 2, 0) + end_x = min(center_x + block_size_x // 2, w) + start_y = max(center_y - block_size_y // 2, 0) + end_y = min(center_y + block_size_y // 2, h) + mask[:, :, start_y:end_y, start_x:end_x] = 1 + elif mask_index == 1: + mask[:, :, :, :] = 1 + elif mask_index == 2: + mask_frame_index = np.random.randint(1, 5) + mask[mask_frame_index:, :, :, :] = 1 + elif mask_index == 3: + mask_frame_index = np.random.randint(1, 5) + mask[mask_frame_index:-mask_frame_index, :, :, :] = 1 + elif mask_index == 4: + center_x = torch.randint(0, w, (1,)).item() + center_y = torch.randint(0, h, (1,)).item() + block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item() + block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item() + + start_x = max(center_x - block_size_x // 2, 0) + end_x = min(center_x + block_size_x // 2, w) + start_y = max(center_y - block_size_y // 2, 0) + end_y = min(center_y + block_size_y // 2, h) + + mask_frame_before = np.random.randint(0, f // 2) + mask_frame_after = np.random.randint(f // 2, f) + mask[mask_frame_before:mask_frame_after, :, start_y:end_y, start_x:end_x] = 1 + elif mask_index == 5: + mask = torch.randint(0, 2, (f, 1, h, w), dtype=torch.uint8) + elif mask_index == 6: + num_frames_to_mask = random.randint(1, max(f // 2, 1)) + frames_to_mask = random.sample(range(f), num_frames_to_mask) + + for i in frames_to_mask: + block_height = random.randint(1, h // 4) + block_width = random.randint(1, w // 4) + top_left_y = random.randint(0, h - block_height) + top_left_x = random.randint(0, w - block_width) + mask[i, 0, top_left_y:top_left_y + block_height, top_left_x:top_left_x + block_width] = 1 + elif mask_index == 7: + center_x = torch.randint(0, w, (1,)).item() + center_y = torch.randint(0, h, (1,)).item() + a = torch.randint(min(w, h) // 8, min(w, h) // 4, (1,)).item() + b = torch.randint(min(h, w) // 8, min(h, w) // 4, (1,)).item() + + for i in range(h): + for j in range(w): + if ((i - center_y) ** 2) / (b ** 2) + ((j - center_x) ** 2) / (a ** 2) < 1: + mask[:, :, i, j] = 1 + elif mask_index == 8: + center_x = torch.randint(0, w, (1,)).item() + center_y = torch.randint(0, h, (1,)).item() + radius = torch.randint(min(h, w) // 8, min(h, w) // 4, (1,)).item() + for i in range(h): + for j in range(w): + if (i - center_y) ** 2 + (j - center_x) ** 2 < radius ** 2: + mask[:, :, i, j] = 1 + elif mask_index == 9: + for idx in range(f): + if np.random.rand() > 0.5: + mask[idx, :, :, :] = 1 + else: + num_objs = np.random.randint(1, 4) + mask_npy = get_random_shape_mask(shape) + for i in range(num_objs - 1): + mask_npy += get_random_shape_mask(shape).clip(0, 1) + + mask = 
torch.from_numpy(mask_npy).unsqueeze(1) + + return mask.float() + + +def get_random_mask_multi(shape, mask_type_probs, range_num_masks=[1, 7]): + num_masks = np.random.randint(range_num_masks[0], range_num_masks[1]) + masks = None + for _ in range(num_masks): + mask = get_random_mask(shape, mask_type_probs) + if masks is None: + masks = mask + else: + masks = (masks + mask).clip(0, 1) + return masks + + +class ImageVideoSampler(BatchSampler): + """A sampler wrapper for grouping images with similar aspect ratio into a same batch. + + Args: + sampler (Sampler): Base sampler. + dataset (Dataset): Dataset providing data information. + batch_size (int): Size of mini-batch. + drop_last (bool): If ``True``, the sampler will drop the last batch if + its size would be less than ``batch_size``. + aspect_ratios (dict): The predefined aspect ratios. + """ + + def __init__(self, + sampler: Sampler, + dataset: Dataset, + batch_size: int, + drop_last: bool = False + ) -> None: + if not isinstance(sampler, Sampler): + raise TypeError('sampler should be an instance of ``Sampler``, ' + f'but got {sampler}') + if not isinstance(batch_size, int) or batch_size <= 0: + raise ValueError('batch_size should be a positive integer value, ' + f'but got batch_size={batch_size}') + self.sampler = sampler + self.dataset = dataset + self.batch_size = batch_size + self.drop_last = drop_last + + # buckets for each aspect ratio + self.bucket = {'image':[], 'video':[], 'video_mask_tuple':[]} + + def __iter__(self): + for idx in self.sampler: + content_type = self.dataset.dataset[idx].get('type', 'image') + self.bucket[content_type].append(idx) + + # yield a batch of indices in the same aspect ratio group + if len(self.bucket['video']) == self.batch_size: + bucket = self.bucket['video'] + yield bucket[:] + del bucket[:] + elif len(self.bucket['video_mask_tuple']) == self.batch_size: + bucket = self.bucket['video_mask_tuple'] + yield bucket[:] + del bucket[:] + elif len(self.bucket['image']) == self.batch_size: + bucket = self.bucket['image'] + yield bucket[:] + del bucket[:] + + +@contextmanager +def VideoReader_contextmanager(*args, **kwargs): + vr = VideoReader(*args, **kwargs) + try: + yield vr + finally: + del vr + gc.collect() + + +def get_video_reader_batch(video_reader, batch_index): + frames = video_reader.get_batch(batch_index).asnumpy() + return frames + + +def _read_video_from_dir(video_dir): + frames = [] + frame_paths = sorted(list(glob.glob(os.path.join(video_dir, '*.png')))) + + if not frame_paths: + raise ValueError(f"No PNG files found in directory: {video_dir}") + + for frame_path in frame_paths: + frame = media.read_image(frame_path) + frames.append(frame) + + if not frames: + raise ValueError(f"Failed to read any frames from directory: {video_dir}") + + return np.stack(frames, axis=0) + + +def resize_frame(frame, target_short_side): + h, w, _ = frame.shape + if h < w: + if target_short_side > h: + return frame + new_h = target_short_side + new_w = int(target_short_side * w / h) + else: + if target_short_side > w: + return frame + new_w = target_short_side + new_h = int(target_short_side * h / w) + + resized_frame = cv2.resize(frame, (new_w, new_h)) + return resized_frame + + +class ImageVideoDataset(Dataset): + def __init__( + self, + ann_path, data_root=None, + video_sample_size=512, video_sample_stride=4, video_sample_n_frames=16, + image_sample_size=512, + video_repeat=0, + text_drop_ratio=0.1, + enable_bucket=False, + video_length_drop_start=0.0, + video_length_drop_end=1.0, + enable_inpaint=False, + 
trimask_zeroout_removal=False, + use_quadmask=False, + ablation_binary_mask=False, + ): + # Loading annotations from files + print(f"loading annotations from {ann_path} ...") + if ann_path.endswith('.csv'): + with open(ann_path, 'r') as csvfile: + dataset = list(csv.DictReader(csvfile)) + elif ann_path.endswith('.json'): + dataset = json.load(open(ann_path)) + else: + raise ValueError(f"Unsupported annotation file format: {ann_path}. Only .csv and .json files are supported.") + + self.data_root = data_root + + # It's used to balance num of images and videos. + self.dataset = [] + for data in dataset: + if data.get('type', 'image') != 'video': + self.dataset.append(data) + if video_repeat > 0: + for _ in range(video_repeat): + for data in dataset: + if data.get('type', 'image') == 'video': + self.dataset.append(data) + del dataset + + self.length = len(self.dataset) + print(f"data scale: {self.length}") + # TODO: enable bucket training + self.enable_bucket = enable_bucket + self.text_drop_ratio = text_drop_ratio + self.enable_inpaint = enable_inpaint + self.trimask_zeroout_removal = trimask_zeroout_removal + self.use_quadmask = use_quadmask + self.ablation_binary_mask = ablation_binary_mask + + self.video_length_drop_start = video_length_drop_start + self.video_length_drop_end = video_length_drop_end + + if self.use_quadmask: + print(f"[QUADMASK MODE] Using 4-value quadmask: [0, 63, 127, 255]") + if self.ablation_binary_mask: + print(f"[ABLATION BINARY MASK] Remapping quadmask to binary: [0,63]→0, [127,255]→127") + else: + print(f"[TRIMASK MODE] Using 3-value trimask: [0, 127, 255]") + + # Video params + self.video_sample_stride = video_sample_stride + self.video_sample_n_frames = video_sample_n_frames + self.video_sample_size = tuple(video_sample_size) if not isinstance(video_sample_size, int) else (video_sample_size, video_sample_size) + self.video_transforms = transforms.Compose( + [ + transforms.Resize(min(self.video_sample_size)), + transforms.CenterCrop(self.video_sample_size), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True), + ] + ) + + # Image params + self.image_sample_size = tuple(image_sample_size) if not isinstance(image_sample_size, int) else (image_sample_size, image_sample_size) + self.image_transforms = transforms.Compose([ + transforms.Resize(min(self.image_sample_size)), + transforms.CenterCrop(self.image_sample_size), + transforms.ToTensor(), + transforms.Normalize([0.5, 0.5, 0.5],[0.5, 0.5, 0.5]) + ]) + + self.larger_side_of_image_and_video = max(min(self.image_sample_size), min(self.video_sample_size)) + + def get_batch(self, idx): + data_info = self.dataset[idx % len(self.dataset)] + + if data_info.get('type', 'image') == 'video' and data_info.get('mask_path', None) is None: + video_id, text = data_info['file_path'], data_info['text'] + + if self.data_root is None: + video_dir = video_id + else: + video_dir = os.path.join(self.data_root, video_id) + + with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader: + min_sample_n_frames = min( + self.video_sample_n_frames, + int(len(video_reader) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride) + ) + if min_sample_n_frames == 0: + raise ValueError(f"No Frames in video.") + + video_length = int(self.video_length_drop_end * len(video_reader)) + clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1) + start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) 
if video_length != clip_length else 0 + batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int) + + try: + sample_args = (video_reader, batch_index) + pixel_values = func_timeout( + VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args + ) + resized_frames = [] + for i in range(len(pixel_values)): + frame = pixel_values[i] + resized_frame = resize_frame(frame, self.larger_side_of_image_and_video) + resized_frames.append(resized_frame) + pixel_values = np.array(resized_frames) + except FunctionTimedOut: + raise ValueError(f"Read {idx} timeout.") + except Exception as e: + raise ValueError(f"Failed to extract frames from video. Error is {e}.") + + if not self.enable_bucket: + pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous() + pixel_values = pixel_values / 255. + del video_reader + else: + pixel_values = pixel_values + + if not self.enable_bucket: + pixel_values = self.video_transforms(pixel_values) + + # Random use no text generation + if random.random() < self.text_drop_ratio: + text = '' + return { + 'pixel_values': pixel_values, + 'text': text, + 'data_type': 'video', + } + elif data_info.get('type', 'image') == 'video' and data_info.get('mask_path', None) is not None: # video with known mask + video_path, text = data_info['file_path'], data_info['text'] + mask_video_path = video_path[:-4] + '_mask.mp4' + with VideoReader_contextmanager(video_path, num_threads=2) as video_reader: + min_sample_n_frames = min( + self.video_sample_n_frames, + int(len(video_reader) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride) + ) + if min_sample_n_frames == 0: + raise ValueError(f"No Frames in video.") + + video_length = int(self.video_length_drop_end * len(video_reader)) + clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1) + start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) if video_length != clip_length else 0 + batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int) + + try: + sample_args = (video_reader, batch_index) + pixel_values = func_timeout( + VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args + ) + resized_frames = [] + for i in range(len(pixel_values)): + frame = pixel_values[i] + resized_frame = resize_frame(frame, self.larger_side_of_image_and_video) + resized_frames.append(resized_frame) + input_video = np.array(resized_frames) + except FunctionTimedOut: + raise ValueError(f"Read {idx} timeout.") + except Exception as e: + raise ValueError(f"Failed to extract frames from video. Error is {e}.") + + with VideoReader_contextmanager(mask_video_path, num_threads=2) as video_reader: + try: + sample_args = (video_reader, batch_index) + mask_values = func_timeout( + VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args + ) + resized_frames = [] + for i in range(len(mask_values)): + frame = mask_values[i] + resized_frame = resize_frame(frame, self.larger_side_of_image_and_video) + resized_frames.append(resized_frame) + mask_video = np.array(resized_frames) + except FunctionTimedOut: + raise ValueError(f"Read {idx} timeout.") + except Exception as e: + raise ValueError(f"Failed to extract frames from video. 
Error is {e}.") + + if len(mask_video.shape) == 3: + mask_video = mask_video[..., None] + if mask_video.shape[-1] == 3: + mask_video = mask_video[..., :1] + if len(mask_video.shape) != 4: + raise ValueError(f"mask_video shape is {mask_video.shape}.") + + text = data_info['text'] + if not self.enable_bucket: + input_video = torch.from_numpy(input_video).permute(0, 3, 1, 2).contiguous() / 255. + mask_video = torch.from_numpy(mask_video).permute(0, 3, 1, 2).contiguous() / 255. + + pixel_values = torch.cat([input_video, mask_video], dim=1) + pixel_values = self.video_transforms(pixel_values) + input_video = pixel_values[:, :3] + mask_video = pixel_values[:, 3:] + + # Random use no text generation + if random.random() < self.text_drop_ratio: + text = '' + + return { + 'pixel_values': input_video, + 'mask': mask_video, + 'text': text, + 'data_type': 'video', + } + + elif data_info.get('type', 'image') == 'video_mask_tuple': # object effect removal + sample_dir = data_info['file_path'] if self.data_root is None else os.path.join(self.data_root, data_info['file_path']) + try: + if os.path.exists(os.path.join(sample_dir, 'rgb_full.mp4')): + input_video_path = os.path.join(sample_dir, 'rgb_full.mp4') + target_video_path = os.path.join(sample_dir, 'rgb_removed.mp4') + mask_video_path = os.path.join(sample_dir, 'mask.mp4') + depth_video_path = os.path.join(sample_dir, 'depth_removed.mp4') + + input_video = media.read_video(input_video_path) + target_video = media.read_video(target_video_path) + mask_video = media.read_video(mask_video_path) + + # Load depth map if it exists + depth_video = None + if os.path.exists(depth_video_path): + depth_video = media.read_video(depth_video_path) + + else: + input_video_path = os.path.join(sample_dir, 'input') + target_video_path = os.path.join(sample_dir, 'bg') + mask_video_path = os.path.join(sample_dir, 'trimask') + + input_video = _read_video_from_dir(input_video_path) + target_video = _read_video_from_dir(target_video_path) + mask_video = _read_video_from_dir(mask_video_path) + + # Initialize depth_video as None for this path + depth_video = None + except Exception as e: + print(f"Error loading video_mask_tuple from {sample_dir}: {e}") + import traceback + traceback.print_exc() + raise + + mask_video = 255 - mask_video # will be flipped again in when feeding to model + + if len(mask_video.shape) == 3: + mask_video = mask_video[..., None] + if mask_video.shape[-1] == 3: + mask_video = mask_video[..., :1] + min_sample_n_frames = min( + self.video_sample_n_frames, + int(len(input_video) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride) + ) + video_length = int(self.video_length_drop_end * len(input_video)) + clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1) + start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) if video_length != clip_length else 0 + batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int) + input_video = input_video[batch_index] + target_video = target_video[batch_index] + mask_video = mask_video[batch_index] + if depth_video is not None: + depth_video = depth_video[batch_index] + + resized_inputs = [] + resized_targets = [] + resized_masks = [] + resized_depths = [] + for i in range(len(input_video)): + resized_input = resize_frame(input_video[i], self.larger_side_of_image_and_video) + resized_target = resize_frame(target_video[i], self.larger_side_of_image_and_video) + 
resized_mask = resize_frame(mask_video[i], self.larger_side_of_image_and_video) + + # Apply mask quantization based on mode + if self.ablation_binary_mask: + # Ablation binary mask mode: remap [0, 63, 127, 255] to [0, 127] + # Map 0 and 63 → 0 + # Map 127 and 255 → 127 + resized_mask = np.where(resized_mask <= 95, 0, resized_mask) + resized_mask = np.where(resized_mask > 95, 127, resized_mask) + elif self.use_quadmask: + # Quadmask mode: preserve 4 values [0, 63, 127, 255] + # Quantize to nearest quadmask value for robustness + resized_mask = np.where(resized_mask <= 31, 0, resized_mask) + resized_mask = np.where(np.logical_and(resized_mask > 31, resized_mask <= 95), 63, resized_mask) + resized_mask = np.where(np.logical_and(resized_mask > 95, resized_mask <= 191), 127, resized_mask) + resized_mask = np.where(resized_mask > 191, 255, resized_mask) + else: + # Trimask mode: 3 values [0, 127, 255] + resized_mask = np.where(np.logical_and(resized_mask > 63, resized_mask < 192), 127, resized_mask) + resized_mask = np.where(resized_mask >= 192, 255, resized_mask) + resized_mask = np.where(resized_mask <= 63, 0, resized_mask) + + resized_inputs.append(resized_input) + resized_targets.append(resized_target) + resized_masks.append(resized_mask) + + if depth_video is not None: + resized_depth = resize_frame(depth_video[i], self.larger_side_of_image_and_video) + resized_depths.append(resized_depth) + + input_video = np.array(resized_inputs) + target_video = np.array(resized_targets) + mask_video = np.array(resized_masks) + if depth_video is not None: + depth_video = np.array(resized_depths) + + if len(mask_video.shape) == 3: + mask_video = mask_video[..., None] + if mask_video.shape[-1] == 3: + mask_video = mask_video[..., :1] + if len(mask_video.shape) != 4: + raise ValueError(f"mask_video shape is {mask_video.shape}.") + + text = data_info['text'] + print(f"DEBUG DATASET: Converting to tensors (enable_bucket={self.enable_bucket})...") + if not self.enable_bucket: + print(f"DEBUG DATASET: Converting input_video to tensor...") + input_video = torch.from_numpy(input_video).permute(0, 3, 1, 2).contiguous() / 255. + print(f"DEBUG DATASET: Converting target_video to tensor...") + target_video = torch.from_numpy(target_video).permute(0, 3, 1, 2).contiguous() / 255. + print(f"DEBUG DATASET: Converting mask_video to tensor...") + mask_video = torch.from_numpy(mask_video).permute(0, 3, 1, 2).contiguous() / 255. + + # Process depth video if available + if depth_video is not None: + print(f"DEBUG DATASET: Processing depth_video...") + # IMPORTANT: Copy depth_video to ensure it's not memory-mapped + # Memory-mapped files can cause bus errors on GPU transfer + print(f"DEBUG DATASET: Copying depth_video to ensure not memory-mapped...") + depth_video = np.array(depth_video, copy=True) + print(f"DEBUG DATASET: depth_video copied, shape={depth_video.shape}") + + # Ensure depth has correct shape + if len(depth_video.shape) == 3: + depth_video = depth_video[..., None] + if depth_video.shape[-1] == 3: + # Convert to grayscale if RGB + print(f"DEBUG DATASET: Converting depth to grayscale...") + depth_video = depth_video.mean(axis=-1, keepdims=True) + # Convert to tensor [F, 1, H, W] and normalize to [0, 1] + print(f"DEBUG DATASET: Converting depth to tensor...") + depth_video = torch.from_numpy(depth_video).permute(0, 3, 1, 2).contiguous().float() / 255. 
+ # Ensure tensor is contiguous and owned + print(f"DEBUG DATASET: Cloning depth tensor...") + depth_video = depth_video.clone().contiguous() + print(f"DEBUG DATASET: depth_video final shape: {depth_video.shape}, is_contiguous: {depth_video.is_contiguous()}") + + # Apply transforms to each video separately (they expect 3 channels) + print(f"DEBUG DATASET: Applying video transforms...") + input_video = self.video_transforms(input_video) + target_video = self.video_transforms(target_video) + # Don't normalize mask since it's single channel + print(f"DEBUG DATASET: Normalizing mask_video...") + mask_video = mask_video * 2.0 - 1.0 # Scale to [-1, 1] like other channels + print(f"DEBUG DATASET: All tensors ready (non-bucket mode)") + + else: + # For bucket mode, keep as numpy until collate + # Collate function expects [0, 255] range and will normalize + print(f"DEBUG DATASET: Bucket mode - keeping as numpy in [0, 255] range...") + print(f"DEBUG DATASET: All numpy arrays ready (bucket mode)") + + # Load warped noise - REQUIRED if specified in dataset + warped_noise = None + if 'warped_noise_path' in data_info: + warped_noise_dir = data_info['warped_noise_path'] if self.data_root is None else os.path.join(self.data_root, data_info['warped_noise_path']) + noise_path = os.path.join(warped_noise_dir, 'noises.npy') + + if not os.path.exists(noise_path): + raise FileNotFoundError( + f"Warped noise path specified in dataset but file not found: {noise_path}\n" + f"Make sure you've generated warped noise for all videos in the dataset." + ) + + try: + warped_noise = np.load(noise_path) # Shape: (T, C, H, W) in float16 + warped_noise = torch.from_numpy(warped_noise).float() # Convert to torch tensor + except Exception as e: + raise RuntimeError( + f"Failed to load warped noise from {noise_path}: {e}\n" + f"The noise file may be corrupted. Try regenerating it." 
+ ) + + # Random use no text generation + if random.random() < self.text_drop_ratio: + text = '' + + if self.trimask_zeroout_removal: + input_video = input_video * np.where(mask_video > 200, 0, 1).astype(input_video.dtype) + + result = { + 'pixel_values': target_video, + 'input_condition': input_video, + 'mask': mask_video, + 'text': text, + 'data_type': 'video_mask_tuple', + } + + # Add depth maps if available + if depth_video is not None: + result['depth_maps'] = depth_video + + # Add warped noise to batch if available + if warped_noise is not None: + result['warped_noise'] = warped_noise + + return result + + else: + image_path, text = data_info['file_path'], data_info['text'] + if self.data_root is not None: + image_path = os.path.join(self.data_root, image_path) + image = Image.open(image_path).convert('RGB') + if not self.enable_bucket: + image = self.image_transforms(image).unsqueeze(0) + else: + image = np.expand_dims(np.array(image), 0) + if random.random() < self.text_drop_ratio: + text = '' + return { + 'pixel_values': image, + 'text': text, + 'data_type': 'image', + } + + def __len__(self): + return self.length + + def __getitem__(self, idx): + data_info = self.dataset[idx % len(self.dataset)] + data_type = data_info.get('type', 'image') + while True: + sample = {} + try: + data_info_local = self.dataset[idx % len(self.dataset)] + data_type_local = data_info_local.get('type', 'image') + if data_type_local != data_type: + raise ValueError("data_type_local != data_type") + + sample = self.get_batch(idx) + sample["idx"] = idx + + if len(sample) > 0: + break + except Exception as e: + import traceback + print(f"Error loading sample at index {idx}:") + print(f"Data info: {self.dataset[idx % len(self.dataset)]}") + print(f"Error: {e}") + traceback.print_exc() + idx = random.randint(0, self.length-1) + + if self.enable_inpaint and not self.enable_bucket: + if "mask" not in sample: + mask = get_random_mask_multi(sample["pixel_values"].size()) + sample["mask"] = mask + else: + mask = sample["mask"] + + if "input_condition" in sample: + mask_pixel_values = sample["input_condition"] + else: + mask_pixel_values = sample["pixel_values"] + mask_pixel_values = mask_pixel_values * (1 - mask) + torch.ones_like(mask_pixel_values) * -1 * mask + + sample["mask_pixel_values"] = mask_pixel_values + + clip_pixel_values = sample["pixel_values"][0].permute(1, 2, 0).contiguous() + clip_pixel_values = (clip_pixel_values * 0.5 + 0.5) * 255 + sample["clip_pixel_values"] = clip_pixel_values + + ref_pixel_values = sample["pixel_values"][0].unsqueeze(0) + if (mask == 1).all(): + ref_pixel_values = torch.ones_like(ref_pixel_values) * -1 + sample["ref_pixel_values"] = ref_pixel_values + + return sample + + +class ImageVideoControlDataset(Dataset): + def __init__( + self, + ann_path, data_root=None, + video_sample_size=512, video_sample_stride=4, video_sample_n_frames=16, + image_sample_size=512, + video_repeat=0, + text_drop_ratio=0.1, + enable_bucket=False, + video_length_drop_start=0.0, + video_length_drop_end=1.0, + enable_inpaint=False, + ): + # Loading annotations from files + print(f"loading annotations from {ann_path} ...") + if ann_path.endswith('.csv'): + with open(ann_path, 'r') as csvfile: + dataset = list(csv.DictReader(csvfile)) + elif ann_path.endswith('.json'): + dataset = json.load(open(ann_path)) + else: + raise ValueError(f"Unsupported annotation file format: {ann_path}. 
Only .csv and .json files are supported.") + + self.data_root = data_root + + # It's used to balance num of images and videos. + self.dataset = [] + for data in dataset: + if data.get('type', 'image') != 'video': + self.dataset.append(data) + if video_repeat > 0: + for _ in range(video_repeat): + for data in dataset: + if data.get('type', 'image') == 'video': + self.dataset.append(data) + del dataset + + self.length = len(self.dataset) + print(f"data scale: {self.length}") + # TODO: enable bucket training + self.enable_bucket = enable_bucket + self.text_drop_ratio = text_drop_ratio + self.enable_inpaint = enable_inpaint + + self.video_length_drop_start = video_length_drop_start + self.video_length_drop_end = video_length_drop_end + + # Video params + self.video_sample_stride = video_sample_stride + self.video_sample_n_frames = video_sample_n_frames + self.video_sample_size = tuple(video_sample_size) if not isinstance(video_sample_size, int) else (video_sample_size, video_sample_size) + self.video_transforms = transforms.Compose( + [ + transforms.Resize(min(self.video_sample_size)), + transforms.CenterCrop(self.video_sample_size), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True), + ] + ) + + # Image params + self.image_sample_size = tuple(image_sample_size) if not isinstance(image_sample_size, int) else (image_sample_size, image_sample_size) + self.image_transforms = transforms.Compose([ + transforms.Resize(min(self.image_sample_size)), + transforms.CenterCrop(self.image_sample_size), + transforms.ToTensor(), + transforms.Normalize([0.5, 0.5, 0.5],[0.5, 0.5, 0.5]) + ]) + + self.larger_side_of_image_and_video = max(min(self.image_sample_size), min(self.video_sample_size)) + + def get_batch(self, idx): + data_info = self.dataset[idx % len(self.dataset)] + video_id, text = data_info['file_path'], data_info['text'] + + if data_info.get('type', 'image')=='video': + if self.data_root is None: + video_dir = video_id + else: + video_dir = os.path.join(self.data_root, video_id) + + with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader: + min_sample_n_frames = min( + self.video_sample_n_frames, + int(len(video_reader) * (self.video_length_drop_end - self.video_length_drop_start) // self.video_sample_stride) + ) + if min_sample_n_frames == 0: + raise ValueError(f"No Frames in video.") + + video_length = int(self.video_length_drop_end * len(video_reader)) + clip_length = min(video_length, (min_sample_n_frames - 1) * self.video_sample_stride + 1) + start_idx = random.randint(int(self.video_length_drop_start * video_length), video_length - clip_length) if video_length != clip_length else 0 + batch_index = np.linspace(start_idx, start_idx + clip_length - 1, min_sample_n_frames, dtype=int) + + try: + sample_args = (video_reader, batch_index) + pixel_values = func_timeout( + VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args + ) + resized_frames = [] + for i in range(len(pixel_values)): + frame = pixel_values[i] + resized_frame = resize_frame(frame, self.larger_side_of_image_and_video) + resized_frames.append(resized_frame) + pixel_values = np.array(resized_frames) + except FunctionTimedOut: + raise ValueError(f"Read {idx} timeout.") + except Exception as e: + raise ValueError(f"Failed to extract frames from video. Error is {e}.") + + if not self.enable_bucket: + pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous() + pixel_values = pixel_values / 255. 
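+                # NOTE (explanatory comment, not part of the original patch): `batch_index` holds
+                # `min_sample_n_frames` evenly spaced indices spanning `clip_length` source frames,
+                # drawn from the [video_length_drop_start, video_length_drop_end] window; the same
+                # index array is reused below for the control video so that target and control
+                # frames stay temporally aligned.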
+ del video_reader + else: + pixel_values = pixel_values + + if not self.enable_bucket: + pixel_values = self.video_transforms(pixel_values) + + # Random use no text generation + if random.random() < self.text_drop_ratio: + text = '' + + control_video_id = data_info['control_file_path'] + + if self.data_root is None: + control_video_id = control_video_id + else: + control_video_id = os.path.join(self.data_root, control_video_id) + + with VideoReader_contextmanager(control_video_id, num_threads=2) as control_video_reader: + try: + sample_args = (control_video_reader, batch_index) + control_pixel_values = func_timeout( + VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args + ) + resized_frames = [] + for i in range(len(control_pixel_values)): + frame = control_pixel_values[i] + resized_frame = resize_frame(frame, self.larger_side_of_image_and_video) + resized_frames.append(resized_frame) + control_pixel_values = np.array(resized_frames) + except FunctionTimedOut: + raise ValueError(f"Read {idx} timeout.") + except Exception as e: + raise ValueError(f"Failed to extract frames from video. Error is {e}.") + + if not self.enable_bucket: + control_pixel_values = torch.from_numpy(control_pixel_values).permute(0, 3, 1, 2).contiguous() + control_pixel_values = control_pixel_values / 255. + del control_video_reader + else: + control_pixel_values = control_pixel_values + + if not self.enable_bucket: + control_pixel_values = self.video_transforms(control_pixel_values) + return pixel_values, control_pixel_values, text, "video" + else: + image_path, text = data_info['file_path'], data_info['text'] + if self.data_root is not None: + image_path = os.path.join(self.data_root, image_path) + image = Image.open(image_path).convert('RGB') + if not self.enable_bucket: + image = self.image_transforms(image).unsqueeze(0) + else: + image = np.expand_dims(np.array(image), 0) + + if random.random() < self.text_drop_ratio: + text = '' + + control_image_id = data_info['control_file_path'] + + if self.data_root is None: + control_image_id = control_image_id + else: + control_image_id = os.path.join(self.data_root, control_image_id) + + control_image = Image.open(control_image_id).convert('RGB') + if not self.enable_bucket: + control_image = self.image_transforms(control_image).unsqueeze(0) + else: + control_image = np.expand_dims(np.array(control_image), 0) + return image, control_image, text, 'image' + + def __len__(self): + return self.length + + def __getitem__(self, idx): + data_info = self.dataset[idx % len(self.dataset)] + data_type = data_info.get('type', 'image') + while True: + sample = {} + try: + data_info_local = self.dataset[idx % len(self.dataset)] + data_type_local = data_info_local.get('type', 'image') + if data_type_local != data_type: + raise ValueError("data_type_local != data_type") + + pixel_values, control_pixel_values, name, data_type = self.get_batch(idx) + sample["pixel_values"] = pixel_values + sample["control_pixel_values"] = control_pixel_values + sample["text"] = name + sample["data_type"] = data_type + sample["idx"] = idx + + if len(sample) > 0: + break + except Exception as e: + print(e, self.dataset[idx % len(self.dataset)]) + idx = random.randint(0, self.length-1) + + if self.enable_inpaint and not self.enable_bucket: + mask = get_random_mask(pixel_values.size()) + mask_pixel_values = pixel_values * (1 - mask) + torch.ones_like(pixel_values) * -1 * mask + sample["mask_pixel_values"] = mask_pixel_values + sample["mask"] = mask + + clip_pixel_values = 
sample["pixel_values"][0].permute(1, 2, 0).contiguous() + clip_pixel_values = (clip_pixel_values * 0.5 + 0.5) * 255 + sample["clip_pixel_values"] = clip_pixel_values + + ref_pixel_values = sample["pixel_values"][0].unsqueeze(0) + if (mask == 1).all(): + ref_pixel_values = torch.ones_like(ref_pixel_values) * -1 + sample["ref_pixel_values"] = ref_pixel_values + + return sample diff --git a/videox_fun/data/dataset_video.py b/videox_fun/data/dataset_video.py new file mode 100644 index 0000000000000000000000000000000000000000..c78367d0973ceb1abdcd005947612d16e2480831 --- /dev/null +++ b/videox_fun/data/dataset_video.py @@ -0,0 +1,262 @@ +import csv +import gc +import io +import json +import math +import os +import random +from contextlib import contextmanager +from threading import Thread + +import albumentations +import cv2 +import numpy as np +import torch +import torchvision.transforms as transforms +from decord import VideoReader +from einops import rearrange +from func_timeout import FunctionTimedOut, func_timeout +from PIL import Image +from torch.utils.data import BatchSampler, Sampler +from torch.utils.data.dataset import Dataset + +VIDEO_READER_TIMEOUT = 20 + +def get_random_mask(shape): + f, c, h, w = shape + + mask_index = np.random.randint(0, 4) + mask = torch.zeros((f, 1, h, w), dtype=torch.uint8) + if mask_index == 0: + mask[1:, :, :, :] = 1 + elif mask_index == 1: + mask_frame_index = 1 + mask[mask_frame_index:-mask_frame_index, :, :, :] = 1 + elif mask_index == 2: + center_x = torch.randint(0, w, (1,)).item() + center_y = torch.randint(0, h, (1,)).item() + block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item() # 方块的宽度范围 + block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item() # 方块的高度范围 + + start_x = max(center_x - block_size_x // 2, 0) + end_x = min(center_x + block_size_x // 2, w) + start_y = max(center_y - block_size_y // 2, 0) + end_y = min(center_y + block_size_y // 2, h) + mask[:, :, start_y:end_y, start_x:end_x] = 1 + elif mask_index == 3: + center_x = torch.randint(0, w, (1,)).item() + center_y = torch.randint(0, h, (1,)).item() + block_size_x = torch.randint(w // 4, w // 4 * 3, (1,)).item() # 方块的宽度范围 + block_size_y = torch.randint(h // 4, h // 4 * 3, (1,)).item() # 方块的高度范围 + + start_x = max(center_x - block_size_x // 2, 0) + end_x = min(center_x + block_size_x // 2, w) + start_y = max(center_y - block_size_y // 2, 0) + end_y = min(center_y + block_size_y // 2, h) + + mask_frame_before = np.random.randint(0, f // 2) + mask_frame_after = np.random.randint(f // 2, f) + mask[mask_frame_before:mask_frame_after, :, start_y:end_y, start_x:end_x] = 1 + else: + raise ValueError(f"The mask_index {mask_index} is not define") + return mask + + +@contextmanager +def VideoReader_contextmanager(*args, **kwargs): + vr = VideoReader(*args, **kwargs) + try: + yield vr + finally: + del vr + gc.collect() + + +def get_video_reader_batch(video_reader, batch_index): + frames = video_reader.get_batch(batch_index).asnumpy() + return frames + + +class WebVid10M(Dataset): + def __init__( + self, + csv_path, video_folder, + sample_size=256, sample_stride=4, sample_n_frames=16, + enable_bucket=False, enable_inpaint=False, is_image=False, + ): + print(f"loading annotations from {csv_path} ...") + with open(csv_path, 'r') as csvfile: + self.dataset = list(csv.DictReader(csvfile)) + self.length = len(self.dataset) + print(f"data scale: {self.length}") + + self.video_folder = video_folder + self.sample_stride = sample_stride + self.sample_n_frames = sample_n_frames + self.enable_bucket = 
enable_bucket + self.enable_inpaint = enable_inpaint + self.is_image = is_image + + sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size) + self.pixel_transforms = transforms.Compose([ + transforms.Resize(sample_size[0]), + transforms.CenterCrop(sample_size), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True), + ]) + + def get_batch(self, idx): + video_dict = self.dataset[idx] + videoid, name, page_dir = video_dict['videoid'], video_dict['name'], video_dict['page_dir'] + + video_dir = os.path.join(self.video_folder, f"{videoid}.mp4") + video_reader = VideoReader(video_dir) + video_length = len(video_reader) + + if not self.is_image: + clip_length = min(video_length, (self.sample_n_frames - 1) * self.sample_stride + 1) + start_idx = random.randint(0, video_length - clip_length) + batch_index = np.linspace(start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int) + else: + batch_index = [random.randint(0, video_length - 1)] + + if not self.enable_bucket: + pixel_values = torch.from_numpy(video_reader.get_batch(batch_index).asnumpy()).permute(0, 3, 1, 2).contiguous() + pixel_values = pixel_values / 255. + del video_reader + else: + pixel_values = video_reader.get_batch(batch_index).asnumpy() + + if self.is_image: + pixel_values = pixel_values[0] + return pixel_values, name + + def __len__(self): + return self.length + + def __getitem__(self, idx): + while True: + try: + pixel_values, name = self.get_batch(idx) + break + + except Exception as e: + print("Error info:", e) + idx = random.randint(0, self.length-1) + + if not self.enable_bucket: + pixel_values = self.pixel_transforms(pixel_values) + if self.enable_inpaint: + mask = get_random_mask(pixel_values.size()) + mask_pixel_values = pixel_values * (1 - mask) + torch.ones_like(pixel_values) * -1 * mask + sample = dict(pixel_values=pixel_values, mask_pixel_values=mask_pixel_values, mask=mask, text=name) + else: + sample = dict(pixel_values=pixel_values, text=name) + return sample + + +class VideoDataset(Dataset): + def __init__( + self, + json_path, video_folder=None, + sample_size=256, sample_stride=4, sample_n_frames=16, + enable_bucket=False, enable_inpaint=False + ): + print(f"loading annotations from {json_path} ...") + self.dataset = json.load(open(json_path, 'r')) + self.length = len(self.dataset) + print(f"data scale: {self.length}") + + self.video_folder = video_folder + self.sample_stride = sample_stride + self.sample_n_frames = sample_n_frames + self.enable_bucket = enable_bucket + self.enable_inpaint = enable_inpaint + + sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size) + self.pixel_transforms = transforms.Compose( + [ + transforms.Resize(sample_size[0]), + transforms.CenterCrop(sample_size), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True), + ] + ) + + def get_batch(self, idx): + video_dict = self.dataset[idx] + video_id, name = video_dict['file_path'], video_dict['text'] + + if self.video_folder is None: + video_dir = video_id + else: + video_dir = os.path.join(self.video_folder, video_id) + + with VideoReader_contextmanager(video_dir, num_threads=2) as video_reader: + video_length = len(video_reader) + + clip_length = min(video_length, (self.sample_n_frames - 1) * self.sample_stride + 1) + start_idx = random.randint(0, video_length - clip_length) + batch_index = np.linspace(start_idx, start_idx + clip_length - 1, self.sample_n_frames, dtype=int) + + try: + 
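+                # NOTE (explanatory comment, not part of the original patch): the decord read is
+                # guarded by func_timeout, so a stalled VideoReader.get_batch raises FunctionTimedOut
+                # after VIDEO_READER_TIMEOUT (20 s) rather than hanging the DataLoader worker;
+                # __getitem__ catches the error and retries with a random index.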
sample_args = (video_reader, batch_index) + pixel_values = func_timeout( + VIDEO_READER_TIMEOUT, get_video_reader_batch, args=sample_args + ) + except FunctionTimedOut: + raise ValueError(f"Read {idx} timeout.") + except Exception as e: + raise ValueError(f"Failed to extract frames from video. Error is {e}.") + + if not self.enable_bucket: + pixel_values = torch.from_numpy(pixel_values).permute(0, 3, 1, 2).contiguous() + pixel_values = pixel_values / 255. + del video_reader + else: + pixel_values = pixel_values + + return pixel_values, name + + def __len__(self): + return self.length + + def __getitem__(self, idx): + while True: + try: + pixel_values, name = self.get_batch(idx) + break + + except Exception as e: + print("Error info:", e) + idx = random.randint(0, self.length-1) + + if not self.enable_bucket: + pixel_values = self.pixel_transforms(pixel_values) + if self.enable_inpaint: + mask = get_random_mask(pixel_values.size()) + mask_pixel_values = pixel_values * (1 - mask) + torch.ones_like(pixel_values) * -1 * mask + sample = dict(pixel_values=pixel_values, mask_pixel_values=mask_pixel_values, mask=mask, text=name) + else: + sample = dict(pixel_values=pixel_values, text=name) + return sample + + +if __name__ == "__main__": + if 1: + dataset = VideoDataset( + json_path="/home/zhoumo.xjq/disk3/datasets/webvidval/results_2M_val.json", + sample_size=256, + sample_stride=4, sample_n_frames=16, + ) + + if 0: + dataset = WebVid10M( + csv_path="/mnt/petrelfs/guoyuwei/projects/datasets/webvid/results_2M_val.csv", + video_folder="/mnt/petrelfs/guoyuwei/projects/datasets/webvid/2M_val", + sample_size=256, + sample_stride=4, sample_n_frames=16, + is_image=False, + ) + + dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, num_workers=0,) + for idx, batch in enumerate(dataloader): + print(batch["pixel_values"].shape, len(batch["text"])) \ No newline at end of file diff --git a/videox_fun/dist/__init__.py b/videox_fun/dist/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a628f166b3b1b574a3f4b822dc9ae1c497c66f33 --- /dev/null +++ b/videox_fun/dist/__init__.py @@ -0,0 +1,40 @@ +import torch +import torch.distributed as dist + +try: + import xfuser + from xfuser.core.distributed import (get_sequence_parallel_rank, + get_sequence_parallel_world_size, + get_sp_group, get_world_group, + init_distributed_environment, + initialize_model_parallel) + from xfuser.core.long_ctx_attention import xFuserLongContextAttention +except Exception as ex: + get_sequence_parallel_world_size = None + get_sequence_parallel_rank = None + xFuserLongContextAttention = None + get_sp_group = None + get_world_group = None + init_distributed_environment = None + initialize_model_parallel = None + +def set_multi_gpus_devices(ulysses_degree, ring_degree): + if ulysses_degree > 1 or ring_degree > 1: + if get_sp_group is None: + raise RuntimeError("xfuser is not installed.") + dist.init_process_group("nccl") + print('parallel inference enabled: ulysses_degree=%d ring_degree=%d rank=%d world_size=%d' % ( + ulysses_degree, ring_degree, dist.get_rank(), + dist.get_world_size())) + assert dist.get_world_size() == ring_degree * ulysses_degree, \ + "number of GPUs(%d) should be equal to ring_degree * ulysses_degree." 
% dist.get_world_size() + init_distributed_environment(rank=dist.get_rank(), world_size=dist.get_world_size()) + initialize_model_parallel(sequence_parallel_degree=dist.get_world_size(), + ring_degree=ring_degree, + ulysses_degree=ulysses_degree) + # device = torch.device("cuda:%d" % dist.get_rank()) + device = torch.device(f"cuda:{get_world_group().local_rank}") + print('rank=%d device=%s' % (get_world_group().rank, str(device))) + else: + device = "cuda" + return device \ No newline at end of file diff --git a/videox_fun/dist/cogvideox_xfuser.py b/videox_fun/dist/cogvideox_xfuser.py new file mode 100644 index 0000000000000000000000000000000000000000..55b838d873f55b5f6ddfa25b4be6dd8b8d78f03f --- /dev/null +++ b/videox_fun/dist/cogvideox_xfuser.py @@ -0,0 +1,116 @@ +from typing import Optional + +import torch +import torch.nn.functional as F +from diffusers.models.attention import Attention +from diffusers.models.embeddings import apply_rotary_emb + +try: + import xfuser + from xfuser.core.distributed import (get_sequence_parallel_rank, + get_sequence_parallel_world_size, + get_sp_group, + init_distributed_environment, + initialize_model_parallel) + from xfuser.core.long_ctx_attention import xFuserLongContextAttention +except Exception as ex: + get_sequence_parallel_world_size = None + get_sequence_parallel_rank = None + xFuserLongContextAttention = None + get_sp_group = None + init_distributed_environment = None + initialize_model_parallel = None + +class CogVideoXMultiGPUsAttnProcessor2_0: + r""" + Processor for implementing scaled dot-product attention for the CogVideoX model. It applies a rotary embedding on + query and key vectors, but does not include spatial normalization. + """ + + def __init__(self): + if xFuserLongContextAttention is not None: + try: + self.hybrid_seq_parallel_attn = xFuserLongContextAttention() + except Exception: + self.hybrid_seq_parallel_attn = None + else: + self.hybrid_seq_parallel_attn = None + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + + def __call__( + self, + attn: Attention, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + image_rotary_emb: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + text_seq_length = encoder_hidden_states.size(1) + + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + query = attn.to_q(hidden_states) + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + if attn.norm_q is not None: + query = attn.norm_q(query) + if attn.norm_k is not None: + key = attn.norm_k(key) + + # Apply RoPE if needed + if image_rotary_emb is not None: + query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb) + if not attn.is_cross_attention: + key[:, 
:, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb) + + if self.hybrid_seq_parallel_attn is None: + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + hidden_states = hidden_states + else: + img_q = query[:, :, text_seq_length:].transpose(1, 2) + txt_q = query[:, :, :text_seq_length].transpose(1, 2) + img_k = key[:, :, text_seq_length:].transpose(1, 2) + txt_k = key[:, :, :text_seq_length].transpose(1, 2) + img_v = value[:, :, text_seq_length:].transpose(1, 2) + txt_v = value[:, :, :text_seq_length].transpose(1, 2) + + hidden_states = self.hybrid_seq_parallel_attn( + None, + img_q, img_k, img_v, dropout_p=0.0, causal=False, + joint_tensor_query=txt_q, + joint_tensor_key=txt_k, + joint_tensor_value=txt_v, + joint_strategy='front', + ).transpose(1, 2) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + encoder_hidden_states, hidden_states = hidden_states.split( + [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1 + ) + return hidden_states, encoder_hidden_states + diff --git a/videox_fun/dist/wan_xfuser.py b/videox_fun/dist/wan_xfuser.py new file mode 100644 index 0000000000000000000000000000000000000000..949983df585ed545e8e8f86f17e1f51c38bb805e --- /dev/null +++ b/videox_fun/dist/wan_xfuser.py @@ -0,0 +1,115 @@ +import torch +import torch.cuda.amp as amp + +try: + import xfuser + from xfuser.core.distributed import (get_sequence_parallel_rank, + get_sequence_parallel_world_size, + get_sp_group, + init_distributed_environment, + initialize_model_parallel) + from xfuser.core.long_ctx_attention import xFuserLongContextAttention +except Exception as ex: + get_sequence_parallel_world_size = None + get_sequence_parallel_rank = None + xFuserLongContextAttention = None + get_sp_group = None + init_distributed_environment = None + initialize_model_parallel = None + +def pad_freqs(original_tensor, target_len): + seq_len, s1, s2 = original_tensor.shape + pad_size = target_len - seq_len + padding_tensor = torch.ones( + pad_size, + s1, + s2, + dtype=original_tensor.dtype, + device=original_tensor.device) + padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0) + return padded_tensor + +@amp.autocast(enabled=False) +def rope_apply(x, grid_sizes, freqs): + """ + x: [B, L, N, C]. + grid_sizes: [B, 3]. + freqs: [M, C // 2]. 
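+        The last dimension (C // 2 frequency pairs) is split into temporal / height / width bands of
+        sizes (c - 2 * (c // 3), c // 3, c // 3); under sequence parallelism each rank applies only
+        its own slice of the (padded) frequency table to its local shard of `x`.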
+ """ + s, n, c = x.size(1), x.size(2), x.size(3) // 2 + # split freqs + freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1) + + # loop over samples + output = [] + for i, (f, h, w) in enumerate(grid_sizes.tolist()): + seq_len = f * h * w + + # precompute multipliers + x_i = torch.view_as_complex(x[i, :s].to(torch.float32).reshape( + s, n, -1, 2)) + freqs_i = torch.cat([ + freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1), + freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1), + freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1) + ], + dim=-1).reshape(seq_len, 1, -1) + + # apply rotary embedding + sp_size = get_sequence_parallel_world_size() + sp_rank = get_sequence_parallel_rank() + freqs_i = pad_freqs(freqs_i, s * sp_size) + s_per_rank = s + freqs_i_rank = freqs_i[(sp_rank * s_per_rank):((sp_rank + 1) * + s_per_rank), :, :] + x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2) + x_i = torch.cat([x_i, x[i, s:]]) + + # append to collection + output.append(x_i) + return torch.stack(output) + +def usp_attn_forward(self, + x, + seq_lens, + grid_sizes, + freqs, + dtype=torch.bfloat16): + b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim + half_dtypes = (torch.float16, torch.bfloat16) + + def half(x): + return x if x.dtype in half_dtypes else x.to(dtype) + + # query, key, value function + def qkv_fn(x): + q = self.norm_q(self.q(x)).view(b, s, n, d) + k = self.norm_k(self.k(x)).view(b, s, n, d) + v = self.v(x).view(b, s, n, d) + return q, k, v + + q, k, v = qkv_fn(x) + q = rope_apply(q, grid_sizes, freqs) + k = rope_apply(k, grid_sizes, freqs) + + # TODO: We should use unpaded q,k,v for attention. + # k_lens = seq_lens // get_sequence_parallel_world_size() + # if k_lens is not None: + # q = torch.cat([u[:l] for u, l in zip(q, k_lens)]).unsqueeze(0) + # k = torch.cat([u[:l] for u, l in zip(k, k_lens)]).unsqueeze(0) + # v = torch.cat([u[:l] for u, l in zip(v, k_lens)]).unsqueeze(0) + + x = xFuserLongContextAttention()( + None, + query=half(q), + key=half(k), + value=half(v), + window_size=self.window_size) + + # TODO: padding after attention. 
+ # x = torch.cat([x, x.new_zeros(b, s - x.size(1), n, d)], dim=1) + + # output + x = x.flatten(2) + x = self.o(x) + return x \ No newline at end of file diff --git a/videox_fun/models/__init__.py b/videox_fun/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..de0b6c177e63cb2967b2c8eeae7fe87d7822e6ce --- /dev/null +++ b/videox_fun/models/__init__.py @@ -0,0 +1,4 @@ +from transformers import AutoTokenizer, T5EncoderModel, T5Tokenizer + +from .cogvideox_transformer3d import CogVideoXTransformer3DModel +from .cogvideox_vae import AutoencoderKLCogVideoX diff --git a/videox_fun/models/cache_utils.py b/videox_fun/models/cache_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..871b10c45e39e868616597937634a4cd29db1f98 --- /dev/null +++ b/videox_fun/models/cache_utils.py @@ -0,0 +1,74 @@ +import numpy as np +import torch + + +def get_teacache_coefficients(model_name): + if "wan2.1-t2v-1.3b" in model_name.lower() or "wan2.1-fun-1.3b" in model_name.lower(): + return [-5.21862437e+04, 9.23041404e+03, -5.28275948e+02, 1.36987616e+01, -4.99875664e-02] + elif "wan2.1-t2v-14b" in model_name.lower(): + return [-3.03318725e+05, 4.90537029e+04, -2.65530556e+03, 5.87365115e+01, -3.15583525e-01] + elif "wan2.1-i2v-14b-480p" in model_name.lower(): + return [2.57151496e+05, -3.54229917e+04, 1.40286849e+03, -1.35890334e+01, 1.32517977e-01] + elif "wan2.1-i2v-14b-720p" in model_name.lower() or "wan2.1-fun-14b" in model_name.lower(): + return [8.10705460e+03, 2.13393892e+03, -3.72934672e+02, 1.66203073e+01, -4.17769401e-02] + else: + print(f"The model {model_name} is not supported by TeaCache.") + return None + + +class TeaCache(): + """ + Timestep Embedding Aware Cache, a training-free caching approach that estimates and leverages + the fluctuating differences among model outputs across timesteps, thereby accelerating the inference. + Please refer to: + 1. https://github.com/ali-vilab/TeaCache. + 2. Liu, Feng, et al. "Timestep Embedding Tells: It's Time to Cache for Video Diffusion Model." arXiv preprint arXiv:2411.19108 (2024). + """ + def __init__( + self, + coefficients: list[float], + num_steps: int, + rel_l1_thresh: float = 0.0, + num_skip_start_steps: int = 0, + offload: bool = True, + ): + if num_steps < 1: + raise ValueError(f"`num_steps` must be greater than 0 but is {num_steps}.") + if rel_l1_thresh < 0: + raise ValueError(f"`rel_l1_thresh` must be greater than or equal to 0 but is {rel_l1_thresh}.") + if num_skip_start_steps < 0 or num_skip_start_steps > num_steps: + raise ValueError( + "`num_skip_start_steps` must be great than or equal to 0 and " + f"less than or equal to `num_steps={num_steps}` but is {num_skip_start_steps}." + ) + self.coefficients = coefficients + self.num_steps = num_steps + self.rel_l1_thresh = rel_l1_thresh + self.num_skip_start_steps = num_skip_start_steps + self.offload = offload + self.rescale_func = np.poly1d(self.coefficients) + + self.cnt = 0 + self.should_calc = True + self.accumulated_rel_l1_distance = 0 + self.previous_modulated_input = None + # Some pipelines concatenate the unconditional and text guide in forward. + self.previous_residual = None + # Some pipelines perform forward propagation separately on the unconditional and text guide. 
+ self.previous_residual_cond = None + self.previous_residual_uncond = None + + @staticmethod + def compute_rel_l1_distance(prev: torch.Tensor, cur: torch.Tensor) -> torch.Tensor: + rel_l1_distance = (torch.abs(cur - prev).mean()) / torch.abs(prev).mean() + + return rel_l1_distance.cpu().item() + + def reset(self): + self.cnt = 0 + self.should_calc = True + self.accumulated_rel_l1_distance = 0 + self.previous_modulated_input = None + self.previous_residual = None + self.previous_residual_cond = None + self.previous_residual_uncond = None \ No newline at end of file diff --git a/videox_fun/models/cogvideox_transformer3d.py b/videox_fun/models/cogvideox_transformer3d.py new file mode 100644 index 0000000000000000000000000000000000000000..288d58d74d15d077fab43bc3b1e3548739d4fcc1 --- /dev/null +++ b/videox_fun/models/cogvideox_transformer3d.py @@ -0,0 +1,845 @@ +# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import json +import os +from typing import Any, Dict, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models.attention import Attention, FeedForward +from diffusers.models.attention_processor import ( + AttentionProcessor, CogVideoXAttnProcessor2_0, + FusedCogVideoXAttnProcessor2_0) +from diffusers.models.embeddings import (CogVideoXPatchEmbed, + TimestepEmbedding, Timesteps, + get_3d_sincos_pos_embed) +from diffusers.models.modeling_outputs import Transformer2DModelOutput +from diffusers.models.modeling_utils import ModelMixin +from diffusers.models.normalization import AdaLayerNorm, CogVideoXLayerNormZero +from diffusers.utils import is_torch_version, logging +from diffusers.utils.torch_utils import maybe_allow_in_graph +from torch import nn + +from ..dist import (get_sequence_parallel_rank, + get_sequence_parallel_world_size, + get_sp_group, + xFuserLongContextAttention) +from ..dist.cogvideox_xfuser import CogVideoXMultiGPUsAttnProcessor2_0 + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class CogVideoXPatchEmbed(nn.Module): + def __init__( + self, + patch_size: int = 2, + patch_size_t: Optional[int] = None, + in_channels: int = 16, + embed_dim: int = 1920, + text_embed_dim: int = 4096, + bias: bool = True, + sample_width: int = 90, + sample_height: int = 60, + sample_frames: int = 49, + temporal_compression_ratio: int = 4, + max_text_seq_length: int = 226, + spatial_interpolation_scale: float = 1.875, + temporal_interpolation_scale: float = 1.0, + use_positional_embeddings: bool = True, + use_learned_positional_embeddings: bool = True, + ) -> None: + super().__init__() + + post_patch_height = sample_height // patch_size + post_patch_width = sample_width // patch_size + post_time_compression_frames = (sample_frames - 1) // temporal_compression_ratio + 1 + self.num_patches = post_patch_height * post_patch_width * 
post_time_compression_frames + self.post_patch_height = post_patch_height + self.post_patch_width = post_patch_width + self.post_time_compression_frames = post_time_compression_frames + self.patch_size = patch_size + self.patch_size_t = patch_size_t + self.embed_dim = embed_dim + self.sample_height = sample_height + self.sample_width = sample_width + self.sample_frames = sample_frames + self.temporal_compression_ratio = temporal_compression_ratio + self.max_text_seq_length = max_text_seq_length + self.spatial_interpolation_scale = spatial_interpolation_scale + self.temporal_interpolation_scale = temporal_interpolation_scale + self.use_positional_embeddings = use_positional_embeddings + self.use_learned_positional_embeddings = use_learned_positional_embeddings + + if patch_size_t is None: + # CogVideoX 1.0 checkpoints + self.proj = nn.Conv2d( + in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias + ) + else: + # CogVideoX 1.5 checkpoints + self.proj = nn.Linear(in_channels * patch_size * patch_size * patch_size_t, embed_dim) + + self.text_proj = nn.Linear(text_embed_dim, embed_dim) + + if use_positional_embeddings or use_learned_positional_embeddings: + persistent = use_learned_positional_embeddings + pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames) + self.register_buffer("pos_embedding", pos_embedding, persistent=persistent) + + def _get_positional_embeddings(self, sample_height: int, sample_width: int, sample_frames: int) -> torch.Tensor: + post_patch_height = sample_height // self.patch_size + post_patch_width = sample_width // self.patch_size + post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1 + num_patches = post_patch_height * post_patch_width * post_time_compression_frames + + pos_embedding = get_3d_sincos_pos_embed( + self.embed_dim, + (post_patch_width, post_patch_height), + post_time_compression_frames, + self.spatial_interpolation_scale, + self.temporal_interpolation_scale, + ) + pos_embedding = torch.from_numpy(pos_embedding).flatten(0, 1) + joint_pos_embedding = torch.zeros( + 1, self.max_text_seq_length + num_patches, self.embed_dim, requires_grad=False + ) + joint_pos_embedding.data[:, self.max_text_seq_length :].copy_(pos_embedding) + + return joint_pos_embedding + + def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor): + r""" + Args: + text_embeds (`torch.Tensor`): + Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim). + image_embeds (`torch.Tensor`): + Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width). 
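+
+        Returns:
+            `torch.Tensor` of shape `(batch_size, text_seq_length + num_image_tokens, embed_dim)`:
+            the projected text tokens concatenated with the patchified video tokens, with positional
+            embeddings added when they are enabled.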
+ """ + text_embeds = self.text_proj(text_embeds) + + text_batch_size, text_seq_length, text_channels = text_embeds.shape + batch_size, num_frames, channels, height, width = image_embeds.shape + + if self.patch_size_t is None: + image_embeds = image_embeds.reshape(-1, channels, height, width) + image_embeds = self.proj(image_embeds) + image_embeds = image_embeds.view(batch_size, num_frames, *image_embeds.shape[1:]) + image_embeds = image_embeds.flatten(3).transpose(2, 3) # [batch, num_frames, height x width, channels] + image_embeds = image_embeds.flatten(1, 2) # [batch, num_frames x height x width, channels] + else: + p = self.patch_size + p_t = self.patch_size_t + + image_embeds = image_embeds.permute(0, 1, 3, 4, 2) + # b, f, h, w, c => b, f // 2, 2, h // 2, 2, w // 2, 2, c + image_embeds = image_embeds.reshape( + batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels + ) + # b, f // 2, 2, h // 2, 2, w // 2, 2, c => b, f // 2, h // 2, w // 2, c, 2, 2, 2 + image_embeds = image_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3) + image_embeds = self.proj(image_embeds) + + embeds = torch.cat( + [text_embeds, image_embeds], dim=1 + ).contiguous() # [batch, seq_length + num_frames x height x width, channels] + + if self.use_positional_embeddings or self.use_learned_positional_embeddings: + seq_length = height * width * num_frames // (self.patch_size**2) + # pos_embeds = self.pos_embedding[:, : text_seq_length + seq_length] + pos_embeds = self.pos_embedding + emb_size = embeds.size()[-1] + pos_embeds_without_text = pos_embeds[:, text_seq_length: ].view(1, self.post_time_compression_frames, self.post_patch_height, self.post_patch_width, emb_size) + pos_embeds_without_text = pos_embeds_without_text.permute([0, 4, 1, 2, 3]) + pos_embeds_without_text = F.interpolate(pos_embeds_without_text,size=[self.post_time_compression_frames, height // self.patch_size, width // self.patch_size], mode='trilinear', align_corners=False) + pos_embeds_without_text = pos_embeds_without_text.permute([0, 2, 3, 4, 1]).view(1, -1, emb_size) + pos_embeds = torch.cat([pos_embeds[:, :text_seq_length], pos_embeds_without_text], dim = 1) + pos_embeds = pos_embeds[:, : text_seq_length + seq_length] + embeds = embeds + pos_embeds + + return embeds + +@maybe_allow_in_graph +class CogVideoXBlock(nn.Module): + r""" + Transformer block used in [CogVideoX](https://github.com/THUDM/CogVideo) model. + + Parameters: + dim (`int`): + The number of channels in the input and output. + num_attention_heads (`int`): + The number of heads to use for multi-head attention. + attention_head_dim (`int`): + The number of channels in each head. + time_embed_dim (`int`): + The number of channels in timestep embedding. + dropout (`float`, defaults to `0.0`): + The dropout probability to use. + activation_fn (`str`, defaults to `"gelu-approximate"`): + Activation function to be used in feed-forward. + attention_bias (`bool`, defaults to `False`): + Whether or not to use bias in attention projection layers. + qk_norm (`bool`, defaults to `True`): + Whether or not to use normalization after query and key projections in Attention. + norm_elementwise_affine (`bool`, defaults to `True`): + Whether to use learnable elementwise affine parameters for normalization. + norm_eps (`float`, defaults to `1e-5`): + Epsilon value for normalization layers. + final_dropout (`bool` defaults to `False`): + Whether to apply a final dropout after the last feed-forward layer. 
+ ff_inner_dim (`int`, *optional*, defaults to `None`): + Custom hidden dimension of Feed-forward layer. If not provided, `4 * dim` is used. + ff_bias (`bool`, defaults to `True`): + Whether or not to use bias in Feed-forward layer. + attention_out_bias (`bool`, defaults to `True`): + Whether or not to use bias in Attention output projection layer. + """ + + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + time_embed_dim: int, + dropout: float = 0.0, + activation_fn: str = "gelu-approximate", + attention_bias: bool = False, + qk_norm: bool = True, + norm_elementwise_affine: bool = True, + norm_eps: float = 1e-5, + final_dropout: bool = True, + ff_inner_dim: Optional[int] = None, + ff_bias: bool = True, + attention_out_bias: bool = True, + ): + super().__init__() + + # 1. Self Attention + self.norm1 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True) + + self.attn1 = Attention( + query_dim=dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + qk_norm="layer_norm" if qk_norm else None, + eps=1e-6, + bias=attention_bias, + out_bias=attention_out_bias, + processor=CogVideoXAttnProcessor2_0(), + ) + + # 2. Feed Forward + self.norm2 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True) + + self.ff = FeedForward( + dim, + dropout=dropout, + activation_fn=activation_fn, + final_dropout=final_dropout, + inner_dim=ff_inner_dim, + bias=ff_bias, + ) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + temb: torch.Tensor, + image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> torch.Tensor: + text_seq_length = encoder_hidden_states.size(1) + + # norm & modulate + norm_hidden_states, norm_encoder_hidden_states, gate_msa, enc_gate_msa = self.norm1( + hidden_states, encoder_hidden_states, temb + ) + + # attention + attn_hidden_states, attn_encoder_hidden_states = self.attn1( + hidden_states=norm_hidden_states, + encoder_hidden_states=norm_encoder_hidden_states, + image_rotary_emb=image_rotary_emb, + ) + + hidden_states = hidden_states + gate_msa * attn_hidden_states + encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states + + # norm & modulate + norm_hidden_states, norm_encoder_hidden_states, gate_ff, enc_gate_ff = self.norm2( + hidden_states, encoder_hidden_states, temb + ) + + # feed-forward + norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1) + ff_output = self.ff(norm_hidden_states) + + hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:] + encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length] + + return hidden_states, encoder_hidden_states + + +class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin): + """ + A Transformer model for video-like data in [CogVideoX](https://github.com/THUDM/CogVideo). + + Parameters: + num_attention_heads (`int`, defaults to `30`): + The number of heads to use for multi-head attention. + attention_head_dim (`int`, defaults to `64`): + The number of channels in each head. + in_channels (`int`, defaults to `16`): + The number of channels in the input. + out_channels (`int`, *optional*, defaults to `16`): + The number of channels in the output. + flip_sin_to_cos (`bool`, defaults to `True`): + Whether to flip the sin to cos in the time embedding. + time_embed_dim (`int`, defaults to `512`): + Output dimension of timestep embeddings. 
+ text_embed_dim (`int`, defaults to `4096`): + Input dimension of text embeddings from the text encoder. + num_layers (`int`, defaults to `30`): + The number of layers of Transformer blocks to use. + dropout (`float`, defaults to `0.0`): + The dropout probability to use. + attention_bias (`bool`, defaults to `True`): + Whether or not to use bias in the attention projection layers. + sample_width (`int`, defaults to `90`): + The width of the input latents. + sample_height (`int`, defaults to `60`): + The height of the input latents. + sample_frames (`int`, defaults to `49`): + The number of frames in the input latents. Note that this parameter was incorrectly initialized to 49 + instead of 13 because CogVideoX processed 13 latent frames at once in its default and recommended settings, + but cannot be changed to the correct value to ensure backwards compatibility. To create a transformer with + K latent frames, the correct value to pass here would be: ((K - 1) * temporal_compression_ratio + 1). + patch_size (`int`, defaults to `2`): + The size of the patches to use in the patch embedding layer. + temporal_compression_ratio (`int`, defaults to `4`): + The compression ratio across the temporal dimension. See documentation for `sample_frames`. + max_text_seq_length (`int`, defaults to `226`): + The maximum sequence length of the input text embeddings. + activation_fn (`str`, defaults to `"gelu-approximate"`): + Activation function to use in feed-forward. + timestep_activation_fn (`str`, defaults to `"silu"`): + Activation function to use when generating the timestep embeddings. + norm_elementwise_affine (`bool`, defaults to `True`): + Whether or not to use elementwise affine in normalization layers. + norm_eps (`float`, defaults to `1e-5`): + The epsilon value to use in normalization layers. + spatial_interpolation_scale (`float`, defaults to `1.875`): + Scaling factor to apply in 3D positional embeddings across spatial dimensions. + temporal_interpolation_scale (`float`, defaults to `1.0`): + Scaling factor to apply in 3D positional embeddings across temporal dimensions. + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + num_attention_heads: int = 30, + attention_head_dim: int = 64, + in_channels: int = 16, + out_channels: Optional[int] = 16, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + time_embed_dim: int = 512, + text_embed_dim: int = 4096, + num_layers: int = 30, + dropout: float = 0.0, + attention_bias: bool = True, + sample_width: int = 90, + sample_height: int = 60, + sample_frames: int = 49, + patch_size: int = 2, + patch_size_t: Optional[int] = None, + temporal_compression_ratio: int = 4, + max_text_seq_length: int = 226, + activation_fn: str = "gelu-approximate", + timestep_activation_fn: str = "silu", + norm_elementwise_affine: bool = True, + norm_eps: float = 1e-5, + spatial_interpolation_scale: float = 1.875, + temporal_interpolation_scale: float = 1.0, + use_rotary_positional_embeddings: bool = False, + use_learned_positional_embeddings: bool = False, + patch_bias: bool = True, + add_noise_in_inpaint_model: bool = False, + ): + super().__init__() + inner_dim = num_attention_heads * attention_head_dim + self.patch_size_t = patch_size_t + if not use_rotary_positional_embeddings and use_learned_positional_embeddings: + raise ValueError( + "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional " + "embeddings. 
If you're using a custom model and/or believe this should be supported, please open an " + "issue at https://github.com/huggingface/diffusers/issues." + ) + + # 1. Patch embedding + self.patch_embed = CogVideoXPatchEmbed( + patch_size=patch_size, + patch_size_t=patch_size_t, + in_channels=in_channels, + embed_dim=inner_dim, + text_embed_dim=text_embed_dim, + bias=patch_bias, + sample_width=sample_width, + sample_height=sample_height, + sample_frames=sample_frames, + temporal_compression_ratio=temporal_compression_ratio, + max_text_seq_length=max_text_seq_length, + spatial_interpolation_scale=spatial_interpolation_scale, + temporal_interpolation_scale=temporal_interpolation_scale, + use_positional_embeddings=not use_rotary_positional_embeddings, + use_learned_positional_embeddings=use_learned_positional_embeddings, + ) + self.embedding_dropout = nn.Dropout(dropout) + + # 2. Time embeddings + self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift) + self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn) + + # 3. Define spatio-temporal transformers blocks + self.transformer_blocks = nn.ModuleList( + [ + CogVideoXBlock( + dim=inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + time_embed_dim=time_embed_dim, + dropout=dropout, + activation_fn=activation_fn, + attention_bias=attention_bias, + norm_elementwise_affine=norm_elementwise_affine, + norm_eps=norm_eps, + ) + for _ in range(num_layers) + ] + ) + self.norm_final = nn.LayerNorm(inner_dim, norm_eps, norm_elementwise_affine) + + # 4. Output blocks + self.norm_out = AdaLayerNorm( + embedding_dim=time_embed_dim, + output_dim=2 * inner_dim, + norm_elementwise_affine=norm_elementwise_affine, + norm_eps=norm_eps, + chunk_dim=1, + ) + + if patch_size_t is None: + # For CogVideox 1.0 + output_dim = patch_size * patch_size * out_channels + else: + # For CogVideoX 1.5 + output_dim = patch_size * patch_size * patch_size_t * out_channels + + self.proj_out = nn.Linear(inner_dim, output_dim) + + self.gradient_checkpointing = False + self.sp_world_size = 1 + self.sp_world_rank = 0 + + def _set_gradient_checkpointing(self, module, value=False): + self.gradient_checkpointing = value + + def enable_multi_gpus_inference(self,): + self.sp_world_size = get_sequence_parallel_world_size() + self.sp_world_rank = get_sequence_parallel_rank() + self.set_attn_processor(CogVideoXMultiGPUsAttnProcessor2_0()) + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. 
+ """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor() + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedCogVideoXAttnProcessor2_0 + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + self.set_attn_processor(FusedCogVideoXAttnProcessor2_0()) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. 
+ + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + timestep: Union[int, float, torch.LongTensor], + timestep_cond: Optional[torch.Tensor] = None, + inpaint_latents: Optional[torch.Tensor] = None, + control_latents: Optional[torch.Tensor] = None, + image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + return_dict: bool = True, + ): + batch_size, num_frames, channels, height, width = hidden_states.shape + if num_frames == 1 and self.patch_size_t is not None: + hidden_states = torch.cat([hidden_states, torch.zeros_like(hidden_states)], dim=1) + if inpaint_latents is not None: + inpaint_latents = torch.concat([inpaint_latents, torch.zeros_like(inpaint_latents)], dim=1) + if control_latents is not None: + control_latents = torch.concat([control_latents, torch.zeros_like(control_latents)], dim=1) + local_num_frames = num_frames + 1 + else: + local_num_frames = num_frames + + # 1. Time embedding + timesteps = timestep + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=hidden_states.dtype) + emb = self.time_embedding(t_emb, timestep_cond) + + # 2. Patch embedding + if inpaint_latents is not None: + hidden_states = torch.concat([hidden_states, inpaint_latents], 2) + if control_latents is not None: + hidden_states = torch.concat([hidden_states, control_latents], 2) + hidden_states = self.patch_embed(encoder_hidden_states, hidden_states) + hidden_states = self.embedding_dropout(hidden_states) + + text_seq_length = encoder_hidden_states.shape[1] + encoder_hidden_states = hidden_states[:, :text_seq_length] + hidden_states = hidden_states[:, text_seq_length:] + + # Context Parallel + if self.sp_world_size > 1: + hidden_states = torch.chunk(hidden_states, self.sp_world_size, dim=1)[self.sp_world_rank] + if image_rotary_emb is not None: + image_rotary_emb = ( + torch.chunk(image_rotary_emb[0], self.sp_world_size, dim=0)[self.sp_world_rank], + torch.chunk(image_rotary_emb[1], self.sp_world_size, dim=0)[self.sp_world_rank] + ) + + # 3. Transformer blocks + for i, block in enumerate(self.transformer_blocks): + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + encoder_hidden_states, + emb, + image_rotary_emb, + **ckpt_kwargs, + ) + else: + hidden_states, encoder_hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + temb=emb, + image_rotary_emb=image_rotary_emb, + ) + + if not self.config.use_rotary_positional_embeddings: + # CogVideoX-2B + hidden_states = self.norm_final(hidden_states) + else: + # CogVideoX-5B + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) + hidden_states = self.norm_final(hidden_states) + hidden_states = hidden_states[:, text_seq_length:] + + # 4. 
Final block + hidden_states = self.norm_out(hidden_states, temb=emb) + hidden_states = self.proj_out(hidden_states) + + if self.sp_world_size > 1: + hidden_states = get_sp_group().all_gather(hidden_states, dim=1) + + # 5. Unpatchify + p = self.config.patch_size + p_t = self.config.patch_size_t + + if p_t is None: + output = hidden_states.reshape(batch_size, local_num_frames, height // p, width // p, -1, p, p) + output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4) + else: + output = hidden_states.reshape( + batch_size, (local_num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p + ) + output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2) + + if num_frames == 1: + output = output[:, :num_frames, :] + + if not return_dict: + return (output,) + return Transformer2DModelOutput(sample=output) + + @classmethod + def from_pretrained( + cls, pretrained_model_path, subfolder=None, transformer_additional_kwargs={}, + low_cpu_mem_usage=False, torch_dtype=torch.bfloat16, use_vae_mask=False, stack_mask=False, + ): + if subfolder is not None: + pretrained_model_path = os.path.join(pretrained_model_path, subfolder) + print(f"loaded 3D transformer's pretrained weights from {pretrained_model_path} ...") + + config_file = os.path.join(pretrained_model_path, 'config.json') + if not os.path.isfile(config_file): + raise RuntimeError(f"{config_file} does not exist") + with open(config_file, "r") as f: + config = json.load(f) + + if use_vae_mask: + print('[DEBUG] use vae to encode mask') + config['in_channels'] = 48 + elif stack_mask: + print('[DEBUG] use stacking mask') + config['in_channels'] = 36 + + from diffusers.utils import WEIGHTS_NAME + model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME) + model_file_safetensors = model_file.replace(".bin", ".safetensors") + + if "dict_mapping" in transformer_additional_kwargs.keys(): + for key in transformer_additional_kwargs["dict_mapping"]: + transformer_additional_kwargs[transformer_additional_kwargs["dict_mapping"][key]] = config[key] + + if low_cpu_mem_usage: + try: + import re + + from diffusers.models.modeling_utils import \ + load_model_dict_into_meta + from diffusers.utils import is_accelerate_available + if is_accelerate_available(): + import accelerate + + # Instantiate model with empty weights + with accelerate.init_empty_weights(): + model = cls.from_config(config, **transformer_additional_kwargs) + + param_device = "cpu" + if os.path.exists(model_file): + state_dict = torch.load(model_file, map_location="cpu") + elif os.path.exists(model_file_safetensors): + from safetensors.torch import load_file, safe_open + state_dict = load_file(model_file_safetensors) + else: + from safetensors.torch import load_file, safe_open + model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors")) + state_dict = {} + for _model_file_safetensors in model_files_safetensors: + _state_dict = load_file(_model_file_safetensors) + for key in _state_dict: + state_dict[key] = _state_dict[key] + model._convert_deprecated_attention_blocks(state_dict) + # move the params from meta device to cpu + missing_keys = set(model.state_dict().keys()) - set(state_dict.keys()) + if len(missing_keys) > 0: + raise ValueError( + f"Cannot load {cls} from {pretrained_model_path} because the following keys are" + f" missing: \n {', '.join(missing_keys)}. 
\n Please make sure to pass" + " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize" + " those weights or else make sure your checkpoint file is correct." + ) + + unexpected_keys = load_model_dict_into_meta( + model, + state_dict, + device=param_device, + dtype=torch_dtype, + model_name_or_path=pretrained_model_path, + ) + + if cls._keys_to_ignore_on_load_unexpected is not None: + for pat in cls._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if len(unexpected_keys) > 0: + print( + f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}" + ) + return model + except Exception as e: + print( + f"The low_cpu_mem_usage mode is not work because {e}. Use low_cpu_mem_usage=False instead." + ) + + model = cls.from_config(config, **transformer_additional_kwargs) + if os.path.exists(model_file): + state_dict = torch.load(model_file, map_location="cpu") + elif os.path.exists(model_file_safetensors): + from safetensors.torch import load_file, safe_open + state_dict = load_file(model_file_safetensors) + else: + from safetensors.torch import load_file, safe_open + model_files_safetensors = glob.glob(os.path.join(pretrained_model_path, "*.safetensors")) + state_dict = {} + for _model_file_safetensors in model_files_safetensors: + _state_dict = load_file(_model_file_safetensors) + for key in _state_dict: + state_dict[key] = _state_dict[key] + + if model.state_dict()['patch_embed.proj.weight'].size() != state_dict['patch_embed.proj.weight'].size(): + new_shape = model.state_dict()['patch_embed.proj.weight'].size() + if len(new_shape) == 5: + state_dict['patch_embed.proj.weight'] = state_dict['patch_embed.proj.weight'].unsqueeze(2).expand(new_shape).clone() + state_dict['patch_embed.proj.weight'][:, :, :-1] = 0 + elif len(new_shape) == 2: + if model.state_dict()['patch_embed.proj.weight'].size()[1] > state_dict['patch_embed.proj.weight'].size()[1]: + if use_vae_mask: + print('[DEBUG] patch_embed.proj.weight size does not match due to vae-encoded mask') + latent_ch = 16 + feat_scale = 8 + feat_dim = int(latent_ch * feat_scale) + old_total_dim = state_dict['patch_embed.proj.weight'].size(1) + new_total_dim = model.state_dict()['patch_embed.proj.weight'].size(1) + model.state_dict()['patch_embed.proj.weight'][:, :feat_dim] = state_dict['patch_embed.proj.weight'][:, :feat_dim] + model.state_dict()['patch_embed.proj.weight'][:, -feat_dim:] = state_dict['patch_embed.proj.weight'][:, -feat_dim:] + for i in range(feat_dim, new_total_dim - feat_dim, feat_scale): + model.state_dict()['patch_embed.proj.weight'][:, i:i+feat_scale] = state_dict['patch_embed.proj.weight'][:, feat_dim:-feat_dim] + state_dict['patch_embed.proj.weight'] = model.state_dict()['patch_embed.proj.weight'] + else: + model.state_dict()['patch_embed.proj.weight'][:, :state_dict['patch_embed.proj.weight'].size()[1]] = state_dict['patch_embed.proj.weight'] + model.state_dict()['patch_embed.proj.weight'][:, state_dict['patch_embed.proj.weight'].size()[1]:] = 0 + state_dict['patch_embed.proj.weight'] = model.state_dict()['patch_embed.proj.weight'] + else: + model.state_dict()['patch_embed.proj.weight'][:, :] = state_dict['patch_embed.proj.weight'][:, :model.state_dict()['patch_embed.proj.weight'].size()[1]] + state_dict['patch_embed.proj.weight'] = model.state_dict()['patch_embed.proj.weight'] + else: + if model.state_dict()['patch_embed.proj.weight'].size()[1] > 
state_dict['patch_embed.proj.weight'].size()[1]: + model.state_dict()['patch_embed.proj.weight'][:, :state_dict['patch_embed.proj.weight'].size()[1], :, :] = state_dict['patch_embed.proj.weight'] + model.state_dict()['patch_embed.proj.weight'][:, state_dict['patch_embed.proj.weight'].size()[1]:, :, :] = 0 + state_dict['patch_embed.proj.weight'] = model.state_dict()['patch_embed.proj.weight'] + else: + model.state_dict()['patch_embed.proj.weight'][:, :, :, :] = state_dict['patch_embed.proj.weight'][:, :model.state_dict()['patch_embed.proj.weight'].size()[1], :, :] + state_dict['patch_embed.proj.weight'] = model.state_dict()['patch_embed.proj.weight'] + + tmp_state_dict = {} + for key in state_dict: + if key in model.state_dict().keys() and model.state_dict()[key].size() == state_dict[key].size(): + tmp_state_dict[key] = state_dict[key] + else: + print(key, "Size don't match, skip") + + state_dict = tmp_state_dict + + m, u = model.load_state_dict(state_dict, strict=False) + print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};") + print(m) + + params = [p.numel() if "." in n else 0 for n, p in model.named_parameters()] + print(f"### All Parameters: {sum(params) / 1e6} M") + + params = [p.numel() if "attn1." in n else 0 for n, p in model.named_parameters()] + print(f"### attn1 Parameters: {sum(params) / 1e6} M") + + model = model.to(torch_dtype) + return model \ No newline at end of file diff --git a/videox_fun/models/cogvideox_vae.py b/videox_fun/models/cogvideox_vae.py new file mode 100644 index 0000000000000000000000000000000000000000..56e0a3ea121e1d53d4fe6557c2dd4b0786634e0e --- /dev/null +++ b/videox_fun/models/cogvideox_vae.py @@ -0,0 +1,1675 @@ +# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import json +import os + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.loaders.single_file_model import FromOriginalModelMixin +from diffusers.utils import logging +from diffusers.utils.accelerate_utils import apply_forward_hook +from diffusers.models.activations import get_activation +from diffusers.models.downsampling import CogVideoXDownsample3D +from diffusers.models.modeling_outputs import AutoencoderKLOutput +from diffusers.models.modeling_utils import ModelMixin +from diffusers.models.upsampling import CogVideoXUpsample3D +from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class CogVideoXSafeConv3d(nn.Conv3d): + r""" + A 3D convolution layer that splits the input tensor into smaller parts to avoid OOM in CogVideoX Model. 
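+
+    Sizing note (an illustrative estimate, not additional behaviour): the forward pass approximates the input
+    footprint as `B * C * T * H * W * 2 / 1024**3` GiB (i.e. it assumes a 2-byte dtype such as fp16/bf16).
+    Whenever that estimate exceeds 2 GiB, the input is split into `int(memory_count / 2) + 1` chunks along the
+    frame dimension, with `kernel_size - 1` frames of overlap between consecutive chunks so the chunked
+    convolution matches the unchunked result. For example, a `(1, 128, 49, 480, 720)` fp16 tensor is roughly
+    4.04 GiB and would be processed as 3 overlapping temporal chunks.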
+ """ + + def forward(self, input: torch.Tensor) -> torch.Tensor: + memory_count = ( + (input.shape[0] * input.shape[1] * input.shape[2] * input.shape[3] * input.shape[4]) * 2 / 1024**3 + ) + + # Set to 2GB, suitable for CuDNN + if memory_count > 2: + kernel_size = self.kernel_size[0] + part_num = int(memory_count / 2) + 1 + input_chunks = torch.chunk(input, part_num, dim=2) + + if kernel_size > 1: + input_chunks = [input_chunks[0]] + [ + torch.cat((input_chunks[i - 1][:, :, -kernel_size + 1 :], input_chunks[i]), dim=2) + for i in range(1, len(input_chunks)) + ] + + output_chunks = [] + for input_chunk in input_chunks: + output_chunks.append(super().forward(input_chunk)) + output = torch.cat(output_chunks, dim=2) + return output + else: + return super().forward(input) + + +class CogVideoXCausalConv3d(nn.Module): + r"""A 3D causal convolution layer that pads the input tensor to ensure causality in CogVideoX Model. + + Args: + in_channels (`int`): Number of channels in the input tensor. + out_channels (`int`): Number of output channels produced by the convolution. + kernel_size (`int` or `Tuple[int, int, int]`): Kernel size of the convolutional kernel. + stride (`int`, defaults to `1`): Stride of the convolution. + dilation (`int`, defaults to `1`): Dilation rate of the convolution. + pad_mode (`str`, defaults to `"constant"`): Padding mode. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int, int]], + stride: int = 1, + dilation: int = 1, + pad_mode: str = "constant", + ): + super().__init__() + + if isinstance(kernel_size, int): + kernel_size = (kernel_size,) * 3 + + time_kernel_size, height_kernel_size, width_kernel_size = kernel_size + + # TODO(aryan): configure calculation based on stride and dilation in the future. 
+ # Since CogVideoX does not use it, it is currently tailored to "just work" with Mochi + time_pad = time_kernel_size - 1 + height_pad = (height_kernel_size - 1) // 2 + width_pad = (width_kernel_size - 1) // 2 + + self.pad_mode = pad_mode + self.height_pad = height_pad + self.width_pad = width_pad + self.time_pad = time_pad + self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0) + + self.temporal_dim = 2 + self.time_kernel_size = time_kernel_size + + stride = stride if isinstance(stride, tuple) else (stride, 1, 1) + dilation = (dilation, 1, 1) + self.conv = CogVideoXSafeConv3d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + ) + + def fake_context_parallel_forward( + self, inputs: torch.Tensor, conv_cache: Optional[torch.Tensor] = None + ) -> torch.Tensor: + if self.pad_mode == "replicate": + inputs = F.pad(inputs, self.time_causal_padding, mode="replicate") + else: + kernel_size = self.time_kernel_size + if kernel_size > 1: + cached_inputs = [conv_cache] if conv_cache is not None else [inputs[:, :, :1]] * (kernel_size - 1) + inputs = torch.cat(cached_inputs + [inputs], dim=2) + return inputs + + def forward(self, inputs: torch.Tensor, conv_cache: Optional[torch.Tensor] = None) -> torch.Tensor: + inputs = self.fake_context_parallel_forward(inputs, conv_cache) + + if self.pad_mode == "replicate": + conv_cache = None + else: + padding_2d = (self.width_pad, self.width_pad, self.height_pad, self.height_pad) + conv_cache = inputs[:, :, -self.time_kernel_size + 1 :].clone() + inputs = F.pad(inputs, padding_2d, mode="constant", value=0) + + output = self.conv(inputs) + return output, conv_cache + + +class CogVideoXSpatialNorm3D(nn.Module): + r""" + Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002. This implementation is specific + to 3D-video like data. + + CogVideoXSafeConv3d is used instead of nn.Conv3d to avoid OOM in CogVideoX Model. + + Args: + f_channels (`int`): + The number of channels for input to group normalization layer, and output of the spatial norm layer. + zq_channels (`int`): + The number of channels for the quantized vector as described in the paper. + groups (`int`): + Number of groups to separate the channels into for group normalization. 
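+
+    In short (a summary of the forward pass below, not extra behaviour): `zq` is interpolated to the
+    spatio-temporal shape of `f` (the first frame is resized separately when the frame count is odd, matching
+    the causal layout) and the output is `GroupNorm(f) * conv_y(zq) + conv_b(zq)`.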
+    """
+
+    def __init__(
+        self,
+        f_channels: int,
+        zq_channels: int,
+        groups: int = 32,
+    ):
+        super().__init__()
+        self.norm_layer = nn.GroupNorm(num_channels=f_channels, num_groups=groups, eps=1e-6, affine=True)
+        self.conv_y = CogVideoXCausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
+        self.conv_b = CogVideoXCausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
+
+    def forward(
+        self, f: torch.Tensor, zq: torch.Tensor, conv_cache: Optional[Dict[str, torch.Tensor]] = None
+    ) -> torch.Tensor:
+        new_conv_cache = {}
+        conv_cache = conv_cache or {}
+
+        if f.shape[2] > 1 and f.shape[2] % 2 == 1:
+            f_first, f_rest = f[:, :, :1], f[:, :, 1:]
+            f_first_size, f_rest_size = f_first.shape[-3:], f_rest.shape[-3:]
+            z_first, z_rest = zq[:, :, :1], zq[:, :, 1:]
+            z_first = F.interpolate(z_first, size=f_first_size)
+            z_rest = F.interpolate(z_rest, size=f_rest_size)
+            zq = torch.cat([z_first, z_rest], dim=2)
+        else:
+            zq = F.interpolate(zq, size=f.shape[-3:])
+
+        conv_y, new_conv_cache["conv_y"] = self.conv_y(zq, conv_cache=conv_cache.get("conv_y"))
+        conv_b, new_conv_cache["conv_b"] = self.conv_b(zq, conv_cache=conv_cache.get("conv_b"))
+
+        norm_f = self.norm_layer(f)
+        new_f = norm_f * conv_y + conv_b
+        return new_f, new_conv_cache
+
+
+class CogVideoXUpsample3D(nn.Module):
+    r"""
+    A 3D upsample layer used in CogVideoX by Tsinghua University & ZhipuAI.  # TODO: wait for paper release.
+
+    Args:
+        in_channels (`int`):
+            Number of channels in the input image.
+        out_channels (`int`):
+            Number of channels produced by the convolution.
+        kernel_size (`int`, defaults to `3`):
+            Size of the convolving kernel.
+        stride (`int`, defaults to `1`):
+            Stride of the convolution.
+        padding (`int`, defaults to `1`):
+            Padding added to all four sides of the input.
+        compress_time (`bool`, defaults to `False`):
+            Whether or not to compress the time dimension.
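+
+    Shape sketch (illustrative, assuming the default `kernel_size=3`, `stride=1`, `padding=1`): with
+    `compress_time=True`, an odd-length input `(B, C_in, 3, H, W)` keeps its first frame spatial-only and
+    becomes `(B, C_out, 5, 2H, 2W)` (i.e. `T -> 2T - 1`), an even-length input becomes `(B, C_out, 2T, 2H, 2W)`,
+    and a single-frame input only doubles its spatial dimensions. With `compress_time=False`, only `H` and `W`
+    are doubled.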
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 3, + stride: int = 1, + padding: int = 1, + compress_time: bool = False, + ) -> None: + super().__init__() + + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding) + self.compress_time = compress_time + + self.auto_split_process = True + self.first_frame_flag = False + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + if self.compress_time: + if self.auto_split_process: + if inputs.shape[2] > 1 and inputs.shape[2] % 2 == 1: + # split first frame + x_first, x_rest = inputs[:, :, 0], inputs[:, :, 1:] + + x_first = F.interpolate(x_first, scale_factor=2.0) + x_rest = F.interpolate(x_rest, scale_factor=2.0) + x_first = x_first[:, :, None, :, :] + inputs = torch.cat([x_first, x_rest], dim=2) + elif inputs.shape[2] > 1: + inputs = F.interpolate(inputs, scale_factor=2.0) + else: + inputs = inputs.squeeze(2) + inputs = F.interpolate(inputs, scale_factor=2.0) + inputs = inputs[:, :, None, :, :] + else: + if self.first_frame_flag: + inputs = inputs.squeeze(2) + inputs = F.interpolate(inputs, scale_factor=2.0) + inputs = inputs[:, :, None, :, :] + else: + inputs = F.interpolate(inputs, scale_factor=2.0) + else: + # only interpolate 2D + b, c, t, h, w = inputs.shape + inputs = inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w) + inputs = F.interpolate(inputs, scale_factor=2.0) + inputs = inputs.reshape(b, t, c, *inputs.shape[2:]).permute(0, 2, 1, 3, 4) + + b, c, t, h, w = inputs.shape + inputs = inputs.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w) + inputs = self.conv(inputs) + inputs = inputs.reshape(b, t, *inputs.shape[1:]).permute(0, 2, 1, 3, 4) + + return inputs + + +class CogVideoXResnetBlock3D(nn.Module): + r""" + A 3D ResNet block used in the CogVideoX model. + + Args: + in_channels (`int`): + Number of input channels. + out_channels (`int`, *optional*): + Number of output channels. If None, defaults to `in_channels`. + dropout (`float`, defaults to `0.0`): + Dropout rate. + temb_channels (`int`, defaults to `512`): + Number of time embedding channels. + groups (`int`, defaults to `32`): + Number of groups to separate the channels into for group normalization. + eps (`float`, defaults to `1e-6`): + Epsilon value for normalization layers. + non_linearity (`str`, defaults to `"swish"`): + Activation function to use. + conv_shortcut (bool, defaults to `False`): + Whether or not to use a convolution shortcut. + spatial_norm_dim (`int`, *optional*): + The dimension to use for spatial norm if it is to be used instead of group norm. + pad_mode (str, defaults to `"first"`): + Padding mode. 
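+
+    Computation sketch (a compact restatement of the forward pass below, not extra behaviour):
+    `h = conv1(act(norm1(x)))`, optionally shifted by `temb_proj(act(temb))`, then
+    `h = conv2(dropout(act(norm2(h))))`, and the output is `h + shortcut(x)`, where the shortcut is the
+    identity when `in_channels == out_channels`, a 1x1 `CogVideoXSafeConv3d` otherwise, or a 3x3 causal
+    convolution when `conv_shortcut=True`. When `spatial_norm_dim` is set, both norms are
+    `CogVideoXSpatialNorm3D` conditioned on `zq`.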
+ """ + + def __init__( + self, + in_channels: int, + out_channels: Optional[int] = None, + dropout: float = 0.0, + temb_channels: int = 512, + groups: int = 32, + eps: float = 1e-6, + non_linearity: str = "swish", + conv_shortcut: bool = False, + spatial_norm_dim: Optional[int] = None, + pad_mode: str = "first", + ): + super().__init__() + + out_channels = out_channels or in_channels + + self.in_channels = in_channels + self.out_channels = out_channels + self.nonlinearity = get_activation(non_linearity) + self.use_conv_shortcut = conv_shortcut + self.spatial_norm_dim = spatial_norm_dim + + if spatial_norm_dim is None: + self.norm1 = nn.GroupNorm(num_channels=in_channels, num_groups=groups, eps=eps) + self.norm2 = nn.GroupNorm(num_channels=out_channels, num_groups=groups, eps=eps) + else: + self.norm1 = CogVideoXSpatialNorm3D( + f_channels=in_channels, + zq_channels=spatial_norm_dim, + groups=groups, + ) + self.norm2 = CogVideoXSpatialNorm3D( + f_channels=out_channels, + zq_channels=spatial_norm_dim, + groups=groups, + ) + + self.conv1 = CogVideoXCausalConv3d( + in_channels=in_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode + ) + + if temb_channels > 0: + self.temb_proj = nn.Linear(in_features=temb_channels, out_features=out_channels) + + self.dropout = nn.Dropout(dropout) + self.conv2 = CogVideoXCausalConv3d( + in_channels=out_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode + ) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = CogVideoXCausalConv3d( + in_channels=in_channels, out_channels=out_channels, kernel_size=3, pad_mode=pad_mode + ) + else: + self.conv_shortcut = CogVideoXSafeConv3d( + in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, padding=0 + ) + + def forward( + self, + inputs: torch.Tensor, + temb: Optional[torch.Tensor] = None, + zq: Optional[torch.Tensor] = None, + conv_cache: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + new_conv_cache = {} + conv_cache = conv_cache or {} + + hidden_states = inputs + + if zq is not None: + hidden_states, new_conv_cache["norm1"] = self.norm1(hidden_states, zq, conv_cache=conv_cache.get("norm1")) + else: + hidden_states = self.norm1(hidden_states) + + hidden_states = self.nonlinearity(hidden_states) + hidden_states, new_conv_cache["conv1"] = self.conv1(hidden_states, conv_cache=conv_cache.get("conv1")) + + if temb is not None: + hidden_states = hidden_states + self.temb_proj(self.nonlinearity(temb))[:, :, None, None, None] + + if zq is not None: + hidden_states, new_conv_cache["norm2"] = self.norm2(hidden_states, zq, conv_cache=conv_cache.get("norm2")) + else: + hidden_states = self.norm2(hidden_states) + + hidden_states = self.nonlinearity(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states, new_conv_cache["conv2"] = self.conv2(hidden_states, conv_cache=conv_cache.get("conv2")) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + inputs, new_conv_cache["conv_shortcut"] = self.conv_shortcut( + inputs, conv_cache=conv_cache.get("conv_shortcut") + ) + else: + inputs = self.conv_shortcut(inputs) + + hidden_states = hidden_states + inputs + return hidden_states, new_conv_cache + + +class CogVideoXDownBlock3D(nn.Module): + r""" + A downsampling block used in the CogVideoX model. + + Args: + in_channels (`int`): + Number of input channels. + out_channels (`int`, *optional*): + Number of output channels. If None, defaults to `in_channels`. 
+ temb_channels (`int`, defaults to `512`): + Number of time embedding channels. + num_layers (`int`, defaults to `1`): + Number of resnet layers. + dropout (`float`, defaults to `0.0`): + Dropout rate. + resnet_eps (`float`, defaults to `1e-6`): + Epsilon value for normalization layers. + resnet_act_fn (`str`, defaults to `"swish"`): + Activation function to use. + resnet_groups (`int`, defaults to `32`): + Number of groups to separate the channels into for group normalization. + add_downsample (`bool`, defaults to `True`): + Whether or not to use a downsampling layer. If not used, output dimension would be same as input dimension. + compress_time (`bool`, defaults to `False`): + Whether or not to downsample across temporal dimension. + pad_mode (str, defaults to `"first"`): + Padding mode. + """ + + _supports_gradient_checkpointing = True + + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + add_downsample: bool = True, + downsample_padding: int = 0, + compress_time: bool = False, + pad_mode: str = "first", + ): + super().__init__() + + resnets = [] + for i in range(num_layers): + in_channel = in_channels if i == 0 else out_channels + resnets.append( + CogVideoXResnetBlock3D( + in_channels=in_channel, + out_channels=out_channels, + dropout=dropout, + temb_channels=temb_channels, + groups=resnet_groups, + eps=resnet_eps, + non_linearity=resnet_act_fn, + pad_mode=pad_mode, + ) + ) + + self.resnets = nn.ModuleList(resnets) + self.downsamplers = None + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + CogVideoXDownsample3D( + out_channels, out_channels, padding=downsample_padding, compress_time=compress_time + ) + ] + ) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + zq: Optional[torch.Tensor] = None, + conv_cache: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + r"""Forward method of the `CogVideoXDownBlock3D` class.""" + + new_conv_cache = {} + conv_cache = conv_cache or {} + + for i, resnet in enumerate(self.resnets): + conv_cache_key = f"resnet_{i}" + + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def create_forward(*inputs): + return module(*inputs) + + return create_forward + + hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + zq, + conv_cache.get(conv_cache_key), + ) + else: + hidden_states, new_conv_cache[conv_cache_key] = resnet( + hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key) + ) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + return hidden_states, new_conv_cache + + +class CogVideoXMidBlock3D(nn.Module): + r""" + A middle block used in the CogVideoX model. + + Args: + in_channels (`int`): + Number of input channels. + temb_channels (`int`, defaults to `512`): + Number of time embedding channels. + dropout (`float`, defaults to `0.0`): + Dropout rate. + num_layers (`int`, defaults to `1`): + Number of resnet layers. + resnet_eps (`float`, defaults to `1e-6`): + Epsilon value for normalization layers. + resnet_act_fn (`str`, defaults to `"swish"`): + Activation function to use. 
+ resnet_groups (`int`, defaults to `32`): + Number of groups to separate the channels into for group normalization. + spatial_norm_dim (`int`, *optional*): + The dimension to use for spatial norm if it is to be used instead of group norm. + pad_mode (str, defaults to `"first"`): + Padding mode. + """ + + _supports_gradient_checkpointing = True + + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + spatial_norm_dim: Optional[int] = None, + pad_mode: str = "first", + ): + super().__init__() + + resnets = [] + for _ in range(num_layers): + resnets.append( + CogVideoXResnetBlock3D( + in_channels=in_channels, + out_channels=in_channels, + dropout=dropout, + temb_channels=temb_channels, + groups=resnet_groups, + eps=resnet_eps, + spatial_norm_dim=spatial_norm_dim, + non_linearity=resnet_act_fn, + pad_mode=pad_mode, + ) + ) + self.resnets = nn.ModuleList(resnets) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + zq: Optional[torch.Tensor] = None, + conv_cache: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + r"""Forward method of the `CogVideoXMidBlock3D` class.""" + + new_conv_cache = {} + conv_cache = conv_cache or {} + + for i, resnet in enumerate(self.resnets): + conv_cache_key = f"resnet_{i}" + + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def create_forward(*inputs): + return module(*inputs) + + return create_forward + + hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, zq, conv_cache.get(conv_cache_key) + ) + else: + hidden_states, new_conv_cache[conv_cache_key] = resnet( + hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key) + ) + + return hidden_states, new_conv_cache + + +class CogVideoXUpBlock3D(nn.Module): + r""" + An upsampling block used in the CogVideoX model. + + Args: + in_channels (`int`): + Number of input channels. + out_channels (`int`, *optional*): + Number of output channels. If None, defaults to `in_channels`. + temb_channels (`int`, defaults to `512`): + Number of time embedding channels. + dropout (`float`, defaults to `0.0`): + Dropout rate. + num_layers (`int`, defaults to `1`): + Number of resnet layers. + resnet_eps (`float`, defaults to `1e-6`): + Epsilon value for normalization layers. + resnet_act_fn (`str`, defaults to `"swish"`): + Activation function to use. + resnet_groups (`int`, defaults to `32`): + Number of groups to separate the channels into for group normalization. + spatial_norm_dim (`int`, defaults to `16`): + The dimension to use for spatial norm if it is to be used instead of group norm. + add_upsample (`bool`, defaults to `True`): + Whether or not to use a upsampling layer. If not used, output dimension would be same as input dimension. + compress_time (`bool`, defaults to `False`): + Whether or not to downsample across temporal dimension. + pad_mode (str, defaults to `"first"`): + Padding mode. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + spatial_norm_dim: int = 16, + add_upsample: bool = True, + upsample_padding: int = 1, + compress_time: bool = False, + pad_mode: str = "first", + ): + super().__init__() + + resnets = [] + for i in range(num_layers): + in_channel = in_channels if i == 0 else out_channels + resnets.append( + CogVideoXResnetBlock3D( + in_channels=in_channel, + out_channels=out_channels, + dropout=dropout, + temb_channels=temb_channels, + groups=resnet_groups, + eps=resnet_eps, + non_linearity=resnet_act_fn, + spatial_norm_dim=spatial_norm_dim, + pad_mode=pad_mode, + ) + ) + + self.resnets = nn.ModuleList(resnets) + self.upsamplers = None + + if add_upsample: + self.upsamplers = nn.ModuleList( + [ + CogVideoXUpsample3D( + out_channels, out_channels, padding=upsample_padding, compress_time=compress_time + ) + ] + ) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + temb: Optional[torch.Tensor] = None, + zq: Optional[torch.Tensor] = None, + conv_cache: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + r"""Forward method of the `CogVideoXUpBlock3D` class.""" + + new_conv_cache = {} + conv_cache = conv_cache or {} + + for i, resnet in enumerate(self.resnets): + conv_cache_key = f"resnet_{i}" + + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def create_forward(*inputs): + return module(*inputs) + + return create_forward + + hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + zq, + conv_cache.get(conv_cache_key), + ) + else: + hidden_states, new_conv_cache[conv_cache_key] = resnet( + hidden_states, temb, zq, conv_cache=conv_cache.get(conv_cache_key) + ) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states) + + return hidden_states, new_conv_cache + + +class CogVideoXEncoder3D(nn.Module): + r""" + The `CogVideoXEncoder3D` layer of a variational autoencoder that encodes its input into a latent representation. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`): + The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available + options. + block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): + The number of output channels for each block. + act_fn (`str`, *optional*, defaults to `"silu"`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups for normalization. + """ + + _supports_gradient_checkpointing = True + + def __init__( + self, + in_channels: int = 3, + out_channels: int = 16, + down_block_types: Tuple[str, ...] = ( + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + ), + block_out_channels: Tuple[int, ...] 
= (128, 256, 256, 512), + layers_per_block: int = 3, + act_fn: str = "silu", + norm_eps: float = 1e-6, + norm_num_groups: int = 32, + dropout: float = 0.0, + pad_mode: str = "first", + temporal_compression_ratio: float = 4, + ): + super().__init__() + + # log2 of temporal_compress_times + temporal_compress_level = int(np.log2(temporal_compression_ratio)) + + self.conv_in = CogVideoXCausalConv3d(in_channels, block_out_channels[0], kernel_size=3, pad_mode=pad_mode) + self.down_blocks = nn.ModuleList([]) + + # down blocks + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + compress_time = i < temporal_compress_level + + if down_block_type == "CogVideoXDownBlock3D": + down_block = CogVideoXDownBlock3D( + in_channels=input_channel, + out_channels=output_channel, + temb_channels=0, + dropout=dropout, + num_layers=layers_per_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + add_downsample=not is_final_block, + compress_time=compress_time, + ) + else: + raise ValueError("Invalid `down_block_type` encountered. Must be `CogVideoXDownBlock3D`") + + self.down_blocks.append(down_block) + + # mid block + self.mid_block = CogVideoXMidBlock3D( + in_channels=block_out_channels[-1], + temb_channels=0, + dropout=dropout, + num_layers=2, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + pad_mode=pad_mode, + ) + + self.norm_out = nn.GroupNorm(norm_num_groups, block_out_channels[-1], eps=1e-6) + self.conv_act = nn.SiLU() + self.conv_out = CogVideoXCausalConv3d( + block_out_channels[-1], 2 * out_channels, kernel_size=3, pad_mode=pad_mode + ) + + self.gradient_checkpointing = False + + def forward( + self, + sample: torch.Tensor, + temb: Optional[torch.Tensor] = None, + conv_cache: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + r"""The forward method of the `CogVideoXEncoder3D` class.""" + + new_conv_cache = {} + conv_cache = conv_cache or {} + + hidden_states, new_conv_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in")) + + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + # 1. Down + for i, down_block in enumerate(self.down_blocks): + conv_cache_key = f"down_block_{i}" + hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint( + create_custom_forward(down_block), + hidden_states, + temb, + None, + conv_cache.get(conv_cache_key), + ) + + # 2. Mid + hidden_states, new_conv_cache["mid_block"] = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), + hidden_states, + temb, + None, + conv_cache.get("mid_block"), + ) + else: + # 1. Down + for i, down_block in enumerate(self.down_blocks): + conv_cache_key = f"down_block_{i}" + hidden_states, new_conv_cache[conv_cache_key] = down_block( + hidden_states, temb, None, conv_cache=conv_cache.get(conv_cache_key) + ) + + # 2. Mid + hidden_states, new_conv_cache["mid_block"] = self.mid_block( + hidden_states, temb, None, conv_cache=conv_cache.get("mid_block") + ) + + # 3. 
Post-process + hidden_states = self.norm_out(hidden_states) + hidden_states = self.conv_act(hidden_states) + + hidden_states, new_conv_cache["conv_out"] = self.conv_out(hidden_states, conv_cache=conv_cache.get("conv_out")) + + return hidden_states, new_conv_cache + + +class CogVideoXDecoder3D(nn.Module): + r""" + The `CogVideoXDecoder3D` layer of a variational autoencoder that decodes its latent representation into an output + sample. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options. + block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): + The number of output channels for each block. + act_fn (`str`, *optional*, defaults to `"silu"`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups for normalization. + """ + + _supports_gradient_checkpointing = True + + def __init__( + self, + in_channels: int = 16, + out_channels: int = 3, + up_block_types: Tuple[str, ...] = ( + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + ), + block_out_channels: Tuple[int, ...] = (128, 256, 256, 512), + layers_per_block: int = 3, + act_fn: str = "silu", + norm_eps: float = 1e-6, + norm_num_groups: int = 32, + dropout: float = 0.0, + pad_mode: str = "first", + temporal_compression_ratio: float = 4, + ): + super().__init__() + + reversed_block_out_channels = list(reversed(block_out_channels)) + + self.conv_in = CogVideoXCausalConv3d( + in_channels, reversed_block_out_channels[0], kernel_size=3, pad_mode=pad_mode + ) + + # mid block + self.mid_block = CogVideoXMidBlock3D( + in_channels=reversed_block_out_channels[0], + temb_channels=0, + num_layers=2, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + spatial_norm_dim=in_channels, + pad_mode=pad_mode, + ) + + # up blocks + self.up_blocks = nn.ModuleList([]) + + output_channel = reversed_block_out_channels[0] + temporal_compress_level = int(np.log2(temporal_compression_ratio)) + + for i, up_block_type in enumerate(up_block_types): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + compress_time = i < temporal_compress_level + + if up_block_type == "CogVideoXUpBlock3D": + up_block = CogVideoXUpBlock3D( + in_channels=prev_output_channel, + out_channels=output_channel, + temb_channels=0, + dropout=dropout, + num_layers=layers_per_block + 1, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + spatial_norm_dim=in_channels, + add_upsample=not is_final_block, + compress_time=compress_time, + pad_mode=pad_mode, + ) + prev_output_channel = output_channel + else: + raise ValueError("Invalid `up_block_type` encountered. 
Must be `CogVideoXUpBlock3D`") + + self.up_blocks.append(up_block) + + self.norm_out = CogVideoXSpatialNorm3D(reversed_block_out_channels[-1], in_channels, groups=norm_num_groups) + self.conv_act = nn.SiLU() + self.conv_out = CogVideoXCausalConv3d( + reversed_block_out_channels[-1], out_channels, kernel_size=3, pad_mode=pad_mode + ) + + self.gradient_checkpointing = False + + def forward( + self, + sample: torch.Tensor, + temb: Optional[torch.Tensor] = None, + conv_cache: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.Tensor: + r"""The forward method of the `CogVideoXDecoder3D` class.""" + + new_conv_cache = {} + conv_cache = conv_cache or {} + + hidden_states, new_conv_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in")) + + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + # 1. Mid + hidden_states, new_conv_cache["mid_block"] = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), + hidden_states, + temb, + sample, + conv_cache.get("mid_block"), + ) + + # 2. Up + for i, up_block in enumerate(self.up_blocks): + conv_cache_key = f"up_block_{i}" + hidden_states, new_conv_cache[conv_cache_key] = torch.utils.checkpoint.checkpoint( + create_custom_forward(up_block), + hidden_states, + temb, + sample, + conv_cache.get(conv_cache_key), + ) + else: + # 1. Mid + hidden_states, new_conv_cache["mid_block"] = self.mid_block( + hidden_states, temb, sample, conv_cache=conv_cache.get("mid_block") + ) + + # 2. Up + for i, up_block in enumerate(self.up_blocks): + conv_cache_key = f"up_block_{i}" + hidden_states, new_conv_cache[conv_cache_key] = up_block( + hidden_states, temb, sample, conv_cache=conv_cache.get(conv_cache_key) + ) + + # 3. Post-process + hidden_states, new_conv_cache["norm_out"] = self.norm_out( + hidden_states, sample, conv_cache=conv_cache.get("norm_out") + ) + hidden_states = self.conv_act(hidden_states) + hidden_states, new_conv_cache["conv_out"] = self.conv_out(hidden_states, conv_cache=conv_cache.get("conv_out")) + + return hidden_states, new_conv_cache + + +class AutoencoderKLCogVideoX(ModelMixin, ConfigMixin, FromOriginalModelMixin): + r""" + A VAE model with KL loss for encoding images into latents and decoding latent representations into images. Used in + [CogVideoX](https://github.com/THUDM/CogVideo). + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + in_channels (int, *optional*, defaults to 3): Number of channels in the input image. + out_channels (int, *optional*, defaults to 3): Number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`): + Tuple of downsample block types. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + Tuple of upsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`): + Tuple of block output channels. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + sample_size (`int`, *optional*, defaults to `32`): Sample input size. + scaling_factor (`float`, *optional*, defaults to `1.15258426`): + The component-wise standard deviation of the trained latent space computed using the first batch of the + training set. 
This is used to scale the latent space to have unit variance when training the diffusion + model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the + diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1 + / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image + Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. + force_upcast (`bool`, *optional*, default to `True`): + If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE + can be fine-tuned / trained to a lower range without loosing too much precision in which case + `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix + """ + + _supports_gradient_checkpointing = True + _no_split_modules = ["CogVideoXResnetBlock3D"] + + @register_to_config + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ( + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + "CogVideoXDownBlock3D", + ), + up_block_types: Tuple[str] = ( + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + "CogVideoXUpBlock3D", + ), + block_out_channels: Tuple[int] = (128, 256, 256, 512), + latent_channels: int = 16, + layers_per_block: int = 3, + act_fn: str = "silu", + norm_eps: float = 1e-6, + norm_num_groups: int = 32, + temporal_compression_ratio: float = 4, + sample_height: int = 480, + sample_width: int = 720, + scaling_factor: float = 1.15258426, + shift_factor: Optional[float] = None, + latents_mean: Optional[Tuple[float]] = None, + latents_std: Optional[Tuple[float]] = None, + force_upcast: float = True, + use_quant_conv: bool = False, + use_post_quant_conv: bool = False, + invert_scale_latents: bool = False, + ): + super().__init__() + + self.encoder = CogVideoXEncoder3D( + in_channels=in_channels, + out_channels=latent_channels, + down_block_types=down_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + act_fn=act_fn, + norm_eps=norm_eps, + norm_num_groups=norm_num_groups, + temporal_compression_ratio=temporal_compression_ratio, + ) + self.decoder = CogVideoXDecoder3D( + in_channels=latent_channels, + out_channels=out_channels, + up_block_types=up_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + act_fn=act_fn, + norm_eps=norm_eps, + norm_num_groups=norm_num_groups, + temporal_compression_ratio=temporal_compression_ratio, + ) + self.quant_conv = CogVideoXSafeConv3d(2 * out_channels, 2 * out_channels, 1) if use_quant_conv else None + self.post_quant_conv = CogVideoXSafeConv3d(out_channels, out_channels, 1) if use_post_quant_conv else None + + self.use_slicing = False + self.use_tiling = False + self.auto_split_process = False + + # Can be increased to decode more latent frames at once, but comes at a reasonable memory cost and it is not + # recommended because the temporal parts of the VAE, here, are tricky to understand. 
+ # If you decode X latent frames together, the number of output frames is: + # (X + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) => X + 6 frames + # + # Example with num_latent_frames_batch_size = 2: + # - 12 latent frames: (0, 1), (2, 3), (4, 5), (6, 7), (8, 9), (10, 11) are processed together + # => (12 // 2 frame slices) * ((2 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) + # => 6 * 8 = 48 frames + # - 13 latent frames: (0, 1, 2) (special case), (3, 4), (5, 6), (7, 8), (9, 10), (11, 12) are processed together + # => (1 frame slice) * ((3 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) + + # ((13 - 3) // 2) * ((2 num_latent_frames_batch_size) + (2 conv cache) + (2 time upscale_1) + (4 time upscale_2) - (2 causal conv downscale)) + # => 1 * 9 + 5 * 8 = 49 frames + # It has been implemented this way so as to not have "magic values" in the code base that would be hard to explain. Note that + # setting it to anything other than 2 would give poor results because the VAE hasn't been trained to be adaptive with different + # number of temporal frames. + self.num_latent_frames_batch_size = 2 + self.num_sample_frames_batch_size = 8 + + # We make the minimum height and width of sample for tiling half that of the generally supported + self.tile_sample_min_height = sample_height // 2 + self.tile_sample_min_width = sample_width // 2 + self.tile_latent_min_height = int( + self.tile_sample_min_height / (2 ** (len(self.config.block_out_channels) - 1)) + ) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** (len(self.config.block_out_channels) - 1))) + + # These are experimental overlap factors that were chosen based on experimentation and seem to work best for + # 720x480 (WxH) resolution. The above resolution is the strongly recommended generation resolution in CogVideoX + # and so the tiling implementation has only been tested on those specific resolutions. + self.tile_overlap_factor_height = 1 / 6 + self.tile_overlap_factor_width = 1 / 5 + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (CogVideoXEncoder3D, CogVideoXDecoder3D)): + module.gradient_checkpointing = value + + def enable_tiling( + self, + tile_sample_min_height: Optional[int] = None, + tile_sample_min_width: Optional[int] = None, + tile_overlap_factor_height: Optional[float] = None, + tile_overlap_factor_width: Optional[float] = None, + ) -> None: + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + + Args: + tile_sample_min_height (`int`, *optional*): + The minimum height required for a sample to be separated into tiles across the height dimension. + tile_sample_min_width (`int`, *optional*): + The minimum width required for a sample to be separated into tiles across the width dimension. + tile_overlap_factor_height (`int`, *optional*): + The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are + no tiling artifacts produced across the height dimension. Must be between 0 and 1. Setting a higher + value might cause more tiles to be processed leading to slow down of the decoding process. 
+ tile_overlap_factor_width (`int`, *optional*): + The minimum amount of overlap between two consecutive horizontal tiles. This is to ensure that there + are no tiling artifacts produced across the width dimension. Must be between 0 and 1. Setting a higher + value might cause more tiles to be processed leading to slow down of the decoding process. + """ + self.use_tiling = True + self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height + self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width + self.tile_latent_min_height = int( + self.tile_sample_min_height / (2 ** (len(self.config.block_out_channels) - 1)) + ) + self.tile_latent_min_width = int(self.tile_sample_min_width / (2 ** (len(self.config.block_out_channels) - 1))) + self.tile_overlap_factor_height = tile_overlap_factor_height or self.tile_overlap_factor_height + self.tile_overlap_factor_width = tile_overlap_factor_width or self.tile_overlap_factor_width + + def disable_tiling(self) -> None: + r""" + Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.use_tiling = False + + def enable_slicing(self) -> None: + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.use_slicing = True + + def disable_slicing(self) -> None: + r""" + Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.use_slicing = False + + def _set_first_frame(self): + for name, module in self.named_modules(): + if isinstance(module, CogVideoXUpsample3D): + module.auto_split_process = False + module.first_frame_flag = True + + def _set_rest_frame(self): + for name, module in self.named_modules(): + if isinstance(module, CogVideoXUpsample3D): + module.auto_split_process = False + module.first_frame_flag = False + + def enable_auto_split_process(self) -> None: + self.auto_split_process = True + for name, module in self.named_modules(): + if isinstance(module, CogVideoXUpsample3D): + module.auto_split_process = True + + def disable_auto_split_process(self) -> None: + self.auto_split_process = False + + def _encode(self, x: torch.Tensor) -> torch.Tensor: + batch_size, num_channels, num_frames, height, width = x.shape + + if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height): + return self.tiled_encode(x) + + frame_batch_size = self.num_sample_frames_batch_size + # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k. + # As the extra single frame is handled inside the loop, it is not required to round up here. 
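+        # Worked example (illustrative only): with num_frames=49 and frame_batch_size=8 this gives
+        # num_batches=6 and remaining_frames=1, so the first chunk covers frames 0..8 (9 frames, absorbing the
+        # extra leading frame) and the remaining 5 chunks cover 8 frames each, while conv_cache carries the
+        # causal-convolution context across chunks.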
+ num_batches = max(num_frames // frame_batch_size, 1) + conv_cache = None + enc = [] + + for i in range(num_batches): + remaining_frames = num_frames % frame_batch_size + start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames) + end_frame = frame_batch_size * (i + 1) + remaining_frames + x_intermediate = x[:, :, start_frame:end_frame] + x_intermediate, conv_cache = self.encoder(x_intermediate, conv_cache=conv_cache) + if self.quant_conv is not None: + x_intermediate = self.quant_conv(x_intermediate) + enc.append(x_intermediate) + + enc = torch.cat(enc, dim=2) + return enc + + @apply_forward_hook + def encode( + self, x: torch.Tensor, return_dict: bool = True + ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]: + """ + Encode a batch of images into latents. + + Args: + x (`torch.Tensor`): Input batch of images. + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. + + Returns: + The latent representations of the encoded videos. If `return_dict` is True, a + [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned. + """ + if self.use_slicing and x.shape[0] > 1: + encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)] + h = torch.cat(encoded_slices) + else: + h = self._encode(x) + + posterior = DiagonalGaussianDistribution(h) + + if not return_dict: + return (posterior,) + return AutoencoderKLOutput(latent_dist=posterior) + + def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: + batch_size, num_channels, num_frames, height, width = z.shape + + if self.use_tiling and (width > self.tile_latent_min_width or height > self.tile_latent_min_height): + return self.tiled_decode(z, return_dict=return_dict) + + if self.auto_split_process: + frame_batch_size = self.num_latent_frames_batch_size + num_batches = max(num_frames // frame_batch_size, 1) + conv_cache = None + dec = [] + + for i in range(num_batches): + remaining_frames = num_frames % frame_batch_size + start_frame = frame_batch_size * i + (0 if i == 0 else remaining_frames) + end_frame = frame_batch_size * (i + 1) + remaining_frames + z_intermediate = z[:, :, start_frame:end_frame] + if self.post_quant_conv is not None: + z_intermediate = self.post_quant_conv(z_intermediate) + z_intermediate, conv_cache = self.decoder(z_intermediate, conv_cache=conv_cache) + dec.append(z_intermediate) + else: + conv_cache = None + start_frame = 0 + end_frame = 1 + dec = [] + + self._set_first_frame() + z_intermediate = z[:, :, start_frame:end_frame] + if self.post_quant_conv is not None: + z_intermediate = self.post_quant_conv(z_intermediate) + z_intermediate, conv_cache = self.decoder(z_intermediate, conv_cache=conv_cache) + dec.append(z_intermediate) + + self._set_rest_frame() + start_frame = end_frame + end_frame += self.num_latent_frames_batch_size + + while start_frame < num_frames: + z_intermediate = z[:, :, start_frame:end_frame] + if self.post_quant_conv is not None: + z_intermediate = self.post_quant_conv(z_intermediate) + z_intermediate, conv_cache = self.decoder(z_intermediate, conv_cache=conv_cache) + dec.append(z_intermediate) + start_frame = end_frame + end_frame += self.num_latent_frames_batch_size + + dec = torch.cat(dec, dim=2) + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + @apply_forward_hook + def decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, 
torch.Tensor]: + """ + Decode a batch of images. + + Args: + z (`torch.Tensor`): Input batch of latent vectors. + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. + + Returns: + [`~models.vae.DecoderOutput`] or `tuple`: + If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is + returned. + """ + if self.use_slicing and z.shape[0] > 1: + decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)] + decoded = torch.cat(decoded_slices) + else: + decoded = self._decode(z).sample + + if not return_dict: + return (decoded,) + return DecoderOutput(sample=decoded) + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[3], b.shape[3], blend_extent) + for y in range(blend_extent): + b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * ( + y / blend_extent + ) + return b + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[4], b.shape[4], blend_extent) + for x in range(blend_extent): + b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * ( + x / blend_extent + ) + return b + + def tiled_encode(self, x: torch.Tensor) -> torch.Tensor: + r"""Encode a batch of images using a tiled encoder. + + When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several + steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is + different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the + tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the + output, but they should be much less noticeable. + + Args: + x (`torch.Tensor`): Input batch of videos. + + Returns: + `torch.Tensor`: + The latent representation of the encoded videos. + """ + # For a rough memory estimate, take a look at the `tiled_decode` method. + batch_size, num_channels, num_frames, height, width = x.shape + + overlap_height = int(self.tile_sample_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_sample_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_latent_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_latent_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_latent_min_height - blend_extent_height + row_limit_width = self.tile_latent_min_width - blend_extent_width + frame_batch_size = self.num_sample_frames_batch_size + + # Split x into overlapping tiles and encode them separately. + # The tiles have an overlap to avoid seams between tiles. + rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + # Note: We expect the number of frames to be either `1` or `frame_batch_size * k` or `frame_batch_size * k + 1` for some k. + # As the extra single frame is handled inside the loop, it is not required to round up here. 
+ num_batches = max(num_frames // frame_batch_size, 1) + conv_cache = None + time = [] + + for k in range(num_batches): + remaining_frames = num_frames % frame_batch_size + start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames) + end_frame = frame_batch_size * (k + 1) + remaining_frames + tile = x[ + :, + :, + start_frame:end_frame, + i : i + self.tile_sample_min_height, + j : j + self.tile_sample_min_width, + ] + tile, conv_cache = self.encoder(tile, conv_cache=conv_cache) + if self.quant_conv is not None: + tile = self.quant_conv(tile) + time.append(tile) + + row.append(torch.cat(time, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + + enc = torch.cat(result_rows, dim=3) + return enc + + def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: + r""" + Decode a batch of images using a tiled decoder. + + Args: + z (`torch.Tensor`): Input batch of latent vectors. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. + + Returns: + [`~models.vae.DecoderOutput`] or `tuple`: + If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is + returned. + """ + # Rough memory assessment: + # - In CogVideoX-2B, there are a total of 24 CausalConv3d layers. + # - The biggest intermediate dimensions are: [1, 128, 9, 480, 720]. + # - Assume fp16 (2 bytes per value). + # Memory required: 1 * 128 * 9 * 480 * 720 * 24 * 2 / 1024**3 = 17.8 GB + # + # Memory assessment when using tiling: + # - Assume everything as above but now HxW is 240x360 by tiling in half + # Memory required: 1 * 128 * 9 * 240 * 360 * 24 * 2 / 1024**3 = 4.5 GB + + batch_size, num_channels, num_frames, height, width = z.shape + + overlap_height = int(self.tile_latent_min_height * (1 - self.tile_overlap_factor_height)) + overlap_width = int(self.tile_latent_min_width * (1 - self.tile_overlap_factor_width)) + blend_extent_height = int(self.tile_sample_min_height * self.tile_overlap_factor_height) + blend_extent_width = int(self.tile_sample_min_width * self.tile_overlap_factor_width) + row_limit_height = self.tile_sample_min_height - blend_extent_height + row_limit_width = self.tile_sample_min_width - blend_extent_width + frame_batch_size = self.num_latent_frames_batch_size + + # Split z into overlapping tiles and decode them separately. + # The tiles have an overlap to avoid seams between tiles. 
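Before the decode loop below, it may help to see what `blend_v` / `blend_h` actually do to two neighbouring tiles: the leading `blend_extent` rows or columns of the second tile are replaced by a linear cross-fade with the trailing rows or columns of the first, after which each tile is cropped to `row_limit_*` so the overlap is only counted once. A self-contained sketch (the helper is copied verbatim from this file so the snippet runs on its own):

```python
import torch

# blend_h copied verbatim from this file so the sketch runs standalone.
def blend_h(a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
    blend_extent = min(a.shape[4], b.shape[4], blend_extent)
    for x in range(blend_extent):
        b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
            x / blend_extent
        )
    return b

left = torch.zeros(1, 1, 1, 1, 8)    # a tile that decoded to all zeros
right = torch.ones(1, 1, 1, 1, 8)    # its right-hand neighbour, all ones
print(blend_h(left, right, blend_extent=4)[0, 0, 0, 0])
# tensor([0.0000, 0.2500, 0.5000, 0.7500, 1.0000, 1.0000, 1.0000, 1.0000])
```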
+ rows = [] + for i in range(0, height, overlap_height): + row = [] + for j in range(0, width, overlap_width): + if self.auto_split_process: + num_batches = max(num_frames // frame_batch_size, 1) + conv_cache = None + time = [] + + for k in range(num_batches): + remaining_frames = num_frames % frame_batch_size + start_frame = frame_batch_size * k + (0 if k == 0 else remaining_frames) + end_frame = frame_batch_size * (k + 1) + remaining_frames + tile = z[ + :, + :, + start_frame:end_frame, + i : i + self.tile_latent_min_height, + j : j + self.tile_latent_min_width, + ] + if self.post_quant_conv is not None: + tile = self.post_quant_conv(tile) + tile, conv_cache = self.decoder(tile, conv_cache=conv_cache) + time.append(tile) + + row.append(torch.cat(time, dim=2)) + else: + conv_cache = None + start_frame = 0 + end_frame = 1 + dec = [] + + tile = z[ + :, + :, + start_frame:end_frame, + i : i + self.tile_latent_min_height, + j : j + self.tile_latent_min_width, + ] + + self._set_first_frame() + if self.post_quant_conv is not None: + tile = self.post_quant_conv(tile) + tile, conv_cache = self.decoder(tile, conv_cache=conv_cache) + dec.append(tile) + + self._set_rest_frame() + start_frame = end_frame + end_frame += self.num_latent_frames_batch_size + + while start_frame < num_frames: + tile = z[ + :, + :, + start_frame:end_frame, + i : i + self.tile_latent_min_height, + j : j + self.tile_latent_min_width, + ] + if self.post_quant_conv is not None: + tile = self.post_quant_conv(tile) + tile, conv_cache = self.decoder(tile, conv_cache=conv_cache) + dec.append(tile) + start_frame = end_frame + end_frame += self.num_latent_frames_batch_size + + row.append(torch.cat(dec, dim=2)) + rows.append(row) + + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent_height) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent_width) + result_row.append(tile[:, :, :, :row_limit_height, :row_limit_width]) + result_rows.append(torch.cat(result_row, dim=4)) + + dec = torch.cat(result_rows, dim=3) + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + def forward( + self, + sample: torch.Tensor, + sample_posterior: bool = False, + return_dict: bool = True, + generator: Optional[torch.Generator] = None, + ) -> Union[torch.Tensor, torch.Tensor]: + x = sample + posterior = self.encode(x).latent_dist + if sample_posterior: + z = posterior.sample(generator=generator) + else: + z = posterior.mode() + dec = self.decode(z) + if not return_dict: + return (dec,) + return dec + + @classmethod + def from_pretrained(cls, pretrained_model_path, subfolder=None, **vae_additional_kwargs): + if subfolder is not None: + pretrained_model_path = os.path.join(pretrained_model_path, subfolder) + + config_file = os.path.join(pretrained_model_path, 'config.json') + if not os.path.isfile(config_file): + raise RuntimeError(f"{config_file} does not exist") + with open(config_file, "r") as f: + config = json.load(f) + + model = cls.from_config(config, **vae_additional_kwargs) + from diffusers.utils import WEIGHTS_NAME + model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME) + model_file_safetensors = model_file.replace(".bin", ".safetensors") + if os.path.exists(model_file_safetensors): + from safetensors.torch import load_file, safe_open + state_dict = 
load_file(model_file_safetensors) + else: + if not os.path.isfile(model_file): + raise RuntimeError(f"{model_file} does not exist") + state_dict = torch.load(model_file, map_location="cpu") + m, u = model.load_state_dict(state_dict, strict=False) + print(f"### missing keys: {len(m)}; \n### unexpected keys: {len(u)};") + print(m, u) + return model \ No newline at end of file diff --git a/videox_fun/pipeline/__init__.py b/videox_fun/pipeline/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f1c3face706b78232d0deea56ecd1dc2d742ce94 --- /dev/null +++ b/videox_fun/pipeline/__init__.py @@ -0,0 +1,2 @@ +from .pipeline_cogvideox_fun import CogVideoXFunPipeline +from .pipeline_cogvideox_fun_inpaint import CogVideoXFunInpaintPipeline diff --git a/videox_fun/pipeline/pipeline_cogvideox_fun.py b/videox_fun/pipeline/pipeline_cogvideox_fun.py new file mode 100644 index 0000000000000000000000000000000000000000..68568a6069ddc30ad4b11bb833b6307b5de3ceb6 --- /dev/null +++ b/videox_fun/pipeline/pipeline_cogvideox_fun.py @@ -0,0 +1,862 @@ +# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import math +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.models.embeddings import get_1d_rotary_pos_embed +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler +from diffusers.utils import BaseOutput, logging, replace_example_docstring +from diffusers.utils.torch_utils import randn_tensor +from diffusers.video_processor import VideoProcessor + +from ..models import (AutoencoderKLCogVideoX, + CogVideoXTransformer3DModel, T5EncoderModel, + T5Tokenizer) + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```python + pass + ``` +""" + + +# Copied from diffusers.models.embeddings.get_3d_rotary_pos_embed +def get_3d_rotary_pos_embed( + embed_dim, + crops_coords, + grid_size, + temporal_size, + theta: int = 10000, + use_real: bool = True, + grid_type: str = "linspace", + max_size: Optional[Tuple[int, int]] = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + RoPE for video tokens with 3D structure. + + Args: + embed_dim: (`int`): + The embedding dimension size, corresponding to hidden_size_head. + crops_coords (`Tuple[int]`): + The top-left and bottom-right coordinates of the crop. + grid_size (`Tuple[int]`): + The grid size of the spatial positional embedding (height, width). + temporal_size (`int`): + The size of the temporal dimension. + theta (`float`): + Scaling factor for frequency computation. + grid_type (`str`): + Whether to use "linspace" or "slice" to compute grids. 
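The custom `from_pretrained` on the VAE above prefers a `.safetensors` sibling of the usual diffusers weights file and only falls back to the `.bin` checkpoint. A minimal sketch of that loading order, under the assumption that diffusers' `WEIGHTS_NAME` is `diffusion_pytorch_model.bin` (the helper name here is hypothetical):

```python
import os
import torch
from safetensors.torch import load_file

def load_vae_state_dict(pretrained_model_path: str,
                        weights_name: str = "diffusion_pytorch_model.bin"):
    # Prefer the .safetensors sibling of the .bin checkpoint, as from_pretrained above does.
    model_file = os.path.join(pretrained_model_path, weights_name)
    model_file_safetensors = model_file.replace(".bin", ".safetensors")
    if os.path.exists(model_file_safetensors):
        return load_file(model_file_safetensors)
    if not os.path.isfile(model_file):
        raise RuntimeError(f"{model_file} does not exist")
    return torch.load(model_file, map_location="cpu")
```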
+ + Returns: + `torch.Tensor`: positional embedding with shape `(temporal_size * grid_size[0] * grid_size[1], embed_dim/2)`. + """ + if use_real is not True: + raise ValueError(" `use_real = False` is not currently supported for get_3d_rotary_pos_embed") + + if grid_type == "linspace": + start, stop = crops_coords + grid_size_h, grid_size_w = grid_size + grid_h = np.linspace(start[0], stop[0], grid_size_h, endpoint=False, dtype=np.float32) + grid_w = np.linspace(start[1], stop[1], grid_size_w, endpoint=False, dtype=np.float32) + grid_t = np.arange(temporal_size, dtype=np.float32) + grid_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32) + elif grid_type == "slice": + max_h, max_w = max_size + grid_size_h, grid_size_w = grid_size + grid_h = np.arange(max_h, dtype=np.float32) + grid_w = np.arange(max_w, dtype=np.float32) + grid_t = np.arange(temporal_size, dtype=np.float32) + else: + raise ValueError("Invalid value passed for `grid_type`.") + + # Compute dimensions for each axis + dim_t = embed_dim // 4 + dim_h = embed_dim // 8 * 3 + dim_w = embed_dim // 8 * 3 + + # Temporal frequencies + freqs_t = get_1d_rotary_pos_embed(dim_t, grid_t, use_real=True) + # Spatial frequencies for height and width + freqs_h = get_1d_rotary_pos_embed(dim_h, grid_h, use_real=True) + freqs_w = get_1d_rotary_pos_embed(dim_w, grid_w, use_real=True) + + # BroadCast and concatenate temporal and spaial frequencie (height and width) into a 3d tensor + def combine_time_height_width(freqs_t, freqs_h, freqs_w): + freqs_t = freqs_t[:, None, None, :].expand( + -1, grid_size_h, grid_size_w, -1 + ) # temporal_size, grid_size_h, grid_size_w, dim_t + freqs_h = freqs_h[None, :, None, :].expand( + temporal_size, -1, grid_size_w, -1 + ) # temporal_size, grid_size_h, grid_size_2, dim_h + freqs_w = freqs_w[None, None, :, :].expand( + temporal_size, grid_size_h, -1, -1 + ) # temporal_size, grid_size_h, grid_size_2, dim_w + + freqs = torch.cat( + [freqs_t, freqs_h, freqs_w], dim=-1 + ) # temporal_size, grid_size_h, grid_size_w, (dim_t + dim_h + dim_w) + freqs = freqs.view( + temporal_size * grid_size_h * grid_size_w, -1 + ) # (temporal_size * grid_size_h * grid_size_w), (dim_t + dim_h + dim_w) + return freqs + + t_cos, t_sin = freqs_t # both t_cos and t_sin has shape: temporal_size, dim_t + h_cos, h_sin = freqs_h # both h_cos and h_sin has shape: grid_size_h, dim_h + w_cos, w_sin = freqs_w # both w_cos and w_sin has shape: grid_size_w, dim_w + + if grid_type == "slice": + t_cos, t_sin = t_cos[:temporal_size], t_sin[:temporal_size] + h_cos, h_sin = h_cos[:grid_size_h], h_sin[:grid_size_h] + w_cos, w_sin = w_cos[:grid_size_w], w_sin[:grid_size_w] + + cos = combine_time_height_width(t_cos, h_cos, w_cos) + sin = combine_time_height_width(t_sin, h_sin, w_sin) + return cos, sin + + +# Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid +def get_resize_crop_region_for_grid(src, tgt_width, tgt_height): + tw = tgt_width + th = tgt_height + h, w = src + r = h / w + if r > (th / tw): + resize_height = th + resize_width = int(round(th / h * w)) + else: + resize_width = tw + resize_height = int(round(tw / w * h)) + + crop_top = int(round((th - resize_height) / 2.0)) + crop_left = int(round((tw - resize_width) / 2.0)) + + return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width) + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] 
= None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +@dataclass +class CogVideoXFunPipelineOutput(BaseOutput): + r""" + Output class for CogVideo pipelines. + + Args: + video (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing + denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape + `(batch_size, num_frames, channels, height, width)`. + """ + + videos: torch.Tensor + + +class CogVideoXFunPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-video generation using CogVideoX_Fun. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
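`retrieve_timesteps`, defined a little further up, simply forwards whichever schedule specification the caller provides and returns the resulting schedule plus its length. A minimal usage sketch, assuming a recent `diffusers` that ships `CogVideoXDDIMScheduler` and that the repository root is on `sys.path` so this module imports as `videox_fun.pipeline.pipeline_cogvideox_fun`:

```python
from diffusers import CogVideoXDDIMScheduler
from videox_fun.pipeline.pipeline_cogvideox_fun import retrieve_timesteps

scheduler = CogVideoXDDIMScheduler(num_train_timesteps=1000)

# Default path: let the scheduler build a 50-step schedule.
timesteps, n_steps = retrieve_timesteps(scheduler, num_inference_steps=50, device="cpu")
print(n_steps, timesteps[:3])   # 50 and the first (largest) timesteps of the schedule

# Passing both `timesteps` and `sigmas` raises a ValueError, as documented above.
```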
+ + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations. + text_encoder ([`T5EncoderModel`]): + Frozen text-encoder. CogVideoX uses + [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the + [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant. + tokenizer (`T5Tokenizer`): + Tokenizer of class + [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). + transformer ([`CogVideoXTransformer3DModel`]): + A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `transformer` to denoise the encoded video latents. + """ + + _optional_components = [] + model_cpu_offload_seq = "text_encoder->transformer->vae" + + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + ] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + vae: AutoencoderKLCogVideoX, + transformer: CogVideoXTransformer3DModel, + scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler], + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler + ) + self.vae_scale_factor_spatial = ( + 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8 + ) + self.vae_scale_factor_temporal = ( + self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4 + ) + + self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial) + + def _get_t5_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + num_videos_per_prompt: int = 1, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + device = device or self._execution_device + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because `max_sequence_length` is set to " + f" {max_sequence_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder(text_input_ids.to(device))[0] + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) + + return prompt_embeds + + def encode_prompt( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + do_classifier_free_guidance: bool = True, + num_videos_per_prompt: int = 1, + prompt_embeds: 
Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + Whether to use classifier free guidance or not. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos that should be generated per prompt. torch device to place the resulting embeddings on + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + device: (`torch.device`, *optional*): + torch device + dtype: (`torch.dtype`, *optional*): + torch dtype + """ + device = device or self._execution_device + + prompt = [prompt] if isinstance(prompt, str) else prompt + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_embeds = self._get_t5_prompt_embeds( + prompt=prompt, + num_videos_per_prompt=num_videos_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + if do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + + negative_prompt_embeds = self._get_t5_prompt_embeds( + prompt=negative_prompt, + num_videos_per_prompt=num_videos_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + return prompt_embeds, negative_prompt_embeds + + def prepare_latents( + self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None + ): + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + shape = ( + batch_size, + (num_frames - 1) // self.vae_scale_factor_temporal + 1, + num_channels_latents, + height // self.vae_scale_factor_spatial, + width // self.vae_scale_factor_spatial, + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def decode_latents(self, latents: torch.Tensor) -> torch.Tensor: + latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] + latents = 1 / self.vae.config.scaling_factor * latents + + frames = self.vae.decode(latents).sample + frames = (frames / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 + frames = frames.cpu().float().numpy() + return frames + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + negative_prompt, + callback_on_step_end_tensor_inputs, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def fuse_qkv_projections(self) -> None: + r"""Enables fused QKV projections.""" + self.fusing_transformer = True + self.transformer.fuse_qkv_projections() + + def unfuse_qkv_projections(self) -> None: + r"""Disable QKV projection fusion if enabled.""" + if not self.fusing_transformer: + logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.") + else: + self.transformer.unfuse_qkv_projections() + self.fusing_transformer = False + + def _prepare_rotary_positional_embeddings( + self, + height: int, + width: int, + num_frames: int, + device: torch.device, + ) -> Tuple[torch.Tensor, torch.Tensor]: + grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size) + grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size) + + p = self.transformer.config.patch_size + p_t = self.transformer.config.patch_size_t + + base_size_width = self.transformer.config.sample_width // p + base_size_height = self.transformer.config.sample_height // p + + if p_t is None: + # CogVideoX 1.0 + grid_crops_coords = get_resize_crop_region_for_grid( + (grid_height, grid_width), base_size_width, base_size_height + ) + freqs_cos, freqs_sin = get_3d_rotary_pos_embed( + embed_dim=self.transformer.config.attention_head_dim, + crops_coords=grid_crops_coords, + grid_size=(grid_height, grid_width), + temporal_size=num_frames, + ) + else: + # CogVideoX 1.5 + base_num_frames = (num_frames + p_t - 1) // p_t + + freqs_cos, freqs_sin = get_3d_rotary_pos_embed( + embed_dim=self.transformer.config.attention_head_dim, + crops_coords=None, + grid_size=(grid_height, grid_width), + temporal_size=base_num_frames, + grid_type="slice", + max_size=(base_size_height, base_size_width), + ) + + freqs_cos = freqs_cos.to(device=device) + freqs_sin = freqs_sin.to(device=device) + return freqs_cos, freqs_sin + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def attention_kwargs(self): + return self._attention_kwargs + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + height: int = 480, + width: int = 720, + num_frames: int = 49, + num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 6, + use_dynamic_cfg: bool = False, + num_videos_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: str = "numpy", + return_dict: bool 
= False, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 226, + ) -> Union[CogVideoXFunPipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. This is set to 1024 by default for the best results. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. This is set to 1024 by default for the best results. + num_frames (`int`, defaults to `48`): + Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will + contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where + num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that + needs to be satisfied is that of divisibility mentioned above. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. 
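The `num_frames` constraint described above follows from the temporal VAE compression and, for CogVideoX 1.5, from `patch_size_t`; the padding `__call__` applies before preparing latents is easier to follow with numbers. A sketch of that arithmetic with hypothetical values (`vae_scale_factor_temporal=4`, `patch_size_t=2`):

```python
# Hypothetical values matching CogVideoX 1.5: 4x temporal compression, patch_size_t = 2.
vae_scale_factor_temporal = 4
patch_size_t = 2

num_frames = 49
latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1      # 13 latent frames

# Pad so the latent frame count becomes divisible by patch_size_t.
if num_frames != 1 and latent_frames % patch_size_t != 0:
    additional = patch_size_t - latent_frames % patch_size_t           # 1
    num_frames += additional * vae_scale_factor_temporal               # 49 -> 53

print(num_frames, (num_frames - 1) // vae_scale_factor_temporal + 1)   # 53 14
```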
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int`, defaults to `226`): + Maximum sequence length in encoded prompt. Must be consistent with + `self.transformer.config.max_text_seq_length` otherwise may lead to poor results. + + Examples: + + Returns: + [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXFunPipelineOutput`] or `tuple`: + [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXFunPipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial + width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial + num_frames = num_frames or self.transformer.config.sample_frames + + num_videos_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + negative_prompt, + callback_on_step_end_tensor_inputs, + prompt_embeds, + negative_prompt_embeds, + ) + self._guidance_scale = guidance_scale + self._attention_kwargs = attention_kwargs + self._interrupt = False + + # 2. Default call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. 
Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + negative_prompt, + do_classifier_free_guidance, + num_videos_per_prompt=num_videos_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + max_sequence_length=max_sequence_length, + device=device, + ) + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + self._num_timesteps = len(timesteps) + + # 5. Prepare latents + latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1 + + # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t + patch_size_t = self.transformer.config.patch_size_t + additional_frames = 0 + if num_frames != 1 and patch_size_t is not None and latent_frames % patch_size_t != 0: + additional_frames = patch_size_t - latent_frames % patch_size_t + num_frames += additional_frames * self.vae_scale_factor_temporal + + latent_channels = self.transformer.config.in_channels + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + latent_channels, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Create rotary embeds if required + image_rotary_emb = ( + self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device) + if self.transformer.config.use_rotary_positional_embeddings + else None + ) + + # 8. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + # for DPM-solver++ + old_pred_original_sample = None + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latent_model_input.shape[0]) + + # predict noise model_output + noise_pred = self.transformer( + hidden_states=latent_model_input, + encoder_hidden_states=prompt_embeds, + timestep=timestep, + image_rotary_emb=image_rotary_emb, + return_dict=False, + )[0] + noise_pred = noise_pred.float() + + # perform guidance + if use_dynamic_cfg: + self._guidance_scale = 1 + guidance_scale * ( + (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2 + ) + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + if not isinstance(self.scheduler, CogVideoXDPMScheduler): + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + else: + latents, old_pred_original_sample = self.scheduler.step( + noise_pred, + old_pred_original_sample, + t, + timesteps[i - 1] if i > 0 else None, + latents, + **extra_step_kwargs, + return_dict=False, + ) + latents = latents.to(prompt_embeds.dtype) + + # call the callback, if provided + if callback_on_step_end is not None: + 
callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if output_type == "numpy": + video = self.decode_latents(latents) + elif not output_type == "latent": + video = self.decode_latents(latents) + video = self.video_processor.postprocess_video(video=video, output_type=output_type) + else: + video = latents + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + video = torch.from_numpy(video) + + return CogVideoXFunPipelineOutput(videos=video) diff --git a/videox_fun/pipeline/pipeline_cogvideox_fun_inpaint.py b/videox_fun/pipeline/pipeline_cogvideox_fun_inpaint.py new file mode 100644 index 0000000000000000000000000000000000000000..0dcc58506d5ea366829dfd1329abe7dbcb9dbea7 --- /dev/null +++ b/videox_fun/pipeline/pipeline_cogvideox_fun_inpaint.py @@ -0,0 +1,1244 @@ +# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import math +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import VaeImageProcessor +from diffusers.models.embeddings import get_1d_rotary_pos_embed +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler +from diffusers.utils import BaseOutput, logging, replace_example_docstring +from diffusers.utils.torch_utils import randn_tensor +from diffusers.video_processor import VideoProcessor +from einops import rearrange + +from ..models import (AutoencoderKLCogVideoX, + CogVideoXTransformer3DModel, T5EncoderModel, + T5Tokenizer) + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```python + pass + ``` +""" + +# Copied from diffusers.models.embeddings.get_3d_rotary_pos_embed +def get_3d_rotary_pos_embed( + embed_dim, + crops_coords, + grid_size, + temporal_size, + theta: int = 10000, + use_real: bool = True, + grid_type: str = "linspace", + max_size: Optional[Tuple[int, int]] = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + RoPE for video tokens with 3D structure. + + Args: + embed_dim: (`int`): + The embedding dimension size, corresponding to hidden_size_head. + crops_coords (`Tuple[int]`): + The top-left and bottom-right coordinates of the crop. 
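The guidance step in the denoising loop above runs a single transformer pass on the stacked `[uncond, cond]` batch and then extrapolates between the two halves. A standalone sketch of that arithmetic with stand-in tensors (the real latents have shape `[batch, frames, channels, height, width]`; the sizes below are illustrative):

```python
import torch

guidance_scale = 6.0
noise_pred = torch.randn(2, 13, 16, 60, 90)             # stand-in for the transformer output on [uncond, cond]
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(guided.shape)                                      # torch.Size([1, 13, 16, 60, 90])
```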
+ grid_size (`Tuple[int]`): + The grid size of the spatial positional embedding (height, width). + temporal_size (`int`): + The size of the temporal dimension. + theta (`float`): + Scaling factor for frequency computation. + grid_type (`str`): + Whether to use "linspace" or "slice" to compute grids. + + Returns: + `torch.Tensor`: positional embedding with shape `(temporal_size * grid_size[0] * grid_size[1], embed_dim/2)`. + """ + if use_real is not True: + raise ValueError(" `use_real = False` is not currently supported for get_3d_rotary_pos_embed") + + if grid_type == "linspace": + start, stop = crops_coords + grid_size_h, grid_size_w = grid_size + grid_h = np.linspace(start[0], stop[0], grid_size_h, endpoint=False, dtype=np.float32) + grid_w = np.linspace(start[1], stop[1], grid_size_w, endpoint=False, dtype=np.float32) + grid_t = np.arange(temporal_size, dtype=np.float32) + grid_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32) + elif grid_type == "slice": + max_h, max_w = max_size + grid_size_h, grid_size_w = grid_size + grid_h = np.arange(max_h, dtype=np.float32) + grid_w = np.arange(max_w, dtype=np.float32) + grid_t = np.arange(temporal_size, dtype=np.float32) + else: + raise ValueError("Invalid value passed for `grid_type`.") + + # Compute dimensions for each axis + dim_t = embed_dim // 4 + dim_h = embed_dim // 8 * 3 + dim_w = embed_dim // 8 * 3 + + # Temporal frequencies + freqs_t = get_1d_rotary_pos_embed(dim_t, grid_t, use_real=True) + # Spatial frequencies for height and width + freqs_h = get_1d_rotary_pos_embed(dim_h, grid_h, use_real=True) + freqs_w = get_1d_rotary_pos_embed(dim_w, grid_w, use_real=True) + + # BroadCast and concatenate temporal and spaial frequencie (height and width) into a 3d tensor + def combine_time_height_width(freqs_t, freqs_h, freqs_w): + freqs_t = freqs_t[:, None, None, :].expand( + -1, grid_size_h, grid_size_w, -1 + ) # temporal_size, grid_size_h, grid_size_w, dim_t + freqs_h = freqs_h[None, :, None, :].expand( + temporal_size, -1, grid_size_w, -1 + ) # temporal_size, grid_size_h, grid_size_2, dim_h + freqs_w = freqs_w[None, None, :, :].expand( + temporal_size, grid_size_h, -1, -1 + ) # temporal_size, grid_size_h, grid_size_2, dim_w + + freqs = torch.cat( + [freqs_t, freqs_h, freqs_w], dim=-1 + ) # temporal_size, grid_size_h, grid_size_w, (dim_t + dim_h + dim_w) + freqs = freqs.view( + temporal_size * grid_size_h * grid_size_w, -1 + ) # (temporal_size * grid_size_h * grid_size_w), (dim_t + dim_h + dim_w) + return freqs + + t_cos, t_sin = freqs_t # both t_cos and t_sin has shape: temporal_size, dim_t + h_cos, h_sin = freqs_h # both h_cos and h_sin has shape: grid_size_h, dim_h + w_cos, w_sin = freqs_w # both w_cos and w_sin has shape: grid_size_w, dim_w + + if grid_type == "slice": + t_cos, t_sin = t_cos[:temporal_size], t_sin[:temporal_size] + h_cos, h_sin = h_cos[:grid_size_h], h_sin[:grid_size_h] + w_cos, w_sin = w_cos[:grid_size_w], w_sin[:grid_size_w] + + cos = combine_time_height_width(t_cos, h_cos, w_cos) + sin = combine_time_height_width(t_sin, h_sin, w_sin) + return cos, sin + + +# Similar to diffusers.pipelines.hunyuandit.pipeline_hunyuandit.get_resize_crop_region_for_grid +def get_resize_crop_region_for_grid(src, tgt_width, tgt_height): + tw = tgt_width + th = tgt_height + h, w = src + r = h / w + if r > (th / tw): + resize_height = th + resize_width = int(round(th / h * w)) + else: + resize_width = tw + resize_height = int(round(tw / w * h)) + + crop_top = int(round((th - resize_height) / 2.0)) + crop_left = 
int(round((tw - resize_width) / 2.0)) + + return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width) + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +def resize_mask(mask, latent, process_first_frame_only=True): + latent_size = latent.size() + batch_size, channels, num_frames, height, width = mask.shape + + if process_first_frame_only: + target_size = list(latent_size[2:]) + target_size[0] = 1 + first_frame_resized = F.interpolate( + mask[:, :, 0:1, :, :], + size=target_size, + mode='trilinear', + align_corners=False + ) + + target_size = list(latent_size[2:]) + target_size[0] = target_size[0] - 1 + if target_size[0] != 0: + remaining_frames_resized = F.interpolate( + mask[:, :, 1:, :, :], + size=target_size, + mode='trilinear', + align_corners=False + ) + resized_mask = torch.cat([first_frame_resized, remaining_frames_resized], dim=2) + else: + resized_mask = first_frame_resized + else: + target_size = list(latent_size[2:]) + resized_mask = F.interpolate( + mask, + size=target_size, + mode='trilinear', + align_corners=False + ) + return resized_mask + + +def add_noise_to_reference_video(image, ratio=None): + if ratio is None: + sigma = torch.normal(mean=-3.0, std=0.5, size=(image.shape[0],)).to(image.device) + sigma = torch.exp(sigma).to(image.dtype) + else: + sigma = torch.ones((image.shape[0],)).to(image.device, image.dtype) * ratio + + image_noise = torch.randn_like(image) * sigma[:, None, None, None, None] + image_noise = torch.where(image==-1, torch.zeros_like(image), image_noise) + image = image + image_noise + return image + + +@dataclass +class CogVideoXFunPipelineOutput(BaseOutput): + r""" + Output class for CogVideo pipelines. + + Args: + video (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing + denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape + `(batch_size, num_frames, channels, height, width)`. + """ + + videos: torch.Tensor + + +class CogVideoXFunInpaintPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-video generation using CogVideoX. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations. + text_encoder ([`T5EncoderModel`]): + Frozen text-encoder. CogVideoX_Fun uses + [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the + [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant. + tokenizer (`T5Tokenizer`): + Tokenizer of class + [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). + transformer ([`CogVideoXTransformer3DModel`]): + A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `transformer` to denoise the encoded video latents. 
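`resize_mask` above has to respect the VAE's temporal layout: the first pixel frame maps to its own latent frame, and each later latent frame covers `vae_scale_factor_temporal` pixel frames, so the mask is interpolated in two pieces. A hedged usage sketch, assuming this module imports as in the patch and a 49-frame, 480×720 input with 8× spatial / 4× temporal compression:

```python
import torch
from videox_fun.pipeline.pipeline_cogvideox_fun_inpaint import resize_mask  # helper defined above

mask = torch.ones(1, 1, 49, 480, 720)      # pixel-space mask: 49 frames
latent = torch.zeros(1, 16, 13, 60, 90)    # latents: (49 - 1) / 4 + 1 = 13 frames, 8x spatial downsample

resized = resize_mask(mask, latent, process_first_frame_only=True)
print(resized.shape)                        # torch.Size([1, 1, 13, 60, 90])
```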
+ """ + + _optional_components = [] + model_cpu_offload_seq = "text_encoder->transformer->vae" + + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + ] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + vae: AutoencoderKLCogVideoX, + transformer: CogVideoXTransformer3DModel, + scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler], + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler + ) + self.vae_scale_factor_spatial = ( + 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8 + ) + self.vae_scale_factor_temporal = ( + self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4 + ) + + self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial) + + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=False, do_convert_grayscale=True + ) + + def _get_t5_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + num_videos_per_prompt: int = 1, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + device = device or self._execution_device + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because `max_sequence_length` is set to " + f" {max_sequence_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder(text_input_ids.to(device))[0] + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) + + return prompt_embeds + + def encode_prompt( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + do_classifier_free_guidance: bool = True, + num_videos_per_prompt: int = 1, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + Whether to use classifier free guidance or not. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos that should be generated per prompt. torch device to place the resulting embeddings on + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + device: (`torch.device`, *optional*): + torch device + dtype: (`torch.dtype`, *optional*): + torch dtype + """ + device = device or self._execution_device + + prompt = [prompt] if isinstance(prompt, str) else prompt + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_embeds = self._get_t5_prompt_embeds( + prompt=prompt, + num_videos_per_prompt=num_videos_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + if do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + + negative_prompt_embeds = self._get_t5_prompt_embeds( + prompt=negative_prompt, + num_videos_per_prompt=num_videos_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + return prompt_embeds, negative_prompt_embeds + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + video_length, + dtype, + device, + generator, + latents=None, + video=None, + timestep=None, + is_strength_max=True, + return_noise=False, + return_video_latents=False, + ): + shape = ( + batch_size, + (video_length - 1) // self.vae_scale_factor_temporal + 1, + num_channels_latents, + height // self.vae_scale_factor_spatial, + width // self.vae_scale_factor_spatial, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if return_video_latents or (latents is None and not is_strength_max): + video = video.to(device=device, dtype=self.vae.dtype) + + bs = 1 + new_video = [] + for i in range(0, video.shape[0], bs): + video_bs = video[i : i + bs] + video_bs = self.vae.encode(video_bs)[0] + video_bs = video_bs.sample() + new_video.append(video_bs) + video = torch.cat(new_video, dim = 0) + video = video * self.vae.config.scaling_factor + + video_latents = video.repeat(batch_size // video.shape[0], 1, 1, 1, 1) + video_latents = video_latents.to(device=device, dtype=dtype) + video_latents = rearrange(video_latents, "b c f h w -> b f c h w") + + if latents is None: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # if strength is 1. then initialise the latents to noise, else initial to image + noise + latents = noise if is_strength_max else self.scheduler.add_noise(video_latents, noise, timestep) + # if pure noise then scale the initial latents by the Scheduler's init sigma + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents + else: + noise = latents.to(device) + latents = noise * self.scheduler.init_noise_sigma + + # scale the initial noise by the standard deviation required by the scheduler + outputs = (latents,) + + if return_noise: + outputs += (noise,) + + if return_video_latents: + outputs += (video_latents,) + + return outputs + + def prepare_mask_latents( + self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance, noise_aug_strength + ): + # resize the mask to latents shape as we concatenate the mask to the latents + # we do that before converting to dtype to avoid breaking in case we're using cpu_offload + # and half precision + + if mask is not None: + mask = mask.to(device=device, dtype=self.vae.dtype) + bs = 1 + new_mask = [] + for i in range(0, mask.shape[0], bs): + mask_bs = mask[i : i + bs] + mask_bs = self.vae.encode(mask_bs)[0] + mask_bs = mask_bs.mode() + new_mask.append(mask_bs) + mask = torch.cat(new_mask, dim = 0) + mask = mask * self.vae.config.scaling_factor + + if masked_image is not None: + if self.transformer.config.add_noise_in_inpaint_model: + masked_image = add_noise_to_reference_video(masked_image, ratio=noise_aug_strength) + masked_image = masked_image.to(device=device, dtype=self.vae.dtype) + bs = 1 + new_mask_pixel_values = [] + for i in range(0, masked_image.shape[0], bs): + mask_pixel_values_bs = masked_image[i : i + bs] + mask_pixel_values_bs = self.vae.encode(mask_pixel_values_bs)[0] + mask_pixel_values_bs = mask_pixel_values_bs.mode() + new_mask_pixel_values.append(mask_pixel_values_bs) + masked_image_latents = torch.cat(new_mask_pixel_values, dim = 0) + masked_image_latents = masked_image_latents * self.vae.config.scaling_factor + else: + masked_image_latents = None + + return mask, masked_image_latents + + def decode_latents(self, latents: torch.Tensor) -> torch.Tensor: + latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] + latents = 1 / self.vae.config.scaling_factor * latents + + frames = self.vae.decode(latents).sample + frames = (frames / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 + frames = frames.cpu().float().numpy() + return frames + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # 
prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + negative_prompt, + callback_on_step_end_tensor_inputs, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def fuse_qkv_projections(self) -> None: + r"""Enables fused QKV projections.""" + self.fusing_transformer = True + self.transformer.fuse_qkv_projections() + + def unfuse_qkv_projections(self) -> None: + r"""Disable QKV projection fusion if enabled.""" + if not self.fusing_transformer: + logger.warning("The Transformer was not initially fused for QKV projections. 
Doing nothing.") + else: + self.transformer.unfuse_qkv_projections() + self.fusing_transformer = False + + def _prepare_rotary_positional_embeddings( + self, + height: int, + width: int, + num_frames: int, + device: torch.device, + ) -> Tuple[torch.Tensor, torch.Tensor]: + grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size) + grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size) + + p = self.transformer.config.patch_size + p_t = self.transformer.config.patch_size_t + + base_size_width = self.transformer.config.sample_width // p + base_size_height = self.transformer.config.sample_height // p + + if p_t is None: + # CogVideoX 1.0 + grid_crops_coords = get_resize_crop_region_for_grid( + (grid_height, grid_width), base_size_width, base_size_height + ) + freqs_cos, freqs_sin = get_3d_rotary_pos_embed( + embed_dim=self.transformer.config.attention_head_dim, + crops_coords=grid_crops_coords, + grid_size=(grid_height, grid_width), + temporal_size=num_frames, + ) + else: + # CogVideoX 1.5 + base_num_frames = (num_frames + p_t - 1) // p_t + + freqs_cos, freqs_sin = get_3d_rotary_pos_embed( + embed_dim=self.transformer.config.attention_head_dim, + crops_coords=None, + grid_size=(grid_height, grid_width), + temporal_size=base_num_frames, + grid_type="slice", + max_size=(base_size_height, base_size_width), + ) + + freqs_cos = freqs_cos.to(device=device) + freqs_sin = freqs_sin.to(device=device) + return freqs_cos, freqs_sin + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def attention_kwargs(self): + return self._attention_kwargs + + @property + def interrupt(self): + return self._interrupt + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + return timesteps, num_inference_steps - t_start + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + height: int = 480, + width: int = 720, + video: Union[torch.FloatTensor] = None, + mask_video: Union[torch.FloatTensor] = None, + masked_video_latents: Union[torch.FloatTensor] = None, + num_frames: int = 49, + num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 6, + use_dynamic_cfg: bool = False, + num_videos_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: str = "numpy", + return_dict: bool = False, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 226, + strength: float = 1, + noise_aug_strength: float = 0.0563, + 
comfyui_progressbar: bool = False, + temporal_multidiffusion_stride: int = 16, + use_trimask: bool = False, + zero_out_mask_region: bool = False, + binarize_mask: bool = False, + skip_unet: bool = False, + use_vae_mask: bool = False, + stack_mask: bool = False, + ) -> Union[CogVideoXFunPipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. This is set to 1024 by default for the best results. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. This is set to 1024 by default for the best results. + num_frames (`int`, defaults to `48`): + Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will + contain 1 extra frame because CogVideoX_Fun is conditioned with (num_seconds * fps + 1) frames where + num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that + needs to be satisfied is that of divisibility mentioned above. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. 
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int`, defaults to `226`): + Maximum sequence length in encoded prompt. Must be consistent with + `self.transformer.config.max_text_seq_length` otherwise may lead to poor results. + + Examples: + + Returns: + [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXFunPipelineOutput`] or `tuple`: + [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXFunPipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial + width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial + num_frames = num_frames or self.transformer.config.sample_frames + + num_videos_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + negative_prompt, + callback_on_step_end_tensor_inputs, + prompt_embeds, + negative_prompt_embeds, + ) + self._guidance_scale = guidance_scale + self._attention_kwargs = attention_kwargs + self._interrupt = False + + # 2. Default call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + logger.info(f'Use cfg: {do_classifier_free_guidance}, guidance_scale={guidance_scale}') + + # 3. 
Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + negative_prompt, + do_classifier_free_guidance, + num_videos_per_prompt=num_videos_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + max_sequence_length=max_sequence_length, + device=device, + ) + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + + # 4. set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps=num_inference_steps, strength=strength, device=device + ) + self._num_timesteps = len(timesteps) + if comfyui_progressbar: + from comfy.utils import ProgressBar + pbar = ProgressBar(num_inference_steps + 2) + # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) + latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt) + # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise + is_strength_max = strength == 1.0 + + # 5. Prepare latents. + if video is not None: + video_length = video.shape[2] + init_video = self.image_processor.preprocess(rearrange(video, "b c f h w -> (b f) c h w"), height=height, width=width) + init_video = init_video.to(dtype=torch.float32) + init_video = rearrange(init_video, "(b f) c h w -> b c f h w", f=video_length) + else: + video_length = num_frames + init_video = None + + # Magvae needs the number of frames to be 4n + 1. + local_latent_length = (num_frames - 1) // self.vae_scale_factor_temporal + 1 + # For CogVideoX 1.5, the latent frames should be clipped to make it divisible by patch_size_t + patch_size_t = self.transformer.config.patch_size_t + additional_frames = 0 + if patch_size_t is not None and local_latent_length % patch_size_t != 0: + additional_frames = local_latent_length % patch_size_t + num_frames -= additional_frames * self.vae_scale_factor_temporal + if num_frames <= 0: + num_frames = 1 + + num_channels_latents = self.vae.config.latent_channels + num_channels_transformer = self.transformer.config.in_channels + return_image_latents = num_channels_transformer == num_channels_latents + + latents_outputs = self.prepare_latents( + batch_size * num_videos_per_prompt, + num_channels_latents, + height, + width, + video_length, + prompt_embeds.dtype, + device, + generator, + latents, + video=init_video, + timestep=latent_timestep, + is_strength_max=is_strength_max, + return_noise=True, + return_video_latents=return_image_latents, + ) + if return_image_latents: + latents, noise, image_latents = latents_outputs + else: + latents, noise = latents_outputs + if comfyui_progressbar: + pbar.update(1) + + if mask_video is not None: + if (mask_video == 255).all(): + mask_latents = torch.zeros_like(latents)[:, :, :1].to(latents.device, latents.dtype) + masked_video_latents = torch.zeros_like(latents).to(latents.device, latents.dtype) + + mask_input = torch.cat([mask_latents] * 2) if do_classifier_free_guidance else mask_latents + masked_video_latents_input = ( + torch.cat([masked_video_latents] * 2) if do_classifier_free_guidance else masked_video_latents + ) + inpaint_latents = torch.cat([mask_input, masked_video_latents_input], dim=2).to(latents.dtype) + else: + # Prepare mask latent variables + video_length = video.shape[2] + mask_condition = self.mask_processor.preprocess(rearrange(mask_video, "b c f h w -> (b f) c h w"), height=height, width=width) + if use_trimask: + 
mask_condition = torch.where(mask_condition > 0.75, 1., mask_condition) + mask_condition = torch.where((mask_condition <= 0.75) * (mask_condition >= 0.25), 127. / 255., mask_condition) + mask_condition = torch.where(mask_condition < 0.25, 0., mask_condition) + else: + mask_condition = torch.where(mask_condition > 0.5, 1., 0.) + + mask_condition = mask_condition.to(dtype=torch.float32) + mask_condition = rearrange(mask_condition, "(b f) c h w -> b c f h w", f=video_length) + + if num_channels_transformer != num_channels_latents: + mask_condition_tile = torch.tile(mask_condition, [1, 3, 1, 1, 1]) + if masked_video_latents is None: + if zero_out_mask_region: + masked_video = init_video * (mask_condition_tile < 0.75) + torch.ones_like(init_video) * (mask_condition_tile > 0.75) * -1 + else: + masked_video = init_video + else: + masked_video = masked_video_latents + + mask_encoded, masked_video_latents = self.prepare_mask_latents( + 1 - mask_condition_tile if use_vae_mask else None, + masked_video, + batch_size, + height, + width, + prompt_embeds.dtype, + device, + generator, + do_classifier_free_guidance, + noise_aug_strength=noise_aug_strength, + ) + if not use_vae_mask and not stack_mask: + mask_latents = resize_mask(1 - mask_condition, masked_video_latents) + if binarize_mask: + if use_trimask: + mask_latents = torch.where(mask_latents > 0.75, 1., mask_latents) + mask_latents = torch.where((mask_latents <= 0.75) * (mask_latents >= 0.25), 0.5, mask_latents) + mask_latents = torch.where(mask_latents < 0.25, 0., mask_latents) + else: + mask_latents = torch.where(mask_latents < 0.9, 0., 1.).to(mask_latents.dtype) + + mask_latents = mask_latents.to(masked_video_latents.device) * self.vae.config.scaling_factor + + mask = torch.tile(mask_condition, [1, num_channels_latents, 1, 1, 1]) + mask = F.interpolate(mask, size=latents.size()[-3:], mode='trilinear', align_corners=True).to(latents.device, latents.dtype) + + mask_input = torch.cat([mask_latents] * 2) if do_classifier_free_guidance else mask_latents + mask = rearrange(mask, "b c f h w -> b f c h w") + elif stack_mask: + mask_latents = torch.cat([ + torch.repeat_interleave(mask_condition[:, :, 0:1], repeats=4, dim=2), + mask_condition[:, :, 1:], + ], dim=2) + mask_latents = mask_latents.view( + mask_latents.shape[0], + mask_latents.shape[2] // 4, + 4, + mask_latents.shape[3], + mask_latents.shape[4], + ) + mask_latents = mask_latents.transpose(1, 2) + mask_latents = resize_mask(1 - mask_latents, masked_video_latents).to(latents.device, latents.dtype) + mask_input = torch.cat([mask_latents] * 2) if do_classifier_free_guidance else mask_latents + else: + mask_input = ( + torch.cat([mask_encoded] * 2) if do_classifier_free_guidance else mask_encoded + ) + + masked_video_latents_input = ( + torch.cat([masked_video_latents] * 2) if do_classifier_free_guidance else masked_video_latents + ) + + mask_input = rearrange(mask_input, "b c f h w -> b f c h w") + masked_video_latents_input = rearrange(masked_video_latents_input, "b c f h w -> b f c h w") + + # concat(binary mask, encode(mask * video)) + inpaint_latents = torch.cat([mask_input, masked_video_latents_input], dim=2).to(latents.dtype) + else: + mask = torch.tile(mask_condition, [1, num_channels_latents, 1, 1, 1]) + mask = F.interpolate(mask, size=latents.size()[-3:], mode='trilinear', align_corners=True).to(latents.device, latents.dtype) + mask = rearrange(mask, "b c f h w -> b f c h w") + + inpaint_latents = None + else: + if num_channels_transformer != num_channels_latents: + mask = 
torch.zeros_like(latents).to(latents.device, latents.dtype) + masked_video_latents = torch.zeros_like(latents).to(latents.device, latents.dtype) + + mask_input = torch.cat([mask] * 2) if do_classifier_free_guidance else mask + masked_video_latents_input = ( + torch.cat([masked_video_latents] * 2) if do_classifier_free_guidance else masked_video_latents + ) + inpaint_latents = torch.cat([mask_input, masked_video_latents_input], dim=1).to(latents.dtype) + else: + mask = torch.zeros_like(init_video[:, :1]) + mask = torch.tile(mask, [1, num_channels_latents, 1, 1, 1]) + mask = F.interpolate(mask, size=latents.size()[-3:], mode='trilinear', align_corners=True).to(latents.device, latents.dtype) + mask = rearrange(mask, "b c f h w -> b f c h w") + + inpaint_latents = None + if comfyui_progressbar: + pbar.update(1) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + logger.debug(f'Pipeline mask {mask_condition.shape} {mask_condition.dtype} {mask_condition.min()} {mask_condition.max()}') + # 8. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + latent_temporal_window_size = (num_frames - 1) // 4 + 1 + if latents.size(1) > latent_temporal_window_size: + logger.info(f'Adopt temporal multidiffusion for the latents {latents.shape} {latents.dtype}') + + # VAE experiment + if skip_unet: + masked_video_latents = rearrange(masked_video_latents, "b c f h w -> b f c h w") + if output_type == "numpy": + video = self.decode_latents(masked_video_latents) + elif not output_type == "latent": + video = self.decode_latents(masked_video_latents) + video = self.video_processor.postprocess_video(video=video, output_type=output_type) + else: + video = masked_video_latents + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + video = torch.from_numpy(video) + + return CogVideoXFunPipelineOutput(videos=video) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + # for DPM-solver++ + old_pred_original_sample = None + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + def _sample(_latents, _inpaint_latents): + # 7. 
Create rotary embeds if required + image_rotary_emb = ( + self._prepare_rotary_positional_embeddings(height, width, _latents.size(1), device) + if self.transformer.config.use_rotary_positional_embeddings + else None + ) + + latent_model_input = torch.cat([_latents] * 2) if do_classifier_free_guidance else _latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latent_model_input.shape[0]) + + # predict noise model_output + noise_pred = self.transformer( + hidden_states=latent_model_input, + encoder_hidden_states=prompt_embeds, + timestep=timestep, + image_rotary_emb=image_rotary_emb, + return_dict=False, + inpaint_latents=_inpaint_latents, + )[0] + noise_pred = noise_pred.float() + + # perform guidance + if use_dynamic_cfg: + self._guidance_scale = 1 + guidance_scale * ( + (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2 + ) + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + if not isinstance(self.scheduler, CogVideoXDPMScheduler): + _latents = self.scheduler.step(noise_pred, t, _latents, **extra_step_kwargs, return_dict=False)[0] + else: + _latents, old_pred_original_sample = self.scheduler.step( + noise_pred, + old_pred_original_sample, + t, + timesteps[i - 1] if i > 0 else None, + _latents, + **extra_step_kwargs, + return_dict=False, + ) + _latents = _latents.to(prompt_embeds.dtype) + return _latents + + if latents.size(1) <= latent_temporal_window_size: + latents = _sample(latents, inpaint_latents) + else: + # adopt temporal multidiffusion + latents_canvas = torch.zeros_like(latents).float() + weights_canvas = torch.zeros(1, latents.size(1), 1, 1, 1).to(latents.device).float() + temporal_stride = temporal_multidiffusion_stride // 4 + assert latent_temporal_window_size > temporal_stride + + time_beg = 0 + while time_beg < latents.size(1): + time_end = min(time_beg + latent_temporal_window_size, latents.size(1)) + + latents_i = latents[:, time_beg:time_end] + if inpaint_latents is not None: + inpaint_latents_i = inpaint_latents[:, time_beg:time_end] + else: + inpaint_latents_i = None + + latents_i = _sample(latents_i, inpaint_latents_i) + + weights_i = torch.ones(1, time_end - time_beg, 1, 1, 1).to(latents.device).to(latents.dtype) + if time_beg > 0 and temporal_stride > 0: + weights_i[:, :temporal_stride] = (torch.linspace(0., 1., temporal_stride + 2)[1:-1] + .to(latents.device) + .to(latents.dtype) + .reshape(1, temporal_stride, 1, 1, 1)) + if time_end < latents.size(1) and temporal_stride > 0: + weights_i[:, -temporal_stride:] = (torch.linspace(1., 0., temporal_stride + 2)[1:-1] + .to(latents.device) + .to(latents.dtype) + .reshape(1, temporal_stride, 1, 1, 1)) + + latents_canvas[:, time_beg:time_end] += latents_i * weights_i + weights_canvas[:, time_beg:time_end] += weights_i + + time_beg = time_end - temporal_stride + if time_end >= latents.size(1): + break + latents = (latents_canvas / weights_canvas).to(latents.dtype) + + # call the callback, if provided + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", 
latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if comfyui_progressbar: + pbar.update(1) + + if output_type == "numpy": + video = self.decode_latents(latents) + elif not output_type == "latent": + video = self.decode_latents(latents) + video = self.video_processor.postprocess_video(video=video, output_type=output_type) + else: + video = latents + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + video = torch.from_numpy(video) + + return CogVideoXFunPipelineOutput(videos=video) diff --git a/videox_fun/pipeline/pipeline_wan_fun.py b/videox_fun/pipeline/pipeline_wan_fun.py new file mode 100644 index 0000000000000000000000000000000000000000..4a6317dd4e1145c0a369f3ab03afa20c37db3a5d --- /dev/null +++ b/videox_fun/pipeline/pipeline_wan_fun.py @@ -0,0 +1,558 @@ +import inspect +import math +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +from diffusers import FlowMatchEulerDiscreteScheduler +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.utils import BaseOutput, logging, replace_example_docstring +from diffusers.utils.torch_utils import randn_tensor +from diffusers.video_processor import VideoProcessor + +from ..models import (AutoencoderKLWan, AutoTokenizer, + WanT5EncoderModel, WanTransformer3DModel) + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```python + pass + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. 
Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +@dataclass +class WanPipelineOutput(BaseOutput): + r""" + Output class for CogVideo pipelines. + + Args: + video (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing + denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape + `(batch_size, num_frames, channels, height, width)`. + """ + + videos: torch.Tensor + + +class WanFunPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-video generation using Wan. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
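+
+    Example (a minimal text-to-video sketch; it assumes a tokenizer, `WanT5EncoderModel`,
+    `AutoencoderKLWan`, `WanTransformer3DModel` and a `FlowMatchEulerDiscreteScheduler` have
+    already been loaded from a Wan-Fun checkpoint; the prompt and resolution are illustrative):
+
+    ```python
+    pipe = WanFunPipeline(
+        tokenizer=tokenizer, text_encoder=text_encoder, vae=vae,
+        transformer=transformer, scheduler=scheduler,
+    )
+    out = pipe(
+        prompt="A small red ball rolls across a wooden table.",
+        height=480, width=720, num_frames=49,
+        num_inference_steps=50, guidance_scale=6.0,
+    )
+    frames = out.videos   # tensor in [0, 1], shape (B, C, T, H, W)
+    ```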
+ """ + + _optional_components = [] + model_cpu_offload_seq = "text_encoder->transformer->vae" + + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + ] + + def __init__( + self, + tokenizer: AutoTokenizer, + text_encoder: WanT5EncoderModel, + vae: AutoencoderKLWan, + transformer: WanTransformer3DModel, + scheduler: FlowMatchEulerDiscreteScheduler, + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler + ) + self.video_processor = VideoProcessor(vae_scale_factor=self.vae.spacial_compression_ratio) + + def _get_t5_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + num_videos_per_prompt: int = 1, + max_sequence_length: int = 512, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + device = device or self._execution_device + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + prompt_attention_mask = text_inputs.attention_mask + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because `max_sequence_length` is set to " + f" {max_sequence_length} tokens: {removed_text}" + ) + + seq_lens = prompt_attention_mask.gt(0).sum(dim=1).long() + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask.to(device))[0] + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) + + return [u[:v] for u, v in zip(prompt_embeds, seq_lens)] + + def encode_prompt( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + do_classifier_free_guidance: bool = True, + num_videos_per_prompt: int = 1, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + max_sequence_length: int = 512, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + Whether to use classifier free guidance or not. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos that should be generated per prompt. 
torch device to place the resulting embeddings on + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + device: (`torch.device`, *optional*): + torch device + dtype: (`torch.dtype`, *optional*): + torch dtype + """ + device = device or self._execution_device + + prompt = [prompt] if isinstance(prompt, str) else prompt + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_embeds = self._get_t5_prompt_embeds( + prompt=prompt, + num_videos_per_prompt=num_videos_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + if do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + + negative_prompt_embeds = self._get_t5_prompt_embeds( + prompt=negative_prompt, + num_videos_per_prompt=num_videos_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + return prompt_embeds, negative_prompt_embeds + + def prepare_latents( + self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None + ): + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + shape = ( + batch_size, + num_channels_latents, + (num_frames - 1) // self.vae.temporal_compression_ratio + 1, + height // self.vae.spacial_compression_ratio, + width // self.vae.spacial_compression_ratio, + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + if hasattr(self.scheduler, "init_noise_sigma"): + latents = latents * self.scheduler.init_noise_sigma + return latents + + def decode_latents(self, latents: torch.Tensor) -> torch.Tensor: + frames = self.vae.decode(latents.to(self.vae.dtype)).sample + frames = (frames / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16 + frames = frames.cpu().float().numpy() + return frames + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + negative_prompt, + callback_on_step_end_tensor_inputs, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def attention_kwargs(self): + return self._attention_kwargs + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + height: int = 480, + width: int = 720, + num_frames: int = 49, + num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 6, + num_videos_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: str = "numpy", + return_dict: bool = False, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 512, + comfyui_progressbar: bool = False, + ) -> Union[WanPipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. + Args: + + Examples: + + Returns: + + """ + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + num_videos_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + negative_prompt, + callback_on_step_end_tensor_inputs, + prompt_embeds, + negative_prompt_embeds, + ) + self._guidance_scale = guidance_scale + self._attention_kwargs = attention_kwargs + self._interrupt = False + + # 2. Default call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + weight_dtype = self.text_encoder.dtype + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. 
Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + negative_prompt, + do_classifier_free_guidance, + num_videos_per_prompt=num_videos_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + max_sequence_length=max_sequence_length, + device=device, + ) + if do_classifier_free_guidance: + prompt_embeds = negative_prompt_embeds + prompt_embeds + + # 4. Prepare timesteps + if isinstance(self.scheduler, FlowMatchEulerDiscreteScheduler): + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps, mu=1) + else: + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + self._num_timesteps = len(timesteps) + if comfyui_progressbar: + from comfy.utils import ProgressBar + pbar = ProgressBar(num_inference_steps + 1) + + # 5. Prepare latents + latent_channels = self.transformer.config.in_channels + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + latent_channels, + num_frames, + height, + width, + weight_dtype, + device, + generator, + latents, + ) + if comfyui_progressbar: + pbar.update(1) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + target_shape = (self.vae.latent_channels, (num_frames - 1) // self.vae.temporal_compression_ratio + 1, width // self.vae.spacial_compression_ratio, height // self.vae.spacial_compression_ratio) + seq_len = math.ceil((target_shape[2] * target_shape[3]) / (self.transformer.config.patch_size[1] * self.transformer.config.patch_size[2]) * target_shape[1]) + # 7. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + if hasattr(self.scheduler, "scale_model_input"): + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latent_model_input.shape[0]) + + # predict noise model_output + with torch.cuda.amp.autocast(dtype=weight_dtype): + noise_pred = self.transformer( + x=latent_model_input, + context=prompt_embeds, + t=timestep, + seq_len=seq_len, + ) + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if comfyui_progressbar: + pbar.update(1) + + if output_type == "numpy": + video = 
self.decode_latents(latents) + elif not output_type == "latent": + video = self.decode_latents(latents) + video = self.video_processor.postprocess_video(video=video, output_type=output_type) + else: + video = latents + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + video = torch.from_numpy(video) + + return WanPipelineOutput(videos=video) diff --git a/videox_fun/reward/MPS/README.md b/videox_fun/reward/MPS/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d66d2ee73284e44bca36c5ba86dbc5911d64defa --- /dev/null +++ b/videox_fun/reward/MPS/README.md @@ -0,0 +1 @@ +This folder is modified from the official [MPS](https://github.com/Kwai-Kolors/MPS/tree/main) repository. \ No newline at end of file diff --git a/videox_fun/reward/MPS/trainer/models/base_model.py b/videox_fun/reward/MPS/trainer/models/base_model.py new file mode 100644 index 0000000000000000000000000000000000000000..df7907f388d3b2b4658e595db4189a80ab5acd77 --- /dev/null +++ b/videox_fun/reward/MPS/trainer/models/base_model.py @@ -0,0 +1,7 @@ +from dataclasses import dataclass + + + +@dataclass +class BaseModelConfig: + pass \ No newline at end of file diff --git a/videox_fun/reward/MPS/trainer/models/clip_model.py b/videox_fun/reward/MPS/trainer/models/clip_model.py new file mode 100644 index 0000000000000000000000000000000000000000..003bb5d583189980c056376025579e6c06cd16f9 --- /dev/null +++ b/videox_fun/reward/MPS/trainer/models/clip_model.py @@ -0,0 +1,154 @@ +from dataclasses import dataclass +from transformers import CLIPModel as HFCLIPModel +from transformers import AutoTokenizer + +from torch import nn, einsum + +# Modified: import +# from trainer.models.base_model import BaseModelConfig +from .base_model import BaseModelConfig + +from transformers import CLIPConfig +from typing import Any, Optional, Tuple, Union +import torch + +# Modified: import +# from trainer.models.cross_modeling import Cross_model +from .cross_modeling import Cross_model + +import gc + +class XCLIPModel(HFCLIPModel): + def __init__(self, config: CLIPConfig): + super().__init__(config) + + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
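+        # Note: unlike the stock HF `CLIPModel.get_text_features`, this override applies
+        # `text_projection` to every token of the last hidden state (token-level features
+        # later consumed by `Cross_model`) and additionally to the pooled EOS state, so it
+        # returns the pair (text_features, text_features_EOS).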
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # pooled_output = text_outputs[1] + # text_features = self.text_projection(pooled_output) + last_hidden_state = text_outputs[0] + text_features = self.text_projection(last_hidden_state) + + pooled_output = text_outputs[1] + text_features_EOS = self.text_projection(pooled_output) + + + # del last_hidden_state, text_outputs + # gc.collect() + + return text_features, text_features_EOS + + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # pooled_output = vision_outputs[1] # pooled_output + # image_features = self.visual_projection(pooled_output) + last_hidden_state = vision_outputs[0] + image_features = self.visual_projection(last_hidden_state) + + return image_features + + + +@dataclass +class ClipModelConfig(BaseModelConfig): + _target_: str = "trainer.models.clip_model.CLIPModel" + pretrained_model_name_or_path: str ="openai/clip-vit-base-patch32" + + +class CLIPModel(nn.Module): + def __init__(self, config): + super().__init__() + # Modified: We convert the original ckpt (contains the entire model) to a `state_dict`. 
+ # self.model = XCLIPModel.from_pretrained(ckpt) + self.model = XCLIPModel(config) + self.cross_model = Cross_model(dim=1024, layer_num=4, heads=16) + + def get_text_features(self, *args, **kwargs): + return self.model.get_text_features(*args, **kwargs) + + def get_image_features(self, *args, **kwargs): + return self.model.get_image_features(*args, **kwargs) + + def forward(self, text_inputs=None, image_inputs=None, condition_inputs=None): + outputs = () + + text_f, text_EOS = self.model.get_text_features(text_inputs) # B*77*1024 + outputs += text_EOS, + + image_f = self.model.get_image_features(image_inputs.half()) # 2B*257*1024 + # [B, 77, 1024] + condition_f, _ = self.model.get_text_features(condition_inputs) # B*5*1024 + + sim_text_condition = einsum('b i d, b j d -> b j i', text_f, condition_f) + sim_text_condition = torch.max(sim_text_condition, dim=1, keepdim=True)[0] + sim_text_condition = sim_text_condition / sim_text_condition.max() + mask = torch.where(sim_text_condition > 0.01, 0, float('-inf')) # B*1*77 + + # Modified: Support both torch.float16 and torch.bfloat16 + # mask = mask.repeat(1,image_f.shape[1],1) # B*257*77 + model_dtype = next(self.cross_model.parameters()).dtype + mask = mask.repeat(1,image_f.shape[1],1).to(model_dtype) # B*257*77 + # bc = int(image_f.shape[0]/2) + + # Modified: The original input consists of a (batch of) text and two (batches of) images, + # primarily used to compute which (batch of) image is more consistent with the text. + # The modified input consists of a (batch of) text and a (batch of) images. + # sim0 = self.cross_model(image_f[:bc,:,:], text_f,mask.half()) + # sim1 = self.cross_model(image_f[bc:,:,:], text_f,mask.half()) + # outputs += sim0[:,0,:], + # outputs += sim1[:,0,:], + sim = self.cross_model(image_f, text_f,mask) + outputs += sim[:,0,:], + + return outputs + + @property + def logit_scale(self): + return self.model.logit_scale + + def save(self, path): + self.model.save_pretrained(path) diff --git a/videox_fun/reward/MPS/trainer/models/cross_modeling.py b/videox_fun/reward/MPS/trainer/models/cross_modeling.py new file mode 100644 index 0000000000000000000000000000000000000000..31dcee5bb3b6ecebe993f9b8c75b23f32c3ad03e --- /dev/null +++ b/videox_fun/reward/MPS/trainer/models/cross_modeling.py @@ -0,0 +1,291 @@ +import torch +from torch import einsum, nn +import torch.nn.functional as F +from einops import rearrange, repeat + +# helper functions + +def exists(val): + return val is not None + +def default(val, d): + return val if exists(val) else d + +# normalization +# they use layernorm without bias, something that pytorch does not offer + + +class LayerNorm(nn.Module): + def __init__(self, dim): + super().__init__() + self.weight = nn.Parameter(torch.ones(dim)) + self.register_buffer("bias", torch.zeros(dim)) + + def forward(self, x): + return F.layer_norm(x, x.shape[-1:], self.weight, self.bias) + +# residual + + +class Residual(nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x, *args, **kwargs): + return self.fn(x, *args, **kwargs) + x + + +# rotary positional embedding +# https://arxiv.org/abs/2104.09864 + + +class RotaryEmbedding(nn.Module): + def __init__(self, dim): + super().__init__() + inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer("inv_freq", inv_freq) + + def forward(self, max_seq_len, *, device): + seq = torch.arange(max_seq_len, device=device, dtype=self.inv_freq.dtype) + freqs = einsum("i , j -> i j", seq, self.inv_freq) + 
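+        # Duplicate the frequencies so cos/sin span the full head dimension;
+        # rotate_half() below pairs the two halves when the rotary embedding
+        # is applied to the queries and keys.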
return torch.cat((freqs, freqs), dim=-1) + + +def rotate_half(x): + x = rearrange(x, "... (j d) -> ... j d", j=2) + x1, x2 = x.unbind(dim=-2) + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(pos, t): + return (t * pos.cos()) + (rotate_half(t) * pos.sin()) + + +# classic Noam Shazeer paper, except here they use SwiGLU instead of the more popular GEGLU for gating the feedforward +# https://arxiv.org/abs/2002.05202 + + +class SwiGLU(nn.Module): + def forward(self, x): + x, gate = x.chunk(2, dim=-1) + return F.silu(gate) * x + + +# parallel attention and feedforward with residual +# discovered by Wang et al + EleutherAI from GPT-J fame + +class ParallelTransformerBlock(nn.Module): + def __init__(self, dim, dim_head=64, heads=8, ff_mult=4): + super().__init__() + self.norm = LayerNorm(dim) + + attn_inner_dim = dim_head * heads + ff_inner_dim = dim * ff_mult + self.fused_dims = (attn_inner_dim, dim_head, dim_head, (ff_inner_dim * 2)) + + self.heads = heads + self.scale = dim_head**-0.5 + self.rotary_emb = RotaryEmbedding(dim_head) + + self.fused_attn_ff_proj = nn.Linear(dim, sum(self.fused_dims), bias=False) + self.attn_out = nn.Linear(attn_inner_dim, dim, bias=False) + + self.ff_out = nn.Sequential( + SwiGLU(), + nn.Linear(ff_inner_dim, dim, bias=False) + ) + + self.register_buffer("pos_emb", None, persistent=False) + + + def get_rotary_embedding(self, n, device): + if self.pos_emb is not None and self.pos_emb.shape[-2] >= n: + return self.pos_emb[:n] + + pos_emb = self.rotary_emb(n, device=device) + self.register_buffer("pos_emb", pos_emb, persistent=False) + return pos_emb + + def forward(self, x, attn_mask=None): + """ + einstein notation + b - batch + h - heads + n, i, j - sequence length (base sequence length, source, target) + d - feature dimension + """ + + n, device, h = x.shape[1], x.device, self.heads + + # pre layernorm + + x = self.norm(x) + + # attention queries, keys, values, and feedforward inner + + q, k, v, ff = self.fused_attn_ff_proj(x).split(self.fused_dims, dim=-1) + + # split heads + # they use multi-query single-key-value attention, yet another Noam Shazeer paper + # they found no performance loss past a certain scale, and more efficient decoding obviously + # https://arxiv.org/abs/1911.02150 + + q = rearrange(q, "b n (h d) -> b h n d", h=h) + + # rotary embeddings + + positions = self.get_rotary_embedding(n, device) + q, k = map(lambda t: apply_rotary_pos_emb(positions, t), (q, k)) + + # scale + + q = q * self.scale + + # similarity + + sim = einsum("b h i d, b j d -> b h i j", q, k) + + + # extra attention mask - for masking out attention from text CLS token to padding + + if exists(attn_mask): + attn_mask = rearrange(attn_mask, 'b i j -> b 1 i j') + sim = sim.masked_fill(~attn_mask, -torch.finfo(sim.dtype).max) + + # attention + + sim = sim - sim.amax(dim=-1, keepdim=True).detach() + attn = sim.softmax(dim=-1) + + # aggregate values + + out = einsum("b h i j, b j d -> b h i d", attn, v) + + # merge heads + + out = rearrange(out, "b h n d -> b n (h d)") + return self.attn_out(out) + self.ff_out(ff) + +# cross attention - using multi-query + one-headed key / values as in PaLM w/ optional parallel feedforward + +class CrossAttention(nn.Module): + def __init__( + self, + dim, + *, + context_dim=None, + dim_head=64, + heads=12, + parallel_ff=False, + ff_mult=4, + norm_context=False + ): + super().__init__() + self.heads = heads + self.scale = dim_head ** -0.5 + inner_dim = heads * dim_head + context_dim = default(context_dim, dim) + + self.norm = 
LayerNorm(dim) + self.context_norm = LayerNorm(context_dim) if norm_context else nn.Identity() + + self.to_q = nn.Linear(dim, inner_dim, bias=False) + self.to_kv = nn.Linear(context_dim, dim_head * 2, bias=False) + self.to_out = nn.Linear(inner_dim, dim, bias=False) + + # whether to have parallel feedforward + + ff_inner_dim = ff_mult * dim + + self.ff = nn.Sequential( + nn.Linear(dim, ff_inner_dim * 2, bias=False), + SwiGLU(), + nn.Linear(ff_inner_dim, dim, bias=False) + ) if parallel_ff else None + + def forward(self, x, context, mask): + """ + einstein notation + b - batch + h - heads + n, i, j - sequence length (base sequence length, source, target) + d - feature dimension + """ + + # pre-layernorm, for queries and context + + x = self.norm(x) + context = self.context_norm(context) + + # get queries + + q = self.to_q(x) + q = rearrange(q, 'b n (h d) -> b h n d', h = self.heads) + + # scale + + q = q * self.scale + + # get key / values + + k, v = self.to_kv(context).chunk(2, dim=-1) + + # query / key similarity + + sim = einsum('b h i d, b j d -> b h i j', q, k) + + # attention + mask = mask.unsqueeze(1).repeat(1,self.heads,1,1) + sim = sim + mask # context mask + sim = sim - sim.amax(dim=-1, keepdim=True) + attn = sim.softmax(dim=-1) + + # aggregate + + out = einsum('b h i j, b j d -> b h i d', attn, v) + + # merge and combine heads + + out = rearrange(out, 'b h n d -> b n (h d)') + out = self.to_out(out) + + # add parallel feedforward (for multimodal layers) + + if exists(self.ff): + out = out + self.ff(x) + + return out + + +class Cross_model(nn.Module): + def __init__( + self, + dim=512, + layer_num=4, + dim_head=64, + heads=8, + ff_mult=4 + ): + super().__init__() + + self.layers = nn.ModuleList([]) + + + for ind in range(layer_num): + self.layers.append(nn.ModuleList([ + Residual(CrossAttention(dim=dim, dim_head=dim_head, heads=heads, parallel_ff=True, ff_mult=ff_mult)), + Residual(ParallelTransformerBlock(dim=dim, dim_head=dim_head, heads=heads, ff_mult=ff_mult)) + ])) + + def forward( + self, + query_tokens, + context_tokens, + mask + ): + for cross_attn, self_attn_ff in self.layers: + query_tokens = cross_attn(query_tokens, context_tokens,mask) + query_tokens = self_attn_ff(query_tokens) + + return query_tokens \ No newline at end of file diff --git a/videox_fun/reward/aesthetic_predictor_v2_5/__init__.py b/videox_fun/reward/aesthetic_predictor_v2_5/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2d3d8f197ab036c531c5ba7efb65a0800f6d4fff --- /dev/null +++ b/videox_fun/reward/aesthetic_predictor_v2_5/__init__.py @@ -0,0 +1,13 @@ +from .siglip_v2_5 import ( + AestheticPredictorV2_5Head, + AestheticPredictorV2_5Model, + AestheticPredictorV2_5Processor, + convert_v2_5_from_siglip, +) + +__all__ = [ + "AestheticPredictorV2_5Head", + "AestheticPredictorV2_5Model", + "AestheticPredictorV2_5Processor", + "convert_v2_5_from_siglip", +] \ No newline at end of file diff --git a/videox_fun/reward/aesthetic_predictor_v2_5/siglip_v2_5.py b/videox_fun/reward/aesthetic_predictor_v2_5/siglip_v2_5.py new file mode 100644 index 0000000000000000000000000000000000000000..867f4295eb20ff2d1acff02c9303e9fd0a7c00de --- /dev/null +++ b/videox_fun/reward/aesthetic_predictor_v2_5/siglip_v2_5.py @@ -0,0 +1,133 @@ +# Borrowed from https://github.com/discus0434/aesthetic-predictor-v2-5/blob/3125a9e/src/aesthetic_predictor_v2_5/siglip_v2_5.py +import os +from collections import OrderedDict +from os import PathLike +from typing import Final + +import torch +import torch.nn as nn 
+import torchvision.transforms as transforms +from transformers import ( + SiglipImageProcessor, + SiglipVisionConfig, + SiglipVisionModel, + logging, +) +from transformers.image_processing_utils import BatchFeature +from transformers.modeling_outputs import ImageClassifierOutputWithNoAttention + +logging.set_verbosity_error() + +URL: Final[str] = ( + "https://github.com/discus0434/aesthetic-predictor-v2-5/raw/main/models/aesthetic_predictor_v2_5.pth" +) + + +class AestheticPredictorV2_5Head(nn.Module): + def __init__(self, config: SiglipVisionConfig) -> None: + super().__init__() + self.scoring_head = nn.Sequential( + nn.Linear(config.hidden_size, 1024), + nn.Dropout(0.5), + nn.Linear(1024, 128), + nn.Dropout(0.5), + nn.Linear(128, 64), + nn.Dropout(0.5), + nn.Linear(64, 16), + nn.Dropout(0.2), + nn.Linear(16, 1), + ) + + def forward(self, image_embeds: torch.Tensor) -> torch.Tensor: + return self.scoring_head(image_embeds) + + +class AestheticPredictorV2_5Model(SiglipVisionModel): + PATCH_SIZE = 14 + + def __init__(self, config: SiglipVisionConfig, *args, **kwargs) -> None: + super().__init__(config, *args, **kwargs) + self.layers = AestheticPredictorV2_5Head(config) + self.post_init() + self.transforms = transforms.Compose([ + transforms.Resize((384, 384)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + ]) + + def forward( + self, + pixel_values: torch.FloatTensor | None = None, + labels: torch.Tensor | None = None, + return_dict: bool | None = None, + ) -> tuple | ImageClassifierOutputWithNoAttention: + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = super().forward( + pixel_values=pixel_values, + return_dict=return_dict, + ) + image_embeds = outputs.pooler_output + image_embeds_norm = image_embeds / image_embeds.norm(dim=-1, keepdim=True) + prediction = self.layers(image_embeds_norm) + + loss = None + if labels is not None: + loss_fct = nn.MSELoss() + loss = loss_fct() + + if not return_dict: + return (loss, prediction, image_embeds) + + return ImageClassifierOutputWithNoAttention( + loss=loss, + logits=prediction, + hidden_states=image_embeds, + ) + + +class AestheticPredictorV2_5Processor(SiglipImageProcessor): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def __call__(self, *args, **kwargs) -> BatchFeature: + return super().__call__(*args, **kwargs) + + @classmethod + def from_pretrained( + self, + pretrained_model_name_or_path: str + | PathLike = "google/siglip-so400m-patch14-384", + *args, + **kwargs, + ) -> "AestheticPredictorV2_5Processor": + return super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + + +def convert_v2_5_from_siglip( + predictor_name_or_path: str | PathLike | None = None, + encoder_model_name: str = "google/siglip-so400m-patch14-384", + *args, + **kwargs, +) -> tuple[AestheticPredictorV2_5Model, AestheticPredictorV2_5Processor]: + model = AestheticPredictorV2_5Model.from_pretrained( + encoder_model_name, *args, **kwargs + ) + + processor = AestheticPredictorV2_5Processor.from_pretrained( + encoder_model_name, *args, **kwargs + ) + + if predictor_name_or_path is None or not os.path.exists(predictor_name_or_path): + state_dict = torch.hub.load_state_dict_from_url(URL, map_location="cpu") + else: + state_dict = torch.load(predictor_name_or_path, map_location="cpu") + + assert isinstance(state_dict, OrderedDict) + + model.layers.load_state_dict(state_dict) + model.eval() + + return model, processor 
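As a usage illustration, here is a minimal scoring sketch built on the converter above. It assumes the package root is on the import path (the `__init__.py` added in this diff re-exports `convert_v2_5_from_siglip`) and uses a placeholder image file; it is a sketch, not part of the committed module.

import torch
from PIL import Image

from videox_fun.reward.aesthetic_predictor_v2_5 import convert_v2_5_from_siglip

# Build the SigLIP backbone plus the aesthetic head; the head weights are fetched
# from the URL defined above when no local predictor checkpoint is given.
model, processor = convert_v2_5_from_siglip()
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

image = Image.open("example.jpg").convert("RGB")  # placeholder image path
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

with torch.inference_mode():
    score = model(pixel_values).logits.squeeze().float().item()
print(f"predicted aesthetic score: {score:.2f}")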
\ No newline at end of file diff --git a/videox_fun/reward/improved_aesthetic_predictor.py b/videox_fun/reward/improved_aesthetic_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..43037b9fd5d80dc1402707c74f6b6a9584f26ce7 --- /dev/null +++ b/videox_fun/reward/improved_aesthetic_predictor.py @@ -0,0 +1,49 @@ +import os + +import torch +import torch.nn as nn +from transformers import CLIPModel +from torchvision.datasets.utils import download_url + +URL = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Third_Party/sac%2Blogos%2Bava1-l14-linearMSE.pth" +FILENAME = "sac+logos+ava1-l14-linearMSE.pth" +MD5 = "b1047fd767a00134b8fd6529bf19521a" + + +class MLP(nn.Module): + def __init__(self): + super().__init__() + self.layers = nn.Sequential( + nn.Linear(768, 1024), + nn.Dropout(0.2), + nn.Linear(1024, 128), + nn.Dropout(0.2), + nn.Linear(128, 64), + nn.Dropout(0.1), + nn.Linear(64, 16), + nn.Linear(16, 1), + ) + + + def forward(self, embed): + return self.layers(embed) + + +class ImprovedAestheticPredictor(nn.Module): + def __init__(self, encoder_path="openai/clip-vit-large-patch14", predictor_path=None): + super().__init__() + self.encoder = CLIPModel.from_pretrained(encoder_path) + self.predictor = MLP() + if predictor_path is None or not os.path.exists(predictor_path): + download_url(URL, torch.hub.get_dir(), FILENAME, md5=MD5) + predictor_path = os.path.join(torch.hub.get_dir(), FILENAME) + state_dict = torch.load(predictor_path, map_location="cpu") + self.predictor.load_state_dict(state_dict) + self.eval() + + + def forward(self, pixel_values): + embed = self.encoder.get_image_features(pixel_values=pixel_values) + embed = embed / torch.linalg.vector_norm(embed, dim=-1, keepdim=True) + + return self.predictor(embed).squeeze(1) diff --git a/videox_fun/reward/reward_fn.py b/videox_fun/reward/reward_fn.py new file mode 100644 index 0000000000000000000000000000000000000000..6526919cbea00cb859a542d7758326c89d49ecf9 --- /dev/null +++ b/videox_fun/reward/reward_fn.py @@ -0,0 +1,385 @@ +import os +from abc import ABC, abstractmethod + +import torch +import torchvision.transforms as transforms +from einops import rearrange +from torchvision.datasets.utils import download_url +from typing import Optional, Tuple + + +# All reward models. +__all__ = ["AestheticReward", "HPSReward", "PickScoreReward", "MPSReward"] + + +class BaseReward(ABC): + """An base class for reward models. A custom Reward class must implement two functions below. + """ + def __init__(self): + """Define your reward model and image transformations (optional) here. + """ + pass + + @abstractmethod + def __call__(self, batch_frames: torch.Tensor, batch_prompt: Optional[list[str]]=None) -> Tuple[torch.Tensor, torch.Tensor]: + """Given batch frames with shape `[B, C, T, H, W]` extracted from a list of videos and a list of prompts + (optional) correspondingly, return the loss and reward computed by your reward model (reduction by mean). + """ + pass + +class AestheticReward(BaseReward): + """Aesthetic Predictor [V2](https://github.com/christophschuhmann/improved-aesthetic-predictor) + and [V2.5](https://github.com/discus0434/aesthetic-predictor-v2-5) reward model. 
+ """ + def __init__( + self, + encoder_path="openai/clip-vit-large-patch14", + predictor_path=None, + version="v2", + device="cpu", + dtype=torch.float16, + max_reward=10, + loss_scale=0.1, + ): + from .improved_aesthetic_predictor import ImprovedAestheticPredictor + from ..video_caption.utils.siglip_v2_5 import convert_v2_5_from_siglip + + self.encoder_path = encoder_path + self.predictor_path = predictor_path + self.version = version + self.device = device + self.dtype = dtype + self.max_reward = max_reward + self.loss_scale = loss_scale + + if self.version != "v2" and self.version != "v2.5": + raise ValueError("Only v2 and v2.5 are supported.") + if self.version == "v2": + assert "clip-vit-large-patch14" in encoder_path.lower() + self.model = ImprovedAestheticPredictor(encoder_path=self.encoder_path, predictor_path=self.predictor_path) + # https://huggingface.co/openai/clip-vit-large-patch14/blob/main/preprocessor_config.json + # TODO: [transforms.Resize(224), transforms.CenterCrop(224)] for any aspect ratio. + self.transform = transforms.Compose([ + transforms.Resize((224, 224), interpolation=transforms.InterpolationMode.BICUBIC), + transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]), + ]) + elif self.version == "v2.5": + assert "siglip-so400m-patch14-384" in encoder_path.lower() + self.model, _ = convert_v2_5_from_siglip(encoder_model_name=self.encoder_path) + # https://huggingface.co/google/siglip-so400m-patch14-384/blob/main/preprocessor_config.json + self.transform = transforms.Compose([ + transforms.Resize((384, 384), interpolation=transforms.InterpolationMode.BICUBIC), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + ]) + + self.model.to(device=self.device, dtype=self.dtype) + self.model.requires_grad_(False) + + + def __call__(self, batch_frames: torch.Tensor, batch_prompt: Optional[list[str]]=None) -> Tuple[torch.Tensor, torch.Tensor]: + batch_frames = rearrange(batch_frames, "b c t h w -> t b c h w") + batch_loss, batch_reward = 0, 0 + for frames in batch_frames: + pixel_values = torch.stack([self.transform(frame) for frame in frames]) + pixel_values = pixel_values.to(self.device, dtype=self.dtype) + if self.version == "v2": + reward = self.model(pixel_values) + elif self.version == "v2.5": + reward = self.model(pixel_values).logits.squeeze() + # Convert reward to loss in [0, 1]. + if self.max_reward is None: + loss = (-1 * reward) * self.loss_scale + else: + loss = abs(reward - self.max_reward) * self.loss_scale + batch_loss, batch_reward = batch_loss + loss.mean(), batch_reward + reward.mean() + + return batch_loss / batch_frames.shape[0], batch_reward / batch_frames.shape[0] + + +class HPSReward(BaseReward): + """[HPS](https://github.com/tgxs002/HPSv2) v2 and v2.1 reward model. 
+ """ + def __init__( + self, + model_path=None, + version="v2.0", + device="cpu", + dtype=torch.float16, + max_reward=1, + loss_scale=1, + ): + from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer + + self.model_path = model_path + self.version = version + self.device = device + self.dtype = dtype + self.max_reward = max_reward + self.loss_scale = loss_scale + + self.model, _, _ = create_model_and_transforms( + "ViT-H-14", + "laion2B-s32B-b79K", + precision=self.dtype, + device=self.device, + jit=False, + force_quick_gelu=False, + force_custom_text=False, + force_patch_dropout=False, + force_image_size=None, + pretrained_image=False, + image_mean=None, + image_std=None, + light_augmentation=True, + aug_cfg={}, + output_dict=True, + with_score_predictor=False, + with_region_predictor=False, + ) + self.tokenizer = get_tokenizer("ViT-H-14") + + # https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/preprocessor_config.json + # TODO: [transforms.Resize(224), transforms.CenterCrop(224)] for any aspect ratio. + self.transform = transforms.Compose([ + transforms.Resize((224, 224), interpolation=transforms.InterpolationMode.BICUBIC), + transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]), + ]) + + if version == "v2.0": + url = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Third_Party/HPS_v2_compressed.pt" + filename = "HPS_v2_compressed.pt" + md5 = "fd9180de357abf01fdb4eaad64631db4" + elif version == "v2.1": + url = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Third_Party/HPS_v2.1_compressed.pt" + filename = "HPS_v2.1_compressed.pt" + md5 = "4067542e34ba2553a738c5ac6c1d75c0" + else: + raise ValueError("Only v2.0 and v2.1 are supported.") + if self.model_path is None or not os.path.exists(self.model_path): + download_url(url, torch.hub.get_dir(), md5=md5) + model_path = os.path.join(torch.hub.get_dir(), filename) + + state_dict = torch.load(model_path, map_location="cpu")["state_dict"] + self.model.load_state_dict(state_dict) + self.model.to(device=self.device, dtype=self.dtype) + self.model.requires_grad_(False) + self.model.eval() + + def __call__(self, batch_frames: torch.Tensor, batch_prompt: list[str]) -> Tuple[torch.Tensor, torch.Tensor]: + assert batch_frames.shape[0] == len(batch_prompt) + # Compute batch reward and loss in frame-wise. + batch_frames = rearrange(batch_frames, "b c t h w -> t b c h w") + batch_loss, batch_reward = 0, 0 + for frames in batch_frames: + image_inputs = torch.stack([self.transform(frame) for frame in frames]) + image_inputs = image_inputs.to(device=self.device, dtype=self.dtype) + text_inputs = self.tokenizer(batch_prompt).to(device=self.device) + outputs = self.model(image_inputs, text_inputs) + + image_features, text_features = outputs["image_features"], outputs["text_features"] + logits = image_features @ text_features.T + reward = torch.diagonal(logits) + # Convert reward to loss in [0, 1]. + if self.max_reward is None: + loss = (-1 * reward) * self.loss_scale + else: + loss = abs(reward - self.max_reward) * self.loss_scale + + batch_loss, batch_reward = batch_loss + loss.mean(), batch_reward + reward.mean() + + return batch_loss / batch_frames.shape[0], batch_reward / batch_frames.shape[0] + + +class PickScoreReward(BaseReward): + """[PickScore](https://github.com/yuvalkirstain/PickScore) reward model. 
+ """ + def __init__( + self, + model_path="yuvalkirstain/PickScore_v1", + device="cpu", + dtype=torch.float16, + max_reward=1, + loss_scale=1, + ): + from transformers import AutoProcessor, AutoModel + + self.model_path = model_path + self.device = device + self.dtype = dtype + self.max_reward = max_reward + self.loss_scale = loss_scale + + # https://huggingface.co/yuvalkirstain/PickScore_v1/blob/main/preprocessor_config.json + self.transform = transforms.Compose([ + transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC), + transforms.CenterCrop(224), + transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]), + ]) + self.processor = AutoProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K", torch_dtype=self.dtype) + self.model = AutoModel.from_pretrained(model_path, torch_dtype=self.dtype).eval().to(device) + self.model.requires_grad_(False) + self.model.eval() + + def __call__(self, batch_frames: torch.Tensor, batch_prompt: list[str]) -> Tuple[torch.Tensor, torch.Tensor]: + assert batch_frames.shape[0] == len(batch_prompt) + # Compute batch reward and loss in frame-wise. + batch_frames = rearrange(batch_frames, "b c t h w -> t b c h w") + batch_loss, batch_reward = 0, 0 + for frames in batch_frames: + image_inputs = torch.stack([self.transform(frame) for frame in frames]) + image_inputs = image_inputs.to(device=self.device, dtype=self.dtype) + text_inputs = self.processor( + text=batch_prompt, + padding=True, + truncation=True, + max_length=77, + return_tensors="pt", + ).to(self.device) + image_features = self.model.get_image_features(pixel_values=image_inputs) + text_features = self.model.get_text_features(**text_inputs) + image_features = image_features / torch.norm(image_features, dim=-1, keepdim=True) + text_features = text_features / torch.norm(text_features, dim=-1, keepdim=True) + + logits = image_features @ text_features.T + reward = torch.diagonal(logits) + # Convert reward to loss in [0, 1]. + if self.max_reward is None: + loss = (-1 * reward) * self.loss_scale + else: + loss = abs(reward - self.max_reward) * self.loss_scale + + batch_loss, batch_reward = batch_loss + loss.mean(), batch_reward + reward.mean() + + return batch_loss / batch_frames.shape[0], batch_reward / batch_frames.shape[0] + + +class MPSReward(BaseReward): + """[MPS](https://github.com/Kwai-Kolors/MPS) reward model. + """ + def __init__( + self, + model_path=None, + device="cpu", + dtype=torch.float16, + max_reward=1, + loss_scale=1, + ): + from transformers import AutoTokenizer, AutoConfig + from .MPS.trainer.models.clip_model import CLIPModel + + self.model_path = model_path + self.device = device + self.dtype = dtype + self.condition = "light, color, clarity, tone, style, ambiance, artistry, shape, face, hair, hands, limbs, structure, instance, texture, quantity, attributes, position, number, location, word, things." + self.max_reward = max_reward + self.loss_scale = loss_scale + + processor_name_or_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K" + # https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/preprocessor_config.json + # TODO: [transforms.Resize(224), transforms.CenterCrop(224)] for any aspect ratio. 
+ self.transform = transforms.Compose([ + transforms.Resize((224, 224), interpolation=transforms.InterpolationMode.BICUBIC), + transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711]), + ]) + + # We convert the original [ckpt](http://drive.google.com/file/d/17qrK_aJkVNM75ZEvMEePpLj6L867MLkN/view?usp=sharing) + # (contains the entire model) to a `state_dict`. + url = "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/Third_Party/MPS_overall.pth" + filename = "MPS_overall.pth" + md5 = "1491cbbbd20565747fe07e7572e2ac56" + if self.model_path is None or not os.path.exists(self.model_path): + download_url(url, torch.hub.get_dir(), md5=md5) + model_path = os.path.join(torch.hub.get_dir(), filename) + + self.tokenizer = AutoTokenizer.from_pretrained(processor_name_or_path, trust_remote_code=True) + config = AutoConfig.from_pretrained(processor_name_or_path) + self.model = CLIPModel(config) + state_dict = torch.load(model_path, map_location="cpu") + self.model.load_state_dict(state_dict, strict=False) + self.model.to(device=self.device, dtype=self.dtype) + self.model.requires_grad_(False) + self.model.eval() + + def _tokenize(self, caption): + input_ids = self.tokenizer( + caption, + max_length=self.tokenizer.model_max_length, + padding="max_length", + truncation=True, + return_tensors="pt" + ).input_ids + + return input_ids + + def __call__( + self, + batch_frames: torch.Tensor, + batch_prompt: list[str], + batch_condition: Optional[list[str]] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + if batch_condition is None: + batch_condition = [self.condition] * len(batch_prompt) + batch_frames = rearrange(batch_frames, "b c t h w -> t b c h w") + batch_loss, batch_reward = 0, 0 + for frames in batch_frames: + image_inputs = torch.stack([self.transform(frame) for frame in frames]) + image_inputs = image_inputs.to(device=self.device, dtype=self.dtype) + text_inputs = self._tokenize(batch_prompt).to(self.device) + condition_inputs = self._tokenize(batch_condition).to(device=self.device) + text_features, image_features = self.model(text_inputs, image_inputs, condition_inputs) + + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + image_features = image_features / image_features.norm(dim=-1, keepdim=True) + # reward = self.model.logit_scale.exp() * torch.diag(torch.einsum('bd,cd->bc', text_features, image_features)) + logits = image_features @ text_features.T + reward = torch.diagonal(logits) + # Convert reward to loss in [0, 1]. 
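+            # With max_reward set, each frame's loss is |reward - max_reward| * loss_scale,
+            # i.e. the distance of the similarity reward from its target value; otherwise
+            # the negated reward is scaled and minimised directly.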
+ if self.max_reward is None: + loss = (-1 * reward) * self.loss_scale + else: + loss = abs(reward - self.max_reward) * self.loss_scale + + batch_loss, batch_reward = batch_loss + loss.mean(), batch_reward + reward.mean() + + return batch_loss / batch_frames.shape[0], batch_reward / batch_frames.shape[0] + + +if __name__ == "__main__": + import numpy as np + from decord import VideoReader + + video_path_list = ["your_video_path_1.mp4", "your_video_path_2.mp4"] + prompt_list = ["your_prompt_1", "your_prompt_2"] + num_sampled_frames = 8 + + to_tensor = transforms.ToTensor() + + sampled_frames_list = [] + for video_path in video_path_list: + vr = VideoReader(video_path) + sampled_frame_indices = np.linspace(0, len(vr), num_sampled_frames, endpoint=False, dtype=int) + sampled_frames = vr.get_batch(sampled_frame_indices).asnumpy() + sampled_frames = torch.stack([to_tensor(frame) for frame in sampled_frames]) + sampled_frames_list.append(sampled_frames) + sampled_frames = torch.stack(sampled_frames_list) + sampled_frames = rearrange(sampled_frames, "b t c h w -> b c t h w") + + aesthetic_reward_v2 = AestheticReward(device="cuda", dtype=torch.bfloat16) + print(f"aesthetic_reward_v2: {aesthetic_reward_v2(sampled_frames)}") + + aesthetic_reward_v2_5 = AestheticReward( + encoder_path="google/siglip-so400m-patch14-384", version="v2.5", device="cuda", dtype=torch.bfloat16 + ) + print(f"aesthetic_reward_v2_5: {aesthetic_reward_v2_5(sampled_frames)}") + + hps_reward_v2 = HPSReward(device="cuda", dtype=torch.bfloat16) + print(f"hps_reward_v2: {hps_reward_v2(sampled_frames, prompt_list)}") + + hps_reward_v2_1 = HPSReward(version="v2.1", device="cuda", dtype=torch.bfloat16) + print(f"hps_reward_v2_1: {hps_reward_v2_1(sampled_frames, prompt_list)}") + + pick_score = PickScoreReward(device="cuda", dtype=torch.bfloat16) + print(f"pick_score_reward: {pick_score(sampled_frames, prompt_list)}") + + mps_score = MPSReward(device="cuda", dtype=torch.bfloat16) + print(f"mps_reward: {mps_score(sampled_frames, prompt_list)}") \ No newline at end of file diff --git a/videox_fun/ui/cogvideox_fun_ui.py b/videox_fun/ui/cogvideox_fun_ui.py new file mode 100644 index 0000000000000000000000000000000000000000..9f87d391f0836eca11ba8bdf9e30eb0fe1396f56 --- /dev/null +++ b/videox_fun/ui/cogvideox_fun_ui.py @@ -0,0 +1,667 @@ +"""Modified from https://github.com/guoyww/AnimateDiff/blob/main/app.py +""" +import os +import random + +import cv2 +import gradio as gr +import numpy as np +import torch +from PIL import Image +from safetensors import safe_open + +from ..data.bucket_sampler import ASPECT_RATIO_512, get_closest_ratio +from ..models import (AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, + T5EncoderModel, T5Tokenizer) +from ..pipeline import (CogVideoXFunControlPipeline, + CogVideoXFunInpaintPipeline, CogVideoXFunPipeline) +from ..utils.fp8_optimization import convert_weight_dtype_wrapper +from ..utils.lora_utils import merge_lora, unmerge_lora +from ..utils.utils import (get_image_to_video_latent, + get_video_to_video_latent, save_videos_grid) +from .controller import (Fun_Controller, Fun_Controller_Client, + all_cheduler_dict, css, ddpm_scheduler_dict, + flow_scheduler_dict, gradio_version, + gradio_version_is_above_4) +from .ui import (create_cfg_and_seedbox, + create_fake_finetune_models_checkpoints, + create_fake_height_width, create_fake_model_checkpoints, + create_fake_model_type, create_finetune_models_checkpoints, + create_generation_method, + create_generation_methods_and_video_length, + create_height_width, 
create_model_checkpoints, + create_model_type, create_prompts, create_samplers, + create_ui_outputs) + + +class CogVideoXFunController(Fun_Controller): + def update_diffusion_transformer(self, diffusion_transformer_dropdown): + print("Update diffusion transformer") + self.diffusion_transformer_dropdown = diffusion_transformer_dropdown + if diffusion_transformer_dropdown == "none": + return gr.update() + self.vae = AutoencoderKLCogVideoX.from_pretrained( + diffusion_transformer_dropdown, + subfolder="vae", + ).to(self.weight_dtype) + + # Get Transformer + self.transformer = CogVideoXTransformer3DModel.from_pretrained( + diffusion_transformer_dropdown, + subfolder="transformer", + low_cpu_mem_usage=True, + ).to(self.weight_dtype) + + # Get tokenizer and text_encoder + tokenizer = T5Tokenizer.from_pretrained( + diffusion_transformer_dropdown, subfolder="tokenizer" + ) + text_encoder = T5EncoderModel.from_pretrained( + diffusion_transformer_dropdown, subfolder="text_encoder", torch_dtype=self.weight_dtype + ) + + # Get pipeline + if self.model_type == "Inpaint": + if self.transformer.config.in_channels != self.vae.config.latent_channels: + self.pipeline = CogVideoXFunInpaintPipeline( + tokenizer=tokenizer, + text_encoder=text_encoder, + vae=self.vae, + transformer=self.transformer, + scheduler=self.scheduler_dict[list(self.scheduler_dict.keys())[0]].from_pretrained(diffusion_transformer_dropdown, subfolder="scheduler"), + ) + else: + self.pipeline = CogVideoXFunPipeline( + tokenizer=tokenizer, + text_encoder=text_encoder, + vae=self.vae, + transformer=self.transformer, + scheduler=self.scheduler_dict[list(self.scheduler_dict.keys())[0]].from_pretrained(diffusion_transformer_dropdown, subfolder="scheduler"), + ) + else: + self.pipeline = CogVideoXFunControlPipeline( + diffusion_transformer_dropdown, + vae=self.vae, + transformer=self.transformer, + scheduler=self.scheduler_dict[list(self.scheduler_dict.keys())[0]].from_pretrained(diffusion_transformer_dropdown, subfolder="scheduler"), + torch_dtype=self.weight_dtype + ) + + if self.ulysses_degree > 1 or self.ring_degree > 1: + self.transformer.enable_multi_gpus_inference() + + if self.GPU_memory_mode == "sequential_cpu_offload": + self.pipeline.enable_sequential_cpu_offload(device=self.device) + elif self.GPU_memory_mode == "model_cpu_offload_and_qfloat8": + convert_weight_dtype_wrapper(self.pipeline.transformer, self.weight_dtype) + self.pipeline.enable_model_cpu_offload(device=self.device) + elif self.GPU_memory_mode == "model_cpu_offload": + self.pipeline.enable_model_cpu_offload(device=self.device) + else: + self.pipeline.to(self.device) + print("Update diffusion transformer done") + return gr.update() + + def generate( + self, + diffusion_transformer_dropdown, + base_model_dropdown, + lora_model_dropdown, + lora_alpha_slider, + prompt_textbox, + negative_prompt_textbox, + sampler_dropdown, + sample_step_slider, + resize_method, + width_slider, + height_slider, + base_resolution, + generation_method, + length_slider, + overlap_video_length, + partial_video_length, + cfg_scale_slider, + start_image, + end_image, + validation_video, + validation_video_mask, + control_video, + denoise_strength, + seed_textbox, + is_api = False, + ): + self.clear_cache() + + self.input_check( + resize_method, generation_method, start_image, end_image, validation_video,control_video, is_api + ) + is_image = True if generation_method == "Image Generation" else False + + if self.base_model_path != base_model_dropdown: + self.update_base_model(base_model_dropdown) 
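+        # Likewise, swap LoRA weights only when the dropdown selection actually changed,
+        # so repeated generations with the same LoRA skip the costly reload below.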
+ + if self.lora_model_path != lora_model_dropdown: + self.update_lora_model(lora_model_dropdown) + + self.pipeline.scheduler = self.scheduler_dict[sampler_dropdown].from_config(self.pipeline.scheduler.config) + + if resize_method == "Resize according to Reference": + height_slider, width_slider = self.get_height_width_from_reference( + base_resolution, start_image, validation_video, control_video, + ) + if self.lora_model_path != "none": + # lora part + self.pipeline = merge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider) + + if int(seed_textbox) != -1 and seed_textbox != "": torch.manual_seed(int(seed_textbox)) + else: seed_textbox = np.random.randint(0, 1e10) + generator = torch.Generator(device=self.device).manual_seed(int(seed_textbox)) + + try: + if self.model_type == "Inpaint": + if self.transformer.config.in_channels != self.vae.config.latent_channels: + if generation_method == "Long Video Generation": + if validation_video is not None: + raise gr.Error(f"Video to Video is not Support Long Video Generation now.") + init_frames = 0 + last_frames = init_frames + partial_video_length + while init_frames < length_slider: + if last_frames >= length_slider: + _partial_video_length = length_slider - init_frames + _partial_video_length = int((_partial_video_length - 1) // self.vae.config.temporal_compression_ratio * self.vae.config.temporal_compression_ratio) + 1 + + if _partial_video_length <= 0: + break + else: + _partial_video_length = partial_video_length + + if last_frames >= length_slider: + input_video, input_video_mask, clip_image = get_image_to_video_latent(start_image, end_image, video_length=_partial_video_length, sample_size=(height_slider, width_slider)) + else: + input_video, input_video_mask, clip_image = get_image_to_video_latent(start_image, None, video_length=_partial_video_length, sample_size=(height_slider, width_slider)) + + with torch.no_grad(): + sample = self.pipeline( + prompt_textbox, + negative_prompt = negative_prompt_textbox, + num_inference_steps = sample_step_slider, + guidance_scale = cfg_scale_slider, + width = width_slider, + height = height_slider, + num_frames = _partial_video_length, + generator = generator, + + video = input_video, + mask_video = input_video_mask, + strength = 1, + ).videos + + if init_frames != 0: + mix_ratio = torch.from_numpy( + np.array([float(_index) / float(overlap_video_length) for _index in range(overlap_video_length)], np.float32) + ).unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + + new_sample[:, :, -overlap_video_length:] = new_sample[:, :, -overlap_video_length:] * (1 - mix_ratio) + \ + sample[:, :, :overlap_video_length] * mix_ratio + new_sample = torch.cat([new_sample, sample[:, :, overlap_video_length:]], dim = 2) + + sample = new_sample + else: + new_sample = sample + + if last_frames >= length_slider: + break + + start_image = [ + Image.fromarray( + (sample[0, :, _index].transpose(0, 1).transpose(1, 2) * 255).numpy().astype(np.uint8) + ) for _index in range(-overlap_video_length, 0) + ] + + init_frames = init_frames + _partial_video_length - overlap_video_length + last_frames = init_frames + _partial_video_length + else: + if validation_video is not None: + input_video, input_video_mask, ref_image, clip_image = get_video_to_video_latent(validation_video, length_slider if not is_image else 1, sample_size=(height_slider, width_slider), validation_video_mask=validation_video_mask, fps=8) + strength = denoise_strength + else: + input_video, input_video_mask, clip_image = 
get_image_to_video_latent(start_image, end_image, length_slider if not is_image else 1, sample_size=(height_slider, width_slider)) + strength = 1 + + sample = self.pipeline( + prompt_textbox, + negative_prompt = negative_prompt_textbox, + num_inference_steps = sample_step_slider, + guidance_scale = cfg_scale_slider, + width = width_slider, + height = height_slider, + num_frames = length_slider if not is_image else 1, + generator = generator, + + video = input_video, + mask_video = input_video_mask, + strength = strength, + ).videos + else: + sample = self.pipeline( + prompt_textbox, + negative_prompt = negative_prompt_textbox, + num_inference_steps = sample_step_slider, + guidance_scale = cfg_scale_slider, + width = width_slider, + height = height_slider, + num_frames = length_slider if not is_image else 1, + generator = generator + ).videos + else: + input_video, input_video_mask, ref_image, clip_image = get_video_to_video_latent(control_video, length_slider if not is_image else 1, sample_size=(height_slider, width_slider), fps=8) + + sample = self.pipeline( + prompt_textbox, + negative_prompt = negative_prompt_textbox, + num_inference_steps = sample_step_slider, + guidance_scale = cfg_scale_slider, + width = width_slider, + height = height_slider, + num_frames = length_slider if not is_image else 1, + generator = generator, + + control_video = input_video, + ).videos + except Exception as e: + self.clear_cache() + if self.lora_model_path != "none": + self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider) + if is_api: + return "", f"Error. error information is {str(e)}" + else: + return gr.update(), gr.update(), f"Error. error information is {str(e)}" + + self.clear_cache() + # lora part + if self.lora_model_path != "none": + self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider) + + save_sample_path = self.save_outputs( + is_image, length_slider, sample, fps=8 + ) + + if is_image or length_slider == 1: + if is_api: + return save_sample_path, "Success" + else: + if gradio_version_is_above_4: + return gr.Image(value=save_sample_path, visible=True), gr.Video(value=None, visible=False), "Success" + else: + return gr.Image.update(value=save_sample_path, visible=True), gr.Video.update(value=None, visible=False), "Success" + else: + if is_api: + return save_sample_path, "Success" + else: + if gradio_version_is_above_4: + return gr.Image(visible=False, value=None), gr.Video(value=save_sample_path, visible=True), "Success" + else: + return gr.Image.update(visible=False, value=None), gr.Video.update(value=save_sample_path, visible=True), "Success" + +CogVideoXFunController_Host = CogVideoXFunController +CogVideoXFunController_Client = Fun_Controller_Client + +def ui(GPU_memory_mode, scheduler_dict, ulysses_degree, ring_degree, weight_dtype, savedir_sample=None): + controller = CogVideoXFunController( + GPU_memory_mode, scheduler_dict, model_name=None, model_type="Inpaint", + ulysses_degree=ulysses_degree, ring_degree=ring_degree, + config_path=None, enable_teacache=None, teacache_threshold=None, weight_dtype=weight_dtype, + savedir_sample=savedir_sample, + ) + + with gr.Blocks(css=css) as demo: + gr.Markdown( + """ + # CogVideoX-Fun: + + A CogVideoX with more flexible generation conditions, capable of producing videos of different resolutions, around 6 seconds, and fps 8 (frames 1 to 49), as well as image generated videos. 
+ + [Github](https://github.com/aigc-apps/CogVideoX-Fun/) + """ + ) + with gr.Column(variant="panel"): + model_type = create_model_type(visible=True) + diffusion_transformer_dropdown, diffusion_transformer_refresh_button = \ + create_model_checkpoints(controller, visible=True) + base_model_dropdown, lora_model_dropdown, lora_alpha_slider, personalized_refresh_button = \ + create_finetune_models_checkpoints(controller, visible=True) + + with gr.Column(variant="panel"): + prompt_textbox, negative_prompt_textbox = create_prompts() + + with gr.Row(): + with gr.Column(): + sampler_dropdown, sample_step_slider = create_samplers(controller) + + resize_method, width_slider, height_slider, base_resolution = create_height_width( + default_height = 384, default_width = 672, maximum_height = 1344, + maximum_width = 1344, + ) + gr.Markdown( + """ + V1.0 and V1.1 support up to 49 frames of video generation, while V1.5 supports up to 85 frames. + (V1.0和V1.1支持最大49帧视频生成,V1.5支持最大85帧视频生成。) + """ + ) + generation_method, length_slider, overlap_video_length, partial_video_length = \ + create_generation_methods_and_video_length( + ["Video Generation", "Image Generation", "Long Video Generation"], + default_video_length=49, + maximum_video_length=85, + ) + image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video = create_generation_method( + ["Text to Video (文本到视频)", "Image to Video (图片到视频)", "Video to Video (视频到视频)", "Video Control (视频控制)"], prompt_textbox + ) + cfg_scale_slider, seed_textbox, seed_button = create_cfg_and_seedbox(gradio_version_is_above_4) + + generate_button = gr.Button(value="Generate (生成)", variant='primary') + + result_image, result_video, infer_progress = create_ui_outputs() + + model_type.change( + fn=controller.update_model_type, + inputs=[model_type], + outputs=[] + ) + + def upload_generation_method(generation_method): + if generation_method == "Video Generation": + return [gr.update(visible=True, maximum=85, value=49, interactive=True), gr.update(visible=False), gr.update(visible=False)] + elif generation_method == "Image Generation": + return [gr.update(minimum=1, maximum=1, value=1, interactive=False), gr.update(visible=False), gr.update(visible=False)] + else: + return [gr.update(visible=True, maximum=1344), gr.update(visible=True), gr.update(visible=True)] + generation_method.change( + upload_generation_method, generation_method, [length_slider, overlap_video_length, partial_video_length] + ) + + def upload_source_method(source_method): + if source_method == "Text to Video (文本到视频)": + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None)] + elif source_method == "Image to Video (图片到视频)": + return [gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(), gr.update(), gr.update(value=None), gr.update(value=None), gr.update(value=None)] + elif source_method == "Video to Video (视频到视频)": + return [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(), gr.update(), gr.update(value=None)] + else: + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update()] + 
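+        # The eight outputs map, in order, to the visibility of image_to_video_col,
+        # video_to_video_col and control_video_col, followed by the values of start_image,
+        # end_image, validation_video, validation_video_mask and control_video
+        # (cleared whenever their source method is hidden).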
source_method.change( + upload_source_method, source_method, [ + image_to_video_col, video_to_video_col, control_video_col, start_image, end_image, + validation_video, validation_video_mask, control_video + ] + ) + + def upload_resize_method(resize_method): + if resize_method == "Generate by": + return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)] + else: + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)] + resize_method.change( + upload_resize_method, resize_method, [width_slider, height_slider, base_resolution] + ) + + generate_button.click( + fn=controller.generate, + inputs=[ + diffusion_transformer_dropdown, + base_model_dropdown, + lora_model_dropdown, + lora_alpha_slider, + prompt_textbox, + negative_prompt_textbox, + sampler_dropdown, + sample_step_slider, + resize_method, + width_slider, + height_slider, + base_resolution, + generation_method, + length_slider, + overlap_video_length, + partial_video_length, + cfg_scale_slider, + start_image, + end_image, + validation_video, + validation_video_mask, + control_video, + denoise_strength, + seed_textbox, + ], + outputs=[result_image, result_video, infer_progress] + ) + return demo, controller + +def ui_host(GPU_memory_mode, scheduler_dict, model_name, model_type, ulysses_degree, ring_degree, weight_dtype, savedir_sample=None): + controller = CogVideoXFunController_Host( + GPU_memory_mode, scheduler_dict, model_name=model_name, model_type=model_type, + ulysses_degree=ulysses_degree, ring_degree=ring_degree, + config_path=None, enable_teacache=None, teacache_threshold=None, weight_dtype=weight_dtype, + savedir_sample=savedir_sample, + ) + + with gr.Blocks(css=css) as demo: + gr.Markdown( + """ + # CogVideoX-Fun + + A CogVideoX with more flexible generation conditions, capable of producing videos of different resolutions, around 6 seconds, and fps 8 (frames 1 to 49), as well as image generated videos. + + [Github](https://github.com/aigc-apps/CogVideoX-Fun/) + """ + ) + with gr.Column(variant="panel"): + model_type = create_fake_model_type(visible=True) + diffusion_transformer_dropdown = create_fake_model_checkpoints(model_name, visible=True) + base_model_dropdown, lora_model_dropdown, lora_alpha_slider = create_fake_finetune_models_checkpoints(visible=True) + + with gr.Column(variant="panel"): + prompt_textbox, negative_prompt_textbox = create_prompts() + + with gr.Row(): + with gr.Column(): + sampler_dropdown, sample_step_slider = create_samplers(controller) + + resize_method, width_slider, height_slider, base_resolution = create_height_width( + default_height = 384, default_width = 672, maximum_height = 1344, + maximum_width = 1344, + ) + gr.Markdown( + """ + V1.0 and V1.1 support up to 49 frames of video generation, while V1.5 supports up to 85 frames. 
+ (V1.0和V1.1支持最大49帧视频生成,V1.5支持最大85帧视频生成。) + """ + ) + generation_method, length_slider, overlap_video_length, partial_video_length = \ + create_generation_methods_and_video_length( + ["Video Generation", "Image Generation"], + default_video_length=49, + maximum_video_length=85, + ) + image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video = create_generation_method( + ["Text to Video (文本到视频)", "Image to Video (图片到视频)", "Video to Video (视频到视频)", "Video Control (视频控制)"], prompt_textbox + ) + cfg_scale_slider, seed_textbox, seed_button = create_cfg_and_seedbox(gradio_version_is_above_4) + + generate_button = gr.Button(value="Generate (生成)", variant='primary') + + result_image, result_video, infer_progress = create_ui_outputs() + + def upload_generation_method(generation_method): + if generation_method == "Video Generation": + return gr.update(visible=True, minimum=8, maximum=85, value=49, interactive=True) + elif generation_method == "Image Generation": + return gr.update(minimum=1, maximum=1, value=1, interactive=False) + generation_method.change( + upload_generation_method, generation_method, [length_slider] + ) + + def upload_source_method(source_method): + if source_method == "Text to Video (文本到视频)": + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None)] + elif source_method == "Image to Video (图片到视频)": + return [gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(), gr.update(), gr.update(value=None), gr.update(value=None), gr.update(value=None)] + elif source_method == "Video to Video (视频到视频)": + return [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(), gr.update(), gr.update(value=None)] + else: + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update()] + source_method.change( + upload_source_method, source_method, [ + image_to_video_col, video_to_video_col, control_video_col, start_image, end_image, + validation_video, validation_video_mask, control_video + ] + ) + + def upload_resize_method(resize_method): + if resize_method == "Generate by": + return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)] + else: + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)] + resize_method.change( + upload_resize_method, resize_method, [width_slider, height_slider, base_resolution] + ) + + generate_button.click( + fn=controller.generate, + inputs=[ + diffusion_transformer_dropdown, + base_model_dropdown, + lora_model_dropdown, + lora_alpha_slider, + prompt_textbox, + negative_prompt_textbox, + sampler_dropdown, + sample_step_slider, + resize_method, + width_slider, + height_slider, + base_resolution, + generation_method, + length_slider, + overlap_video_length, + partial_video_length, + cfg_scale_slider, + start_image, + end_image, + validation_video, + validation_video_mask, + control_video, + denoise_strength, + seed_textbox, + ], + outputs=[result_image, result_video, infer_progress] + ) + return demo, controller + +def ui_client(scheduler_dict, model_name, savedir_sample=None): + controller = 
CogVideoXFunController_Client(scheduler_dict, savedir_sample) + + with gr.Blocks(css=css) as demo: + gr.Markdown( + """ + # CogVideoX-Fun + + A CogVideoX with more flexible generation conditions, capable of producing videos of different resolutions, around 6 seconds, and fps 8 (frames 1 to 49), as well as image generated videos. + + [Github](https://github.com/aigc-apps/CogVideoX-Fun/) + """ + ) + with gr.Column(variant="panel"): + diffusion_transformer_dropdown = create_fake_model_checkpoints(model_name, visible=True) + base_model_dropdown, lora_model_dropdown, lora_alpha_slider = create_fake_finetune_models_checkpoints(visible=True) + + with gr.Column(variant="panel"): + prompt_textbox, negative_prompt_textbox = create_prompts() + + with gr.Row(): + with gr.Column(): + sampler_dropdown, sample_step_slider = create_samplers(controller, maximum_step=50) + + resize_method, width_slider, height_slider, base_resolution = create_fake_height_width( + default_height = 384, default_width = 672, maximum_height = 1344, + maximum_width = 1344, + ) + gr.Markdown( + """ + V1.0 and V1.1 support up to 49 frames of video generation, while V1.5 supports up to 85 frames. + (V1.0和V1.1支持最大49帧视频生成,V1.5支持最大85帧视频生成。) + """ + ) + generation_method, length_slider, overlap_video_length, partial_video_length = \ + create_generation_methods_and_video_length( + ["Video Generation", "Image Generation"], + default_video_length=49, + maximum_video_length=85, + ) + image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video = create_generation_method( + ["Text to Video (文本到视频)", "Image to Video (图片到视频)", "Video to Video (视频到视频)"], prompt_textbox + ) + + cfg_scale_slider, seed_textbox, seed_button = create_cfg_and_seedbox(gradio_version_is_above_4) + + generate_button = gr.Button(value="Generate (生成)", variant='primary') + + result_image, result_video, infer_progress = create_ui_outputs() + + def upload_generation_method(generation_method): + if generation_method == "Video Generation": + return gr.update(visible=True, minimum=5, maximum=85, value=49, interactive=True) + elif generation_method == "Image Generation": + return gr.update(minimum=1, maximum=1, value=1, interactive=False) + generation_method.change( + upload_generation_method, generation_method, [length_slider] + ) + + def upload_source_method(source_method): + if source_method == "Text to Video (文本到视频)": + return [gr.update(visible=False), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None)] + elif source_method == "Image to Video (图片到视频)": + return [gr.update(visible=True), gr.update(visible=False), gr.update(), gr.update(), gr.update(value=None), gr.update(value=None)] + else: + return [gr.update(visible=False), gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(), gr.update()] + source_method.change( + upload_source_method, source_method, [image_to_video_col, video_to_video_col, start_image, end_image, validation_video, validation_video_mask] + ) + + def upload_resize_method(resize_method): + if resize_method == "Generate by": + return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)] + else: + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)] + resize_method.change( + upload_resize_method, resize_method, [width_slider, height_slider, base_resolution] + ) + + 
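+        # The client UI exposes neither long-video generation nor video control, so
+        # overlap_video_length, partial_video_length and control_video are not passed
+        # to controller.generate here.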
generate_button.click( + fn=controller.generate, + inputs=[ + diffusion_transformer_dropdown, + base_model_dropdown, + lora_model_dropdown, + lora_alpha_slider, + prompt_textbox, + negative_prompt_textbox, + sampler_dropdown, + sample_step_slider, + resize_method, + width_slider, + height_slider, + base_resolution, + generation_method, + length_slider, + cfg_scale_slider, + start_image, + end_image, + validation_video, + validation_video_mask, + denoise_strength, + seed_textbox, + ], + outputs=[result_image, result_video, infer_progress] + ) + return demo, controller \ No newline at end of file diff --git a/videox_fun/ui/ui.py b/videox_fun/ui/ui.py new file mode 100644 index 0000000000000000000000000000000000000000..f984042e9f194ade5f96fc9f641b2f4640a95b47 --- /dev/null +++ b/videox_fun/ui/ui.py @@ -0,0 +1,290 @@ +import random + +import gradio as gr + + +def create_model_type(visible): + gr.Markdown( + """ + ### Model Type (模型的种类,正常模型还是控制模型). + """, + visible=visible, + ) + with gr.Row(): + model_type = gr.Dropdown( + label="The model type of the model (模型的种类,正常模型还是控制模型)", + choices=["Inpaint", "Control"], + value="Inpaint", + visible=visible, + interactive=True, + ) + return model_type + +def create_fake_model_type(visible): + gr.Markdown( + """ + ### Model Type (模型的种类,正常模型还是控制模型). + """, + visible=visible, + ) + with gr.Row(): + model_type = gr.Dropdown( + label="The model type of the model (模型的种类,正常模型还是控制模型)", + choices=["Inpaint", "Control"], + value="Inpaint", + interactive=False, + visible=visible, + ) + return model_type + +def create_model_checkpoints(controller, visible): + gr.Markdown( + """ + ### Model checkpoints (模型路径). + """ + ) + with gr.Row(visible=visible): + diffusion_transformer_dropdown = gr.Dropdown( + label="Pretrained Model Path (预训练模型路径)", + choices=controller.diffusion_transformer_list, + value="none", + interactive=True, + ) + diffusion_transformer_dropdown.change( + fn=controller.update_diffusion_transformer, + inputs=[diffusion_transformer_dropdown], + outputs=[diffusion_transformer_dropdown] + ) + + diffusion_transformer_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton") + def refresh_diffusion_transformer(): + controller.refresh_diffusion_transformer() + return gr.update(choices=controller.diffusion_transformer_list) + diffusion_transformer_refresh_button.click(fn=refresh_diffusion_transformer, inputs=[], outputs=[diffusion_transformer_dropdown]) + + return diffusion_transformer_dropdown, diffusion_transformer_refresh_button + +def create_fake_model_checkpoints(model_name, visible): + gr.Markdown( + """ + ### Model checkpoints (模型路径). 
+ """ + ) + with gr.Row(visible=visible): + diffusion_transformer_dropdown = gr.Dropdown( + label="Pretrained Model Path (预训练模型路径)", + choices=[model_name], + value=model_name, + interactive=False, + ) + return diffusion_transformer_dropdown + +def create_finetune_models_checkpoints(controller, visible): + with gr.Row(visible=visible): + base_model_dropdown = gr.Dropdown( + label="Select base Dreambooth model (选择基模型[非必需])", + choices=controller.personalized_model_list, + value="none", + interactive=True, + ) + + lora_model_dropdown = gr.Dropdown( + label="Select LoRA model (选择LoRA模型[非必需])", + choices=["none"] + controller.personalized_model_list, + value="none", + interactive=True, + ) + + lora_alpha_slider = gr.Slider(label="LoRA alpha (LoRA权重)", value=0.55, minimum=0, maximum=2, interactive=True) + + personalized_refresh_button = gr.Button(value="\U0001F503", elem_classes="toolbutton") + def update_personalized_model(): + controller.refresh_personalized_model() + return [ + gr.update(choices=controller.personalized_model_list), + gr.update(choices=["none"] + controller.personalized_model_list) + ] + personalized_refresh_button.click(fn=update_personalized_model, inputs=[], outputs=[base_model_dropdown, lora_model_dropdown]) + + return base_model_dropdown, lora_model_dropdown, lora_alpha_slider, personalized_refresh_button + +def create_fake_finetune_models_checkpoints(visible): + with gr.Row(): + base_model_dropdown = gr.Dropdown( + label="Select base Dreambooth model (选择基模型[非必需])", + choices=["none"], + value="none", + interactive=False, + visible=False + ) + with gr.Column(visible=False): + gr.Markdown( + """ + ### Minimalism is an example portrait of Lora, triggered by specific prompt words. More details can be found on [Wiki](https://github.com/aigc-apps/CogVideoX-Fun/wiki/Training-Lora). + """ + ) + with gr.Row(): + lora_model_dropdown = gr.Dropdown( + label="Select LoRA model", + choices=["none"], + value="none", + interactive=True, + ) + + lora_alpha_slider = gr.Slider(label="LoRA alpha (LoRA权重)", value=0.55, minimum=0, maximum=2, interactive=True) + + return base_model_dropdown, lora_model_dropdown, lora_alpha_slider + +def create_prompts( + prompt="A young woman with beautiful and clear eyes and blonde hair standing and white dress in a forest wearing a crown. She seems to be lost in thought, and the camera focuses on her face. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.", + negative_prompt="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion. " +): + gr.Markdown( + """ + ### Configs for Generation (生成参数配置). 
+ """ + ) + + prompt_textbox = gr.Textbox(label="Prompt (正向提示词)", lines=2, value=prompt) + negative_prompt_textbox = gr.Textbox(label="Negative prompt (负向提示词)", lines=2, value=negative_prompt) + return prompt_textbox, negative_prompt_textbox + +def create_samplers(controller, maximum_step=100): + with gr.Row(): + sampler_dropdown = gr.Dropdown(label="Sampling method (采样器种类)", choices=list(controller.scheduler_dict.keys()), value=list(controller.scheduler_dict.keys())[0]) + sample_step_slider = gr.Slider(label="Sampling steps (生成步数)", value=50, minimum=10, maximum=maximum_step, step=1) + + return sampler_dropdown, sample_step_slider + +def create_height_width(default_height, default_width, maximum_height, maximum_width): + resize_method = gr.Radio( + ["Generate by", "Resize according to Reference"], + value="Generate by", + show_label=False, + ) + width_slider = gr.Slider(label="Width (视频宽度)", value=default_width, minimum=128, maximum=maximum_width, step=16) + height_slider = gr.Slider(label="Height (视频高度)", value=default_height, minimum=128, maximum=maximum_height, step=16) + base_resolution = gr.Radio(label="Base Resolution of Pretrained Models", value=512, choices=[512, 768, 960], visible=False) + + return resize_method, width_slider, height_slider, base_resolution + +def create_fake_height_width(default_height, default_width, maximum_height, maximum_width): + resize_method = gr.Radio( + ["Generate by", "Resize according to Reference"], + value="Generate by", + show_label=False, + ) + width_slider = gr.Slider(label="Width (视频宽度)", value=default_width, minimum=128, maximum=maximum_width, step=16, interactive=False) + height_slider = gr.Slider(label="Height (视频高度)", value=default_height, minimum=128, maximum=maximum_height, step=16, interactive=False) + base_resolution = gr.Radio(label="Base Resolution of Pretrained Models", value=512, choices=[512, 768, 960], interactive=False, visible=False) + + return resize_method, width_slider, height_slider, base_resolution + +def create_generation_methods_and_video_length( + generation_method_options, + default_video_length, + maximum_video_length +): + with gr.Group(): + generation_method = gr.Radio( + generation_method_options, + value="Video Generation", + show_label=False, + ) + with gr.Row(): + length_slider = gr.Slider(label="Animation length (视频帧数)", value=default_video_length, minimum=1, maximum=maximum_video_length, step=4) + overlap_video_length = gr.Slider(label="Overlap length (视频续写的重叠帧数)", value=4, minimum=1, maximum=4, step=1, visible=False) + partial_video_length = gr.Slider(label="Partial video generation length (每个部分的视频生成帧数)", value=25, minimum=5, maximum=maximum_video_length, step=4, visible=False) + + return generation_method, length_slider, overlap_video_length, partial_video_length + +def create_generation_method(source_method_options, prompt_textbox, support_end_image=True): + source_method = gr.Radio( + source_method_options, + value="Text to Video (文本到视频)", + show_label=False, + ) + with gr.Column(visible = False) as image_to_video_col: + start_image = gr.Image( + label="The image at the beginning of the video (图片到视频的开始图片)", show_label=True, + elem_id="i2v_start", sources="upload", type="filepath", + ) + + template_gallery_path = ["asset/1.png", "asset/2.png", "asset/3.png", "asset/4.png", "asset/5.png"] + def select_template(evt: gr.SelectData): + text = { + "asset/1.png": "A brown dog is shaking its head and sitting on a light colored sofa in a comfortable room. 
Behind the dog, there is a framed painting on the shelf surrounded by pink flowers. The soft and warm lighting in the room creates a comfortable atmosphere.", + "asset/2.png": "A sailboat navigates through moderately rough seas, with waves and ocean spray visible. The sailboat features a white hull and sails, accompanied by an orange sail catching the wind. The sky above shows dramatic, cloudy formations with a sunset or sunrise backdrop, casting warm colors across the scene. The water reflects the golden light, enhancing the visual contrast between the dark ocean and the bright horizon. The camera captures the scene with a dynamic and immersive angle, showcasing the movement of the boat and the energy of the ocean.", + "asset/3.png": "A stunningly beautiful woman with flowing long hair stands gracefully, her elegant dress rippling and billowing in the gentle wind. Petals falling off. Her serene expression and the natural movement of her attire create an enchanting and captivating scene, full of ethereal charm.", + "asset/4.png": "An astronaut, clad in a full space suit with a helmet, plays an electric guitar while floating in a cosmic environment filled with glowing particles and rocky textures. The scene is illuminated by a warm light source, creating dramatic shadows and contrasts. The background features a complex geometry, similar to a space station or an alien landscape, indicating a futuristic or otherworldly setting.", + "asset/5.png": "Fireworks light up the evening sky over a sprawling cityscape with gothic-style buildings featuring pointed towers and clock faces. The city is lit by both artificial lights from the buildings and the colorful bursts of the fireworks. The scene is viewed from an elevated angle, showcasing a vibrant urban environment set against a backdrop of a dramatic, partially cloudy sky at dusk.", + }[template_gallery_path[evt.index]] + return template_gallery_path[evt.index], text + + template_gallery = gr.Gallery( + template_gallery_path, + columns=5, rows=1, + height=140, + allow_preview=False, + container=False, + label="Template Examples", + ) + template_gallery.select(select_template, None, [start_image, prompt_textbox]) + + with gr.Accordion("The image at the ending of the video (图片到视频的结束图片[非必需, Optional])", open=False, visible=support_end_image): + end_image = gr.Image(label="The image at the ending of the video (图片到视频的结束图片[非必需, Optional])", show_label=False, elem_id="i2v_end", sources="upload", type="filepath") + + with gr.Column(visible = False) as video_to_video_col: + with gr.Row(): + validation_video = gr.Video( + label="The video to convert (视频转视频的参考视频)", show_label=True, + elem_id="v2v", sources="upload", + ) + with gr.Accordion("The mask of the video to inpaint (视频重新绘制的mask[非必需, Optional])", open=False): + gr.Markdown( + """ + - Please set a larger denoise_strength when using validation_video_mask, such as 1.00 instead of 0.70 + (请设置更大的denoise_strength,当使用validation_video_mask的时候,比如1而不是0.70) + """ + ) + validation_video_mask = gr.Image( + label="The mask of the video to inpaint (视频重新绘制的mask[非必需, Optional])", + show_label=False, elem_id="v2v_mask", sources="upload", type="filepath" + ) + denoise_strength = gr.Slider(label="Denoise strength (重绘系数)", value=0.70, minimum=0.10, maximum=1.00, step=0.01) + + with gr.Column(visible = False) as control_video_col: + gr.Markdown( + """ + Demo pose control video can be downloaded here [URL](https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/cogvideox_fun/asset/v1.1/pose.mp4). 
+ """ + ) + control_video = gr.Video( + label="The control video (用于提供控制信号的video)", show_label=True, + elem_id="v2v_control", sources="upload", + ) + return image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video + +def create_cfg_and_seedbox(gradio_version_is_above_4): + cfg_scale_slider = gr.Slider(label="CFG Scale (引导系数)", value=6.0, minimum=0, maximum=20) + + with gr.Row(): + seed_textbox = gr.Textbox(label="Seed (随机种子)", value=43) + seed_button = gr.Button(value="\U0001F3B2", elem_classes="toolbutton") + seed_button.click( + fn=lambda: gr.Textbox(value=random.randint(1, 1e8)) if gradio_version_is_above_4 else gr.Textbox.update(value=random.randint(1, 1e8)), + inputs=[], + outputs=[seed_textbox] + ) + return cfg_scale_slider, seed_textbox, seed_button + +def create_ui_outputs(): + with gr.Column(): + result_image = gr.Image(label="Generated Image (生成图片)", interactive=False, visible=False) + result_video = gr.Video(label="Generated Animation (生成视频)", interactive=False) + infer_progress = gr.Textbox( + label="Generation Info (生成信息)", + value="No task currently", + interactive=False + ) + return result_image, result_video, infer_progress diff --git a/videox_fun/ui/wan_fun_ui.py b/videox_fun/ui/wan_fun_ui.py new file mode 100644 index 0000000000000000000000000000000000000000..bc67205bbae1c25e37722d985784c1fa7f4bcd39 --- /dev/null +++ b/videox_fun/ui/wan_fun_ui.py @@ -0,0 +1,630 @@ +"""Modified from https://github.com/guoyww/AnimateDiff/blob/main/app.py +""" +import os +import random + +import cv2 +import gradio as gr +import numpy as np +import torch +from omegaconf import OmegaConf +from PIL import Image +from safetensors import safe_open + +from ..data.bucket_sampler import ASPECT_RATIO_512, get_closest_ratio +from ..models import (AutoencoderKLWan, AutoTokenizer, CLIPModel, + WanT5EncoderModel, WanTransformer3DModel) +from ..models.cache_utils import get_teacache_coefficients +from ..pipeline import WanFunInpaintPipeline, WanFunPipeline, WanFunControlPipeline +from ..utils.fp8_optimization import (convert_model_weight_to_float8, + convert_weight_dtype_wrapper, + replace_parameters_by_name) +from ..utils.lora_utils import merge_lora, unmerge_lora +from ..utils.utils import (filter_kwargs, get_image_to_video_latent, + get_video_to_video_latent, save_videos_grid) +from .controller import (Fun_Controller, Fun_Controller_Client, + all_cheduler_dict, css, ddpm_scheduler_dict, + flow_scheduler_dict, gradio_version, + gradio_version_is_above_4) +from .ui import (create_cfg_and_seedbox, + create_fake_finetune_models_checkpoints, + create_fake_height_width, create_fake_model_checkpoints, + create_fake_model_type, create_finetune_models_checkpoints, + create_generation_method, + create_generation_methods_and_video_length, + create_height_width, create_model_checkpoints, + create_model_type, create_prompts, create_samplers, + create_ui_outputs) + + +class Wan_Fun_Controller(Fun_Controller): + def update_diffusion_transformer(self, diffusion_transformer_dropdown): + print("Update diffusion transformer") + self.diffusion_transformer_dropdown = diffusion_transformer_dropdown + if diffusion_transformer_dropdown == "none": + return gr.update() + self.vae = AutoencoderKLWan.from_pretrained( + os.path.join(diffusion_transformer_dropdown, self.config['vae_kwargs'].get('vae_subpath', 'vae')), + additional_kwargs=OmegaConf.to_container(self.config['vae_kwargs']), + 
).to(self.weight_dtype) + + # Get Transformer + self.transformer = WanTransformer3DModel.from_pretrained( + os.path.join(diffusion_transformer_dropdown, self.config['transformer_additional_kwargs'].get('transformer_subpath', 'transformer')), + transformer_additional_kwargs=OmegaConf.to_container(self.config['transformer_additional_kwargs']), + low_cpu_mem_usage=True, + torch_dtype=self.weight_dtype, + ) + + # Get Tokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + os.path.join(diffusion_transformer_dropdown, self.config['text_encoder_kwargs'].get('tokenizer_subpath', 'tokenizer')), + ) + + # Get Text encoder + self.text_encoder = WanT5EncoderModel.from_pretrained( + os.path.join(diffusion_transformer_dropdown, self.config['text_encoder_kwargs'].get('text_encoder_subpath', 'text_encoder')), + additional_kwargs=OmegaConf.to_container(self.config['text_encoder_kwargs']), + low_cpu_mem_usage=True, + torch_dtype=self.weight_dtype, + ) + self.text_encoder = self.text_encoder.eval() + + if self.transformer.config.in_channels != self.vae.config.latent_channels: + # Get Clip Image Encoder + self.clip_image_encoder = CLIPModel.from_pretrained( + os.path.join(diffusion_transformer_dropdown, self.config['image_encoder_kwargs'].get('image_encoder_subpath', 'image_encoder')), + ).to(self.weight_dtype) + self.clip_image_encoder = self.clip_image_encoder.eval() + else: + self.clip_image_encoder = None + + Choosen_Scheduler = self.scheduler_dict[list(self.scheduler_dict.keys())[0]] + self.scheduler = Choosen_Scheduler( + **filter_kwargs(Choosen_Scheduler, OmegaConf.to_container(self.config['scheduler_kwargs'])) + ) + + # Get pipeline + if self.model_type == "Inpaint": + if self.transformer.config.in_channels != self.vae.config.latent_channels: + self.pipeline = WanFunInpaintPipeline( + vae=self.vae, + tokenizer=self.tokenizer, + text_encoder=self.text_encoder, + transformer=self.transformer, + scheduler=self.scheduler, + clip_image_encoder=self.clip_image_encoder, + ) + else: + self.pipeline = WanFunPipeline( + vae=self.vae, + tokenizer=self.tokenizer, + text_encoder=self.text_encoder, + transformer=self.transformer, + scheduler=self.scheduler, + ) + else: + self.pipeline = WanFunControlPipeline( + vae=self.vae, + tokenizer=self.tokenizer, + text_encoder=self.text_encoder, + transformer=self.transformer, + scheduler=self.scheduler, + clip_image_encoder=self.clip_image_encoder, + ) + + if self.ulysses_degree > 1 or self.ring_degree > 1: + self.transformer.enable_multi_gpus_inference() + + if self.GPU_memory_mode == "sequential_cpu_offload": + replace_parameters_by_name(self.transformer, ["modulation",], device=self.device) + self.transformer.freqs = self.transformer.freqs.to(device=self.device) + self.pipeline.enable_sequential_cpu_offload(device=self.device) + elif self.GPU_memory_mode == "model_cpu_offload_and_qfloat8": + convert_model_weight_to_float8(self.transformer, exclude_module_name=["modulation",]) + convert_weight_dtype_wrapper(self.transformer, self.weight_dtype) + self.pipeline.enable_model_cpu_offload(device=self.device) + elif self.GPU_memory_mode == "model_cpu_offload": + self.pipeline.enable_model_cpu_offload(device=self.device) + else: + self.pipeline.to(self.device) + print("Update diffusion transformer done") + return gr.update() + + def generate( + self, + diffusion_transformer_dropdown, + base_model_dropdown, + lora_model_dropdown, + lora_alpha_slider, + prompt_textbox, + negative_prompt_textbox, + sampler_dropdown, + sample_step_slider, + resize_method, + width_slider, + 
height_slider, + base_resolution, + generation_method, + length_slider, + overlap_video_length, + partial_video_length, + cfg_scale_slider, + start_image, + end_image, + validation_video, + validation_video_mask, + control_video, + denoise_strength, + seed_textbox, + is_api = False, + ): + self.clear_cache() + + self.input_check( + resize_method, generation_method, start_image, end_image, validation_video,control_video, is_api + ) + is_image = True if generation_method == "Image Generation" else False + + if self.base_model_path != base_model_dropdown: + self.update_base_model(base_model_dropdown) + + if self.lora_model_path != lora_model_dropdown: + self.update_lora_model(lora_model_dropdown) + + self.pipeline.scheduler = self.scheduler_dict[sampler_dropdown].from_config(self.pipeline.scheduler.config) + + if resize_method == "Resize according to Reference": + height_slider, width_slider = self.get_height_width_from_reference( + base_resolution, start_image, validation_video, control_video, + ) + if self.lora_model_path != "none": + # lora part + self.pipeline = merge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider) + + coefficients = get_teacache_coefficients(self.base_model_path) if self.enable_teacache else None + if coefficients is not None: + print(f"Enable TeaCache with threshold {self.teacache_threshold} and skip the first {self.num_skip_start_steps} steps.") + self.pipeline.transformer.enable_teacache( + coefficients, sample_step_slider, self.teacache_threshold, num_skip_start_steps=self.num_skip_start_steps, offload=self.teacache_offload + ) + + if int(seed_textbox) != -1 and seed_textbox != "": torch.manual_seed(int(seed_textbox)) + else: seed_textbox = np.random.randint(0, 1e10) + generator = torch.Generator(device=self.device).manual_seed(int(seed_textbox)) + + if self.enable_riflex: + latent_frames = (int(length_slider) - 1) // self.vae.config.temporal_compression_ratio + 1 + self.pipeline.transformer.enable_riflex(k = self.riflex_k, L_test = latent_frames if not is_image else 1) + + try: + if self.model_type == "Inpaint": + if self.transformer.config.in_channels != self.vae.config.latent_channels: + if validation_video is not None: + input_video, input_video_mask, ref_image, clip_image = get_video_to_video_latent(validation_video, length_slider if not is_image else 1, sample_size=(height_slider, width_slider), validation_video_mask=validation_video_mask, fps=16) + else: + input_video, input_video_mask, clip_image = get_image_to_video_latent(start_image, end_image, length_slider if not is_image else 1, sample_size=(height_slider, width_slider)) + + sample = self.pipeline( + prompt_textbox, + negative_prompt = negative_prompt_textbox, + num_inference_steps = sample_step_slider, + guidance_scale = cfg_scale_slider, + width = width_slider, + height = height_slider, + num_frames = length_slider if not is_image else 1, + generator = generator, + + video = input_video, + mask_video = input_video_mask, + clip_image = clip_image + ).videos + else: + sample = self.pipeline( + prompt_textbox, + negative_prompt = negative_prompt_textbox, + num_inference_steps = sample_step_slider, + guidance_scale = cfg_scale_slider, + width = width_slider, + height = height_slider, + num_frames = length_slider if not is_image else 1, + generator = generator + ).videos + else: + input_video, input_video_mask, ref_image, clip_image = get_video_to_video_latent(control_video, length_slider if not is_image else 1, sample_size=(height_slider, width_slider), fps=16) + + sample = 
self.pipeline( + prompt_textbox, + negative_prompt = negative_prompt_textbox, + num_inference_steps = sample_step_slider, + guidance_scale = cfg_scale_slider, + width = width_slider, + height = height_slider, + num_frames = length_slider if not is_image else 1, + generator = generator, + + control_video = input_video, + ).videos + except Exception as e: + self.clear_cache() + if self.lora_model_path != "none": + self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider) + if is_api: + return "", f"Error. error information is {str(e)}" + else: + return gr.update(), gr.update(), f"Error. error information is {str(e)}" + + self.clear_cache() + # lora part + if self.lora_model_path != "none": + self.pipeline = unmerge_lora(self.pipeline, self.lora_model_path, multiplier=lora_alpha_slider) + + save_sample_path = self.save_outputs( + is_image, length_slider, sample, fps=16 + ) + + if is_image or length_slider == 1: + if is_api: + return save_sample_path, "Success" + else: + if gradio_version_is_above_4: + return gr.Image(value=save_sample_path, visible=True), gr.Video(value=None, visible=False), "Success" + else: + return gr.Image.update(value=save_sample_path, visible=True), gr.Video.update(value=None, visible=False), "Success" + else: + if is_api: + return save_sample_path, "Success" + else: + if gradio_version_is_above_4: + return gr.Image(visible=False, value=None), gr.Video(value=save_sample_path, visible=True), "Success" + else: + return gr.Image.update(visible=False, value=None), gr.Video.update(value=save_sample_path, visible=True), "Success" + +Wan_Fun_Controller_Host = Wan_Fun_Controller +Wan_Fun_Controller_Client = Fun_Controller_Client + +def ui(GPU_memory_mode, scheduler_dict, config_path, ulysses_degree, ring_degree, enable_teacache, teacache_threshold, num_skip_start_steps, teacache_offload, enable_riflex, riflex_k, weight_dtype, savedir_sample=None): + controller = Wan_Fun_Controller( + GPU_memory_mode, scheduler_dict, model_name=None, model_type="Inpaint", + config_path=config_path, ulysses_degree=ulysses_degree, ring_degree=ring_degree, + enable_teacache=enable_teacache, teacache_threshold=teacache_threshold, + num_skip_start_steps=num_skip_start_steps, teacache_offload=teacache_offload, + enable_riflex=enable_riflex, riflex_k=riflex_k, weight_dtype=weight_dtype, + savedir_sample=savedir_sample, + ) + + with gr.Blocks(css=css) as demo: + gr.Markdown( + """ + # Wan-Fun: + + A Wan with more flexible generation conditions, capable of producing videos of different resolutions, around 6 seconds, and fps 8 (frames 1 to 81), as well as image generated videos. 
+ + [Github](https://github.com/aigc-apps/CogVideoX-Fun/) + """ + ) + with gr.Column(variant="panel"): + model_type = create_model_type(visible=True) + diffusion_transformer_dropdown, diffusion_transformer_refresh_button = \ + create_model_checkpoints(controller, visible=True) + base_model_dropdown, lora_model_dropdown, lora_alpha_slider, personalized_refresh_button = \ + create_finetune_models_checkpoints(controller, visible=True) + + with gr.Column(variant="panel"): + prompt_textbox, negative_prompt_textbox = create_prompts(negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走") + + with gr.Row(): + with gr.Column(): + sampler_dropdown, sample_step_slider = create_samplers(controller) + + resize_method, width_slider, height_slider, base_resolution = create_height_width( + default_height = 480, default_width = 832, maximum_height = 1344, + maximum_width = 1344, + ) + generation_method, length_slider, overlap_video_length, partial_video_length = \ + create_generation_methods_and_video_length( + ["Video Generation", "Image Generation"], + default_video_length=81, + maximum_video_length=81, + ) + image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video = create_generation_method( + ["Text to Video (文本到视频)", "Image to Video (图片到视频)"], prompt_textbox + ) + cfg_scale_slider, seed_textbox, seed_button = create_cfg_and_seedbox(gradio_version_is_above_4) + + generate_button = gr.Button(value="Generate (生成)", variant='primary') + + result_image, result_video, infer_progress = create_ui_outputs() + + model_type.change( + fn=controller.update_model_type, + inputs=[model_type], + outputs=[] + ) + + def upload_generation_method(generation_method): + if generation_method == "Video Generation": + return [gr.update(visible=True, maximum=81, value=81, interactive=True), gr.update(visible=False), gr.update(visible=False)] + elif generation_method == "Image Generation": + return [gr.update(minimum=1, maximum=1, value=1, interactive=False), gr.update(visible=False), gr.update(visible=False)] + else: + return [gr.update(visible=True, maximum=1344), gr.update(visible=True), gr.update(visible=True)] + generation_method.change( + upload_generation_method, generation_method, [length_slider, overlap_video_length, partial_video_length] + ) + + def upload_source_method(source_method): + if source_method == "Text to Video (文本到视频)": + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None)] + elif source_method == "Image to Video (图片到视频)": + return [gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(), gr.update(), gr.update(value=None), gr.update(value=None), gr.update(value=None)] + elif source_method == "Video to Video (视频到视频)": + return [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(), gr.update(), gr.update(value=None)] + else: + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update()] + source_method.change( + upload_source_method, source_method, [ + image_to_video_col, video_to_video_col, 
control_video_col, start_image, end_image, + validation_video, validation_video_mask, control_video + ] + ) + + def upload_resize_method(resize_method): + if resize_method == "Generate by": + return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)] + else: + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)] + resize_method.change( + upload_resize_method, resize_method, [width_slider, height_slider, base_resolution] + ) + + generate_button.click( + fn=controller.generate, + inputs=[ + diffusion_transformer_dropdown, + base_model_dropdown, + lora_model_dropdown, + lora_alpha_slider, + prompt_textbox, + negative_prompt_textbox, + sampler_dropdown, + sample_step_slider, + resize_method, + width_slider, + height_slider, + base_resolution, + generation_method, + length_slider, + overlap_video_length, + partial_video_length, + cfg_scale_slider, + start_image, + end_image, + validation_video, + validation_video_mask, + control_video, + denoise_strength, + seed_textbox, + ], + outputs=[result_image, result_video, infer_progress] + ) + return demo, controller + +def ui_host(GPU_memory_mode, scheduler_dict, model_name, model_type, config_path, ulysses_degree, ring_degree, enable_teacache, teacache_threshold, num_skip_start_steps, teacache_offload, enable_riflex, riflex_k, weight_dtype, savedir_sample=None): + controller = Wan_Fun_Controller_Host( + GPU_memory_mode, scheduler_dict, model_name=model_name, model_type=model_type, + config_path=config_path, ulysses_degree=ulysses_degree, ring_degree=ring_degree, + enable_teacache=enable_teacache, teacache_threshold=teacache_threshold, + num_skip_start_steps=num_skip_start_steps, teacache_offload=teacache_offload, + enable_riflex=enable_riflex, riflex_k=riflex_k, weight_dtype=weight_dtype, + savedir_sample=savedir_sample, + ) + + with gr.Blocks(css=css) as demo: + gr.Markdown( + """ + # Wan-Fun: + + A Wan with more flexible generation conditions, capable of producing videos of different resolutions, around 6 seconds, and fps 8 (frames 1 to 81), as well as image generated videos. 
+ + [Github](https://github.com/aigc-apps/CogVideoX-Fun/) + """ + ) + with gr.Column(variant="panel"): + model_type = create_fake_model_type(visible=True) + diffusion_transformer_dropdown = create_fake_model_checkpoints(model_name, visible=True) + base_model_dropdown, lora_model_dropdown, lora_alpha_slider = create_fake_finetune_models_checkpoints(visible=True) + + with gr.Column(variant="panel"): + prompt_textbox, negative_prompt_textbox = create_prompts(negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走") + + with gr.Row(): + with gr.Column(): + sampler_dropdown, sample_step_slider = create_samplers(controller) + + resize_method, width_slider, height_slider, base_resolution = create_height_width( + default_height = 480, default_width = 832, maximum_height = 1344, + maximum_width = 1344, + ) + generation_method, length_slider, overlap_video_length, partial_video_length = \ + create_generation_methods_and_video_length( + ["Video Generation", "Image Generation"], + default_video_length=81, + maximum_video_length=81, + ) + image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video = create_generation_method( + ["Text to Video (文本到视频)", "Image to Video (图片到视频)"], prompt_textbox + ) + cfg_scale_slider, seed_textbox, seed_button = create_cfg_and_seedbox(gradio_version_is_above_4) + + generate_button = gr.Button(value="Generate (生成)", variant='primary') + + result_image, result_video, infer_progress = create_ui_outputs() + + def upload_generation_method(generation_method): + if generation_method == "Video Generation": + return gr.update(visible=True, minimum=1, maximum=81, value=81, interactive=True) + elif generation_method == "Image Generation": + return gr.update(minimum=1, maximum=1, value=1, interactive=False) + generation_method.change( + upload_generation_method, generation_method, [length_slider] + ) + + def upload_source_method(source_method): + if source_method == "Text to Video (文本到视频)": + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None)] + elif source_method == "Image to Video (图片到视频)": + return [gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(), gr.update(), gr.update(value=None), gr.update(value=None), gr.update(value=None)] + elif source_method == "Video to Video (视频到视频)": + return [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(), gr.update(), gr.update(value=None)] + else: + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update()] + source_method.change( + upload_source_method, source_method, [ + image_to_video_col, video_to_video_col, control_video_col, start_image, end_image, + validation_video, validation_video_mask, control_video + ] + ) + + def upload_resize_method(resize_method): + if resize_method == "Generate by": + return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)] + else: + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)] + resize_method.change( + upload_resize_method, 
resize_method, [width_slider, height_slider, base_resolution] + ) + + generate_button.click( + fn=controller.generate, + inputs=[ + diffusion_transformer_dropdown, + base_model_dropdown, + lora_model_dropdown, + lora_alpha_slider, + prompt_textbox, + negative_prompt_textbox, + sampler_dropdown, + sample_step_slider, + resize_method, + width_slider, + height_slider, + base_resolution, + generation_method, + length_slider, + overlap_video_length, + partial_video_length, + cfg_scale_slider, + start_image, + end_image, + validation_video, + validation_video_mask, + control_video, + denoise_strength, + seed_textbox, + ], + outputs=[result_image, result_video, infer_progress] + ) + return demo, controller + +def ui_client(scheduler_dict, model_name, savedir_sample=None): + controller = Wan_Fun_Controller_Client(scheduler_dict, savedir_sample) + + with gr.Blocks(css=css) as demo: + gr.Markdown( + """ + # Wan-Fun: + + A Wan with more flexible generation conditions, capable of producing videos of different resolutions, around 6 seconds, and fps 8 (frames 1 to 81), as well as image generated videos. + + [Github](https://github.com/aigc-apps/CogVideoX-Fun/) + """ + ) + with gr.Column(variant="panel"): + diffusion_transformer_dropdown = create_fake_model_checkpoints(model_name, visible=True) + base_model_dropdown, lora_model_dropdown, lora_alpha_slider = create_fake_finetune_models_checkpoints(visible=True) + + with gr.Column(variant="panel"): + prompt_textbox, negative_prompt_textbox = create_prompts(negative_prompt="色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走") + + with gr.Row(): + with gr.Column(): + sampler_dropdown, sample_step_slider = create_samplers(controller, maximum_step=50) + + resize_method, width_slider, height_slider, base_resolution = create_fake_height_width( + default_height = 480, default_width = 832, maximum_height = 1344, + maximum_width = 1344, + ) + generation_method, length_slider, overlap_video_length, partial_video_length = \ + create_generation_methods_and_video_length( + ["Video Generation", "Image Generation"], + default_video_length=81, + maximum_video_length=81, + ) + image_to_video_col, video_to_video_col, control_video_col, source_method, start_image, template_gallery, end_image, validation_video, validation_video_mask, denoise_strength, control_video = create_generation_method( + ["Text to Video (文本到视频)", "Image to Video (图片到视频)"], prompt_textbox + ) + + cfg_scale_slider, seed_textbox, seed_button = create_cfg_and_seedbox(gradio_version_is_above_4) + + generate_button = gr.Button(value="Generate (生成)", variant='primary') + + result_image, result_video, infer_progress = create_ui_outputs() + + def upload_generation_method(generation_method): + if generation_method == "Video Generation": + return gr.update(visible=True, minimum=5, maximum=81, value=49, interactive=True) + elif generation_method == "Image Generation": + return gr.update(minimum=1, maximum=1, value=1, interactive=False) + generation_method.change( + upload_generation_method, generation_method, [length_slider] + ) + + def upload_source_method(source_method): + if source_method == "Text to Video (文本到视频)": + return [gr.update(visible=False), gr.update(visible=False), gr.update(value=None), gr.update(value=None), gr.update(value=None), gr.update(value=None)] + elif source_method == "Image to Video (图片到视频)": + return [gr.update(visible=True), gr.update(visible=False), gr.update(), gr.update(), gr.update(value=None), 
gr.update(value=None)] + else: + return [gr.update(visible=False), gr.update(visible=True), gr.update(value=None), gr.update(value=None), gr.update(), gr.update()] + source_method.change( + upload_source_method, source_method, [image_to_video_col, video_to_video_col, start_image, end_image, validation_video, validation_video_mask] + ) + + def upload_resize_method(resize_method): + if resize_method == "Generate by": + return [gr.update(visible=True), gr.update(visible=True), gr.update(visible=False)] + else: + return [gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)] + resize_method.change( + upload_resize_method, resize_method, [width_slider, height_slider, base_resolution] + ) + + generate_button.click( + fn=controller.generate, + inputs=[ + diffusion_transformer_dropdown, + base_model_dropdown, + lora_model_dropdown, + lora_alpha_slider, + prompt_textbox, + negative_prompt_textbox, + sampler_dropdown, + sample_step_slider, + resize_method, + width_slider, + height_slider, + base_resolution, + generation_method, + length_slider, + cfg_scale_slider, + start_image, + end_image, + validation_video, + validation_video_mask, + denoise_strength, + seed_textbox, + ], + outputs=[result_image, result_video, infer_progress] + ) + return demo, controller \ No newline at end of file diff --git a/videox_fun/utils/__init__.py b/videox_fun/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/videox_fun/utils/discrete_sampler.py b/videox_fun/utils/discrete_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..149dbe7beb94dfea2e6fe0ca3b5acf9437be60f7 --- /dev/null +++ b/videox_fun/utils/discrete_sampler.py @@ -0,0 +1,46 @@ +"""Modified from https://github.com/THUDM/CogVideo/blob/3710a612d8760f5cdb1741befeebb65b9e0f2fe0/sat/sgm/modules/diffusionmodules/sigma_sampling.py +""" +import torch + +class DiscreteSampling: + def __init__(self, num_idx, uniform_sampling=False): + self.num_idx = num_idx + self.uniform_sampling = uniform_sampling + self.is_distributed = torch.distributed.is_available() and torch.distributed.is_initialized() + + if self.is_distributed and self.uniform_sampling: + world_size = torch.distributed.get_world_size() + self.rank = torch.distributed.get_rank() + + i = 1 + while True: + if world_size % i != 0 or num_idx % (world_size // i) != 0: + i += 1 + else: + self.group_num = world_size // i + break + assert self.group_num > 0 + assert world_size % self.group_num == 0 + # the number of rank in one group + self.group_width = world_size // self.group_num + self.sigma_interval = self.num_idx // self.group_num + print('rank=%d world_size=%d group_num=%d group_width=%d sigma_interval=%s' % ( + self.rank, world_size, self.group_num, + self.group_width, self.sigma_interval)) + + def __call__(self, n_samples, generator=None, device=None): + if self.is_distributed and self.uniform_sampling: + group_index = self.rank // self.group_width + idx = torch.randint( + group_index * self.sigma_interval, + (group_index + 1) * self.sigma_interval, + (n_samples,), + generator=generator, device=device, + ) + print('proc[%d] idx=%s' % (self.rank, idx)) + else: + idx = torch.randint( + 0, self.num_idx, (n_samples,), + generator=generator, device=device, + ) + return idx \ No newline at end of file diff --git a/videox_fun/utils/fp8_optimization.py b/videox_fun/utils/fp8_optimization.py new file mode 100644 index 
0000000000000000000000000000000000000000..1aa6d26fe9a0a0365401ab77ba4103fcc723fca9 --- /dev/null +++ b/videox_fun/utils/fp8_optimization.py @@ -0,0 +1,56 @@ +"""Modified from https://github.com/kijai/ComfyUI-MochiWrapper +""" +import torch +import torch.nn as nn + +def autocast_model_forward(cls, origin_dtype, *inputs, **kwargs): + weight_dtype = cls.weight.dtype + cls.to(origin_dtype) + + # Convert all inputs to the original dtype + inputs = [input.to(origin_dtype) for input in inputs] + out = cls.original_forward(*inputs, **kwargs) + + cls.to(weight_dtype) + return out + +def replace_parameters_by_name(module, name_keywords, device): + from torch import nn + for name, param in list(module.named_parameters(recurse=False)): + if any(keyword in name for keyword in name_keywords): + if isinstance(param, nn.Parameter): + tensor = param.data + delattr(module, name) + setattr(module, name, tensor.to(device=device)) + for child_name, child_module in module.named_children(): + replace_parameters_by_name(child_module, name_keywords, device) + +def convert_model_weight_to_float8(model, exclude_module_name=['embed_tokens']): + for name, module in model.named_modules(): + flag = False + for _exclude_module_name in exclude_module_name: + if _exclude_module_name in name: + flag = True + if flag: + continue + for param_name, param in module.named_parameters(): + flag = False + for _exclude_module_name in exclude_module_name: + if _exclude_module_name in param_name: + flag = True + if flag: + continue + param.data = param.data.to(torch.float8_e4m3fn) + +def convert_weight_dtype_wrapper(module, origin_dtype): + for name, module in module.named_modules(): + if name == "" or "embed_tokens" in name: + continue + original_forward = module.forward + if hasattr(module, "weight") and module.weight is not None: + setattr(module, "original_forward", original_forward) + setattr( + module, + "forward", + lambda *inputs, m=module, **kwargs: autocast_model_forward(m, origin_dtype, *inputs, **kwargs) + ) diff --git a/videox_fun/utils/lora_utils.py b/videox_fun/utils/lora_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f801fa8f7a5a4024a4d4b003513813ed38f6649c --- /dev/null +++ b/videox_fun/utils/lora_utils.py @@ -0,0 +1,516 @@ +# LoRA network module +# reference: +# https://github.com/microsoft/LoRA/blob/main/loralib/layers.py +# https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py +# https://github.com/bmaltais/kohya_ss + +import hashlib +import math +import os +from collections import defaultdict +from io import BytesIO +from typing import List, Optional, Type, Union + +import safetensors.torch +import torch +import torch.utils.checkpoint +from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear +from safetensors.torch import load_file +from transformers import T5EncoderModel + + +class LoRAModule(torch.nn.Module): + """ + replaces forward method of the original Linear, instead of replacing the original Linear module. 
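+    The wrapped forward keeps the original module's output and adds a low-rank update:
+    y = org_forward(x) + lora_up(lora_down(x)) * multiplier * (alpha / lora_dim),
+    with optional dropout / rank dropout applied to the LoRA branch during training.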
+ """ + + def __init__( + self, + lora_name, + org_module: torch.nn.Module, + multiplier=1.0, + lora_dim=4, + alpha=1, + dropout=None, + rank_dropout=None, + module_dropout=None, + ): + """if alpha == 0 or None, alpha is rank (no scaling).""" + super().__init__() + self.lora_name = lora_name + + if org_module.__class__.__name__ == "Conv2d": + in_dim = org_module.in_channels + out_dim = org_module.out_channels + else: + in_dim = org_module.in_features + out_dim = org_module.out_features + + self.lora_dim = lora_dim + if org_module.__class__.__name__ == "Conv2d": + kernel_size = org_module.kernel_size + stride = org_module.stride + padding = org_module.padding + self.lora_down = torch.nn.Conv2d(in_dim, self.lora_dim, kernel_size, stride, padding, bias=False) + self.lora_up = torch.nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False) + else: + self.lora_down = torch.nn.Linear(in_dim, self.lora_dim, bias=False) + self.lora_up = torch.nn.Linear(self.lora_dim, out_dim, bias=False) + + if type(alpha) == torch.Tensor: + alpha = alpha.detach().float().numpy() # without casting, bf16 causes error + alpha = self.lora_dim if alpha is None or alpha == 0 else alpha + self.scale = alpha / self.lora_dim + self.register_buffer("alpha", torch.tensor(alpha)) + + # same as microsoft's + torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5)) + torch.nn.init.zeros_(self.lora_up.weight) + + self.multiplier = multiplier + self.org_module = org_module # remove in applying + self.dropout = dropout + self.rank_dropout = rank_dropout + self.module_dropout = module_dropout + + def apply_to(self): + self.org_forward = self.org_module.forward + self.org_module.forward = self.forward + del self.org_module + + def forward(self, x, *args, **kwargs): + weight_dtype = x.dtype + org_forwarded = self.org_forward(x) + + # module dropout + if self.module_dropout is not None and self.training: + if torch.rand(1) < self.module_dropout: + return org_forwarded + + lx = self.lora_down(x.to(self.lora_down.weight.dtype)) + + # normal dropout + if self.dropout is not None and self.training: + lx = torch.nn.functional.dropout(lx, p=self.dropout) + + # rank dropout + if self.rank_dropout is not None and self.training: + mask = torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout + if len(lx.size()) == 3: + mask = mask.unsqueeze(1) # for Text Encoder + elif len(lx.size()) == 4: + mask = mask.unsqueeze(-1).unsqueeze(-1) # for Conv2d + lx = lx * mask + + # scaling for rank dropout: treat as if the rank is changed + scale = self.scale * (1.0 / (1.0 - self.rank_dropout)) # redundant for readability + else: + scale = self.scale + + lx = self.lora_up(lx) + + return org_forwarded.to(weight_dtype) + lx.to(weight_dtype) * self.multiplier * scale + + +def addnet_hash_legacy(b): + """Old model hash used by sd-webui-additional-networks for .safetensors format files""" + m = hashlib.sha256() + + b.seek(0x100000) + m.update(b.read(0x10000)) + return m.hexdigest()[0:8] + + +def addnet_hash_safetensors(b): + """New model hash used by sd-webui-additional-networks for .safetensors format files""" + hash_sha256 = hashlib.sha256() + blksize = 1024 * 1024 + + b.seek(0) + header = b.read(8) + n = int.from_bytes(header, "little") + + offset = n + 8 + b.seek(offset) + for chunk in iter(lambda: b.read(blksize), b""): + hash_sha256.update(chunk) + + return hash_sha256.hexdigest() + + +def precalculate_safetensors_hashes(tensors, metadata): + """Precalculate the model hashes needed by sd-webui-additional-networks to + 
save time on indexing the model later.""" + + # Because writing user metadata to the file can change the result of + # sd_models.model_hash(), only retain the training metadata for purposes of + # calculating the hash, as they are meant to be immutable + metadata = {k: v for k, v in metadata.items() if k.startswith("ss_")} + + bytes = safetensors.torch.save(tensors, metadata) + b = BytesIO(bytes) + + model_hash = addnet_hash_safetensors(b) + legacy_hash = addnet_hash_legacy(b) + return model_hash, legacy_hash + + +class LoRANetwork(torch.nn.Module): + TRANSFORMER_TARGET_REPLACE_MODULE = ["CogVideoXTransformer3DModel", "WanTransformer3DModel"] + TEXT_ENCODER_TARGET_REPLACE_MODULE = ["T5LayerSelfAttention", "T5LayerFF", "BertEncoder", "T5SelfAttention", "T5CrossAttention"] + LORA_PREFIX_TRANSFORMER = "lora_unet" + LORA_PREFIX_TEXT_ENCODER = "lora_te" + def __init__( + self, + text_encoder: Union[List[T5EncoderModel], T5EncoderModel], + unet, + multiplier: float = 1.0, + lora_dim: int = 4, + alpha: float = 1, + dropout: Optional[float] = None, + module_class: Type[object] = LoRAModule, + skip_name: str = None, + varbose: Optional[bool] = False, + ) -> None: + super().__init__() + self.multiplier = multiplier + + self.lora_dim = lora_dim + self.alpha = alpha + self.dropout = dropout + + print(f"create LoRA network. base dim (rank): {lora_dim}, alpha: {alpha}") + print(f"neuron dropout: p={self.dropout}") + + # create module instances + def create_modules( + is_unet: bool, + root_module: torch.nn.Module, + target_replace_modules: List[torch.nn.Module], + ) -> List[LoRAModule]: + prefix = ( + self.LORA_PREFIX_TRANSFORMER + if is_unet + else self.LORA_PREFIX_TEXT_ENCODER + ) + loras = [] + skipped = [] + for name, module in root_module.named_modules(): + if module.__class__.__name__ in target_replace_modules: + for child_name, child_module in module.named_modules(): + is_linear = child_module.__class__.__name__ == "Linear" or child_module.__class__.__name__ == "LoRACompatibleLinear" + is_conv2d = child_module.__class__.__name__ == "Conv2d" or child_module.__class__.__name__ == "LoRACompatibleConv" + is_conv2d_1x1 = is_conv2d and child_module.kernel_size == (1, 1) + + if skip_name is not None and skip_name in child_name: + continue + + if is_linear or is_conv2d: + lora_name = prefix + "." + name + "." 
+ child_name + lora_name = lora_name.replace(".", "_") + + dim = None + alpha = None + + if is_linear or is_conv2d_1x1: + dim = self.lora_dim + alpha = self.alpha + + if dim is None or dim == 0: + if is_linear or is_conv2d_1x1: + skipped.append(lora_name) + continue + + lora = module_class( + lora_name, + child_module, + self.multiplier, + dim, + alpha, + dropout=dropout, + ) + loras.append(lora) + return loras, skipped + + text_encoders = text_encoder if type(text_encoder) == list else [text_encoder] + + self.text_encoder_loras = [] + skipped_te = [] + for i, text_encoder in enumerate(text_encoders): + if text_encoder is not None: + text_encoder_loras, skipped = create_modules(False, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE) + self.text_encoder_loras.extend(text_encoder_loras) + skipped_te += skipped + print(f"create LoRA for Text Encoder: {len(self.text_encoder_loras)} modules.") + + self.unet_loras, skipped_un = create_modules(True, unet, LoRANetwork.TRANSFORMER_TARGET_REPLACE_MODULE) + print(f"create LoRA for U-Net: {len(self.unet_loras)} modules.") + + # assertion + names = set() + for lora in self.text_encoder_loras + self.unet_loras: + assert lora.lora_name not in names, f"duplicated lora name: {lora.lora_name}" + names.add(lora.lora_name) + + def apply_to(self, text_encoder, unet, apply_text_encoder=True, apply_unet=True): + if apply_text_encoder: + print("enable LoRA for text encoder") + else: + self.text_encoder_loras = [] + + if apply_unet: + print("enable LoRA for U-Net") + else: + self.unet_loras = [] + + for lora in self.text_encoder_loras + self.unet_loras: + lora.apply_to() + self.add_module(lora.lora_name, lora) + + def set_multiplier(self, multiplier): + self.multiplier = multiplier + for lora in self.text_encoder_loras + self.unet_loras: + lora.multiplier = self.multiplier + + def load_weights(self, file): + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import load_file + + weights_sd = load_file(file) + else: + weights_sd = torch.load(file, map_location="cpu") + info = self.load_state_dict(weights_sd, False) + return info + + def prepare_optimizer_params(self, text_encoder_lr, unet_lr, default_lr): + self.requires_grad_(True) + all_params = [] + + def enumerate_params(loras): + params = [] + for lora in loras: + params.extend(lora.parameters()) + return params + + if self.text_encoder_loras: + param_data = {"params": enumerate_params(self.text_encoder_loras)} + if text_encoder_lr is not None: + param_data["lr"] = text_encoder_lr + all_params.append(param_data) + + if self.unet_loras: + param_data = {"params": enumerate_params(self.unet_loras)} + if unet_lr is not None: + param_data["lr"] = unet_lr + all_params.append(param_data) + + return all_params + + def enable_gradient_checkpointing(self): + pass + + def get_trainable_params(self): + return self.parameters() + + def save_weights(self, file, dtype, metadata): + if metadata is not None and len(metadata) == 0: + metadata = None + + state_dict = self.state_dict() + + if dtype is not None: + for key in list(state_dict.keys()): + v = state_dict[key] + v = v.detach().clone().to("cpu").to(dtype) + state_dict[key] = v + + if os.path.splitext(file)[1] == ".safetensors": + from safetensors.torch import save_file + + # Precalculate model hashes to save time on indexing + if metadata is None: + metadata = {} + model_hash, legacy_hash = precalculate_safetensors_hashes(state_dict, metadata) + metadata["sshs_model_hash"] = model_hash + metadata["sshs_legacy_hash"] = legacy_hash + + 
save_file(state_dict, file, metadata) + else: + torch.save(state_dict, file) + +def create_network( + multiplier: float, + network_dim: Optional[int], + network_alpha: Optional[float], + text_encoder: Union[T5EncoderModel, List[T5EncoderModel]], + transformer, + neuron_dropout: Optional[float] = None, + skip_name: str = None, + **kwargs, +): + if network_dim is None: + network_dim = 4 # default + if network_alpha is None: + network_alpha = 1.0 + + network = LoRANetwork( + text_encoder, + transformer, + multiplier=multiplier, + lora_dim=network_dim, + alpha=network_alpha, + dropout=neuron_dropout, + skip_name=skip_name, + varbose=True, + ) + return network + +def merge_lora(pipeline, lora_path, multiplier, device='cpu', dtype=torch.float32, state_dict=None, transformer_only=False): + LORA_PREFIX_TRANSFORMER = "lora_unet" + LORA_PREFIX_TEXT_ENCODER = "lora_te" + if state_dict is None: + state_dict = load_file(lora_path, device=device) + else: + state_dict = state_dict + updates = defaultdict(dict) + for key, value in state_dict.items(): + layer, elem = key.split('.', 1) + updates[layer][elem] = value + + sequential_cpu_offload_flag = False + if pipeline.transformer.device == torch.device(type="meta"): + pipeline.remove_all_hooks() + sequential_cpu_offload_flag = True + offload_device = pipeline._offload_device + + for layer, elems in updates.items(): + + if "lora_te" in layer: + if transformer_only: + continue + else: + layer_infos = layer.split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_") + curr_layer = pipeline.text_encoder + else: + layer_infos = layer.split(LORA_PREFIX_TRANSFORMER + "_")[-1].split("_") + curr_layer = pipeline.transformer + + try: + curr_layer = curr_layer.__getattr__("_".join(layer_infos[1:])) + except Exception: + temp_name = layer_infos.pop(0) + while len(layer_infos) > -1: + try: + curr_layer = curr_layer.__getattr__(temp_name + "_" + "_".join(layer_infos)) + break + except Exception: + try: + curr_layer = curr_layer.__getattr__(temp_name) + if len(layer_infos) > 0: + temp_name = layer_infos.pop(0) + elif len(layer_infos) == 0: + break + except Exception: + if len(layer_infos) == 0: + print('Error loading layer') + if len(temp_name) > 0: + temp_name += "_" + layer_infos.pop(0) + else: + temp_name = layer_infos.pop(0) + + origin_dtype = curr_layer.weight.data.dtype + origin_device = curr_layer.weight.data.device + + curr_layer = curr_layer.to(device, dtype) + weight_up = elems['lora_up.weight'].to(device, dtype) + weight_down = elems['lora_down.weight'].to(device, dtype) + + if 'alpha' in elems.keys(): + alpha = elems['alpha'].item() / weight_up.shape[1] + else: + alpha = 1.0 + + if len(weight_up.shape) == 4: + curr_layer.weight.data += multiplier * alpha * torch.mm( + weight_up.squeeze(3).squeeze(2), weight_down.squeeze(3).squeeze(2) + ).unsqueeze(2).unsqueeze(3) + else: + curr_layer.weight.data += multiplier * alpha * torch.mm(weight_up, weight_down) + curr_layer = curr_layer.to(origin_device, origin_dtype) + + if sequential_cpu_offload_flag: + pipeline.enable_sequential_cpu_offload(device=offload_device) + return pipeline + +# TODO: Refactor with merge_lora. 
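For reference, the in-place weight update that merge_lora applies above, and that unmerge_lora reverses just below, reduces for the linear case to the following minimal sketch. The tensor names are illustrative; this snippet is an editorial aid, not part of the patch itself.

    import torch

    def lora_delta(weight_up: torch.Tensor, weight_down: torch.Tensor,
                   alpha: float, multiplier: float) -> torch.Tensor:
        # weight_up: (out_dim, rank), weight_down: (rank, in_dim)
        rank = weight_up.shape[1]
        # scale matches `elems['alpha'].item() / weight_up.shape[1]` in merge_lora
        return multiplier * (alpha / rank) * torch.mm(weight_up, weight_down)

    # merge:   layer.weight.data += lora_delta(up, down, alpha, multiplier)
    # unmerge: layer.weight.data -= lora_delta(up, down, alpha, multiplier)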
+def unmerge_lora(pipeline, lora_path, multiplier=1, device="cpu", dtype=torch.float32): + """Unmerge state_dict in LoRANetwork from the pipeline in diffusers.""" + LORA_PREFIX_UNET = "lora_unet" + LORA_PREFIX_TEXT_ENCODER = "lora_te" + state_dict = load_file(lora_path, device=device) + + updates = defaultdict(dict) + for key, value in state_dict.items(): + layer, elem = key.split('.', 1) + updates[layer][elem] = value + + sequential_cpu_offload_flag = False + if pipeline.transformer.device == torch.device(type="meta"): + pipeline.remove_all_hooks() + sequential_cpu_offload_flag = True + + for layer, elems in updates.items(): + + if "lora_te" in layer: + layer_infos = layer.split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_") + curr_layer = pipeline.text_encoder + else: + layer_infos = layer.split(LORA_PREFIX_UNET + "_")[-1].split("_") + curr_layer = pipeline.transformer + + try: + curr_layer = curr_layer.__getattr__("_".join(layer_infos[1:])) + except Exception: + temp_name = layer_infos.pop(0) + while len(layer_infos) > -1: + try: + curr_layer = curr_layer.__getattr__(temp_name + "_" + "_".join(layer_infos)) + break + except Exception: + try: + curr_layer = curr_layer.__getattr__(temp_name) + if len(layer_infos) > 0: + temp_name = layer_infos.pop(0) + elif len(layer_infos) == 0: + break + except Exception: + if len(layer_infos) == 0: + print('Error loading layer') + if len(temp_name) > 0: + temp_name += "_" + layer_infos.pop(0) + else: + temp_name = layer_infos.pop(0) + + origin_dtype = curr_layer.weight.data.dtype + origin_device = curr_layer.weight.data.device + + curr_layer = curr_layer.to(device, dtype) + weight_up = elems['lora_up.weight'].to(device, dtype) + weight_down = elems['lora_down.weight'].to(device, dtype) + + if 'alpha' in elems.keys(): + alpha = elems['alpha'].item() / weight_up.shape[1] + else: + alpha = 1.0 + + if len(weight_up.shape) == 4: + curr_layer.weight.data -= multiplier * alpha * torch.mm( + weight_up.squeeze(3).squeeze(2), weight_down.squeeze(3).squeeze(2) + ).unsqueeze(2).unsqueeze(3) + else: + curr_layer.weight.data -= multiplier * alpha * torch.mm(weight_up, weight_down) + curr_layer = curr_layer.to(origin_device, origin_dtype) + + if sequential_cpu_offload_flag: + pipeline.enable_sequential_cpu_offload(device=device) + return pipeline diff --git a/videox_fun/utils/optical_flow_utils.py b/videox_fun/utils/optical_flow_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6dfddbe4f61648cafc010942fae5eb5f2f8ef20f --- /dev/null +++ b/videox_fun/utils/optical_flow_utils.py @@ -0,0 +1,818 @@ +""" +Optical Flow Extraction for VideoJAM Framework +=============================================== + +This module implements optical flow extraction using RAFT and conversion to +RGB motion representation as described in the VideoJAM paper. + +Motion Representation: +- Magnitude: m = min(1, sqrt(u^2 + v^2) / (0.15 * sqrt(H^2 + W^2))) +- Direction: a = arctan2(v, u) +- RGB encoding follows HSV color wheel where hue=direction, saturation=1, value=magnitude +""" + +import numpy as np +import torch +import torch.nn.functional as F +from typing import Tuple, Optional, List +import os +from pathlib import Path + +try: + import cv2 + CV2_AVAILABLE = True +except ImportError: + CV2_AVAILABLE = False + print("Warning: cv2 not available. 
Video I/O functions will not work.") + + +try: + from torchvision.models.optical_flow import raft_large, Raft_Large_Weights + from torchvision.utils import flow_to_image + RAFT_AVAILABLE = True + FLOW_TO_IMAGE_AVAILABLE = True +except ImportError: + RAFT_AVAILABLE = False + FLOW_TO_IMAGE_AVAILABLE = False + print("Warning: torchvision.models.optical_flow not available. Install torchvision >= 0.13.0") + + +class RAFTFlowExtractor: + """ + Extracts dense optical flow from video frames using RAFT. + Converts flow fields to RGB motion representation for VideoJAM. + """ + + def __init__(self, device='cuda', model_weights=None): + """ + Initialize RAFT flow extractor. + + Args: + device: Device to run RAFT on ('cuda' or 'cpu') + model_weights: Optional path to custom RAFT weights, otherwise uses pretrained + """ + if not RAFT_AVAILABLE: + raise RuntimeError("RAFT not available. Install torchvision >= 0.13.0") + + self.device = device + + # Load RAFT model + if model_weights is None: + weights = Raft_Large_Weights.DEFAULT + self.model = raft_large(weights=weights, progress=True) + else: + self.model = raft_large(weights=None) + self.model.load_state_dict(torch.load(model_weights)) + + self.model = self.model.to(device) + self.model.eval() + + # RAFT preprocessing transforms + self.transforms = weights.transforms() if model_weights is None else None + + @torch.no_grad() + def extract_flow(self, frame1: torch.Tensor, frame2: torch.Tensor) -> torch.Tensor: + """ + Extract optical flow between two consecutive frames. + + Args: + frame1: First frame [B, C, H, W] in range [0, 1] or [0, 255] + frame2: Second frame [B, C, H, W] in range [0, 1] or [0, 255] + + Returns: + flow: Optical flow [B, 2, H, W] where flow[:, 0] is u and flow[:, 1] is v + """ + # Ensure frames are float32 + if frame1.dtype != torch.float32 and frame1.dtype != torch.float16: + frame1 = frame1.float() + frame2 = frame2.float() + + # If frames are in [0, 255], convert to [0, 1] + # transforms() expects [0, 1] input + if frame1.max() > 1.5: + frame1 = frame1 / 255.0 + frame2 = frame2 / 255.0 + + # RAFT requires dimensions divisible by 8 + # Pad if necessary + B, C, H, W = frame1.shape + pad_h = (8 - H % 8) % 8 + pad_w = (8 - W % 8) % 8 + + if pad_h > 0 or pad_w > 0: + frame1 = torch.nn.functional.pad(frame1, (0, pad_w, 0, pad_h), mode='replicate') + frame2 = torch.nn.functional.pad(frame2, (0, pad_w, 0, pad_h), mode='replicate') + + # Apply RAFT preprocessing (expects [0, 1] input, normalizes internally) + if self.transforms is not None: + frame1, frame2 = self.transforms(frame1, frame2) + + # Extract flow + flow_predictions = self.model(frame1.to(self.device), frame2.to(self.device)) + + # RAFT returns a list of flow predictions at different iterations + # We use the final (most refined) prediction + flow = flow_predictions[-1] + + # Remove padding from flow if we added any + if pad_h > 0 or pad_w > 0: + flow = flow[:, :, :H, :W] + + return flow + + def extract_video_flow(self, video_frames: torch.Tensor) -> torch.Tensor: + """ + Extract optical flow for an entire video sequence. 
+ + Args: + video_frames: Video tensor [B, T, C, H, W] or [T, C, H, W] + + Returns: + flows: Flow tensor [B, T-1, 2, H, W] or [T-1, 2, H, W] + Note: T-1 because flow is between consecutive frames + """ + if video_frames.ndim == 4: + # Add batch dimension if not present + video_frames = video_frames.unsqueeze(0) + squeeze_output = True + else: + squeeze_output = False + + B, T, C, H, W = video_frames.shape + flows = [] + + for t in range(T - 1): + flow = self.extract_flow(video_frames[:, t], video_frames[:, t + 1]) + flows.append(flow) + + flows = torch.stack(flows, dim=1) # [B, T-1, 2, H, W] + + if squeeze_output: + flows = flows.squeeze(0) + + return flows + + def extract_videojam_motion(self, + video_frames: torch.Tensor, + sigma: float = 0.15, + deadzone_px: float = 0.0, + target_resolution: int = None) -> torch.Tensor: + """ + Extract VideoJAM motion representation for an entire video sequence. + + This performs: + 1. RAFT flow extraction between consecutive frames (T-1 flows) + 2. VideoJAM normalization (resolution-aware) + 3. HSV-to-RGB encoding + 4. Temporal alignment (duplicate first frame to get T frames) + + Args: + video_frames: Video tensor [B, T, C, H, W] or [T, C, H, W] in range [0, 1] + sigma: VideoJAM normalization constant (default: 0.15) + deadzone_px: Magnitude threshold to suppress noise (default: 0.05 px) + target_resolution: Target training resolution (e.g., 256). If set, scales flow + vectors to match target resolution brightness. + + Returns: + motion_rgb: RGB motion tensor [B, T, 3, H, W] or [T, 3, H, W] + Aligned with input video (same temporal length T) + """ + if video_frames.ndim == 4: + # Add batch dimension if not present + video_frames = video_frames.unsqueeze(0) + squeeze_output = True + else: + squeeze_output = False + + B, T, C, H, W = video_frames.shape + + # 1. Extract flow: [B, T-1, 2, H, W] + flows = self.extract_video_flow(video_frames) + + # 2. Convert to VideoJAM RGB motion: [B, T-1, 3, H, W] + motion_rgb = flow_to_motion_rgb_videojam( + flows, + sigma=sigma, + deadzone_px=deadzone_px, + target_resolution=target_resolution + ) + + # 3. Temporal alignment: duplicate first frame to match length T + # This aligns motion[t] with video[t] + first_frame = motion_rgb[:, 0:1, :, :, :] # [B, 1, 3, H, W] + motion_rgb = torch.cat([first_frame, motion_rgb], dim=1) # [B, T, 3, H, W] + + if squeeze_output: + motion_rgb = motion_rgb.squeeze(0) + + return motion_rgb + + +def flow_to_motion_rgb_videojam(flow: torch.Tensor, + sigma: float = 0.15, + deadzone_px: float = 0.0, + target_resolution: int = None, + return_magnitude_angle: bool = False) -> torch.Tensor: + """ + Convert optical flow to RGB motion representation following VideoJAM paper specification. + + This is NOT a visualization - it's a weak, normalized motion prior for diffusion training. + Static scenes will produce BLACK output (this is CORRECT behavior). + + VideoJAM normalization (Eq. 5 from paper): + - Magnitude: m = min(1, sqrt(u² + v²) / (σ · sqrt(H² + W²))) + where σ = 0.15 (fixed constant from paper) + - Direction: α = arctan2(v, u) + - HSV encoding: Hue=direction, Saturation=1, Value=magnitude + + IMPORTANT: Use target_resolution to match training resolution! + If you extract flow at 1600×900 but train at 256×256, set target_resolution=256. + This scales the flow vectors so brightness matches what you'd get from 256×256 extraction. 
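+
+    Worked example (illustrative numbers): at 256×256 the diagonal is
+    256 · sqrt(2) ≈ 362 px, so the normalization factor is 0.15 · 362 ≈ 54.3 px,
+    and a 10 px/frame motion maps to a value of roughly 10 / 54.3 ≈ 0.18
+    (a fairly dark pixel, which is the intended behavior of this weak prior).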
+ + Args: + flow: Optical flow tensor [..., 2, H, W] where flow[..., 0] is u, flow[..., 1] is v + sigma: Normalization constant (default: 0.15 from VideoJAM paper) + deadzone_px: Magnitude threshold in pixels to suppress RAFT noise (default: 0.0) + IMPORTANT: 0.0 is paper-faithful. Only use 0.05-0.1 if you see + static noise artifacts. Can kill subtle motion in low-res or + small-motion videos. + target_resolution: Target training resolution (e.g., 256). If set, scales flow + vectors and uses target diagonal for normalization. + Use this to extract at high-res but get correct brightness for training! + return_magnitude_angle: If True, also returns (magnitude, angle) tensors + + Returns: + motion_rgb: RGB motion tensor [..., 3, H, W] in range [0, 1] + Will be DARK for static scenes (this is correct!) + """ + # Extract u, v components + u = flow[..., 0:1, :, :] # [..., 1, H, W] + v = flow[..., 1:2, :, :] # [..., 1, H, W] + + H, W = flow.shape[-2:] + + # 1. Compute raw magnitude: ||d|| = sqrt(u² + v²) + magnitude_raw = torch.sqrt(u * u + v * v) + + # Print flow statistics for debugging + if magnitude_raw.numel() > 0: + mag_flat = magnitude_raw.flatten() + # Sample to avoid memory issues + if mag_flat.numel() > 10000: + indices = torch.randperm(mag_flat.numel())[:10000] + mag_sample = mag_flat[indices] + else: + mag_sample = mag_flat + print(f"[VideoJAM Flow] Raw magnitude stats: " + f"mean={mag_sample.mean().item():.4f} px, " + f"median={mag_sample.median().item():.4f} px, " + f"max={mag_sample.max().item():.4f} px, " + f"p95={torch.quantile(mag_sample, 0.95).item():.4f} px") + + # 2. Optional: Apply deadzone to suppress RAFT noise in static regions + # Zero out u,v when magnitude is below threshold (so it affects final result) + if deadzone_px > 0: + mask = magnitude_raw < deadzone_px + u = torch.where(mask, torch.zeros_like(u), u) + v = torch.where(mask, torch.zeros_like(v), v) + # Recompute magnitude after deadzone + magnitude_raw = torch.sqrt(u * u + v * v) + + # 2.5. Optional: Scale flow vectors to match target training resolution + # This allows extracting flow at high-res but getting correct brightness for training + if target_resolution is not None: + # Scale flow vectors proportionally + # If video is 1600x900 and target is 256, scale factor is 256/900 = 0.284 + # This makes a 100px motion at 1600x900 become 28.4px, matching native 256x256 + scale_factor = target_resolution / min(H, W) + u = u * scale_factor + v = v * scale_factor + + # Recompute magnitude after scaling + magnitude_raw = torch.sqrt(u * u + v * v) + + # Use target resolution's diagonal for normalization (assumes square) + diagonal = target_resolution * (2 ** 0.5) + + print(f"[VideoJAM Flow] Scaling flow: {H}x{W} → {target_resolution}x{target_resolution}") + print(f"[VideoJAM Flow] Scale factor: {scale_factor:.4f}") + print(f"[VideoJAM Flow] Target diagonal: {diagonal:.2f} px") + else: + # Use actual resolution's diagonal + diagonal = (H * H + W * W) ** 0.5 + + # 3. VideoJAM normalization (Eq. 5): m = min(1, ||d|| / (σ · sqrt(H² + W²))) + normalization_factor = sigma * diagonal + magnitude_normalized = torch.clamp(magnitude_raw / (normalization_factor + 1e-8), 0.0, 1.0) + + print(f"[VideoJAM Flow] Resolution: {H}x{W}, Diagonal={diagonal:.2f}") + print(f"[VideoJAM Flow] Norm factor (σ·diagonal): {normalization_factor:.2f} px, σ={sigma}") + print(f"[VideoJAM Flow] Normalized magnitude: " + f"mean={magnitude_normalized.mean().item():.4f}, " + f"max={magnitude_normalized.max().item():.4f}") + + # 4. 
Compute direction: α = arctan2(v, u) + angle = torch.atan2(v, u) # Range: [-π, π] + + # 5. Convert to HSV: + # - Hue = (α + π) / (2π) [maps -π,π to 0,1] + # - Saturation = 1 (constant, full saturation) + # - Value = m (normalized magnitude) + hue = (angle + np.pi) / (2 * np.pi) # [..., 1, H, W] in [0, 1] + saturation = torch.ones_like(magnitude_normalized) # S = 1 (constant) + value = magnitude_normalized # V = normalized magnitude + + # Stack to HSV: [..., 3, H, W] + hsv = torch.cat([hue, saturation, value], dim=-3) + + # 6. Convert HSV to RGB + motion_rgb = hsv_to_rgb_torch(hsv) + + if return_magnitude_angle: + return motion_rgb, magnitude_normalized, angle + else: + return motion_rgb + + +def flow_to_motion_rgb(flow: torch.Tensor, + fixed_clip_px: float = 10.0, + deadzone: float = 0.0, + return_magnitude_angle: bool = False) -> torch.Tensor: + """ + Stable flow->RGB with fixed pixel clip for consistent visualization. + + NOTE: This is for VISUALIZATION, not VideoJAM training! + For VideoJAM training, use flow_to_motion_rgb_videojam() instead. + + HSV mapping: + - Hue = angle (direction) + - Saturation = 1 (constant, vivid colors) + - Value = normalized magnitude (brightness) + + Uses fixed clip in pixels/frame for stable, visible flow across videos. + + Args: + flow: Optical flow tensor [..., 2, H, W] where flow[..., 0] is u, flow[..., 1] is v + fixed_clip_px: Fixed magnitude clip in pixels/frame (default: 10.0) + 0 px → black, fixed_clip_px → full brightness + deadzone: Magnitude threshold below which flow is set to zero (default: 0.0) + return_magnitude_angle: If True, also returns (magnitude, angle) tensors + + Returns: + motion_rgb: RGB motion tensor [..., 3, H, W] in range [0, 1] + """ + # Extract u, v components + u = flow[..., 0:1, :, :] # [..., 1, H, W] + v = flow[..., 1:2, :, :] # [..., 1, H, W] + + H, W = flow.shape[-2:] + + # Compute magnitude + magnitude_raw = torch.sqrt(u * u + v * v) + + # Print flow statistics for debugging (sample to avoid memory issues) + if magnitude_raw.numel() > 0: + mag_flat = magnitude_raw.flatten() + # Sample 10k elements if tensor is large + if mag_flat.numel() > 10000: + indices = torch.randperm(mag_flat.numel())[:10000] + mag_sample = mag_flat[indices] + else: + mag_sample = mag_flat + print(f"Flow magnitude stats: mean={mag_sample.mean().item():.3f}, " + f"median={mag_sample.median().item():.3f}, " + f"max={mag_sample.max().item():.3f}, " + f"p95={torch.quantile(mag_sample, 0.95).item():.3f} px/frame") + + # Apply deadzone to RAW magnitude (in pixels): suppress tiny noise + if deadzone > 0: + magnitude_raw = torch.where(magnitude_raw < deadzone, torch.zeros_like(magnitude_raw), magnitude_raw) + + # Fixed clip normalization: stable across videos + # 0 px → 0, fixed_clip_px → 1.0 + magnitude = torch.clamp(magnitude_raw / fixed_clip_px, 0.0, 1.0) + + # Compute angle + angle = torch.atan2(v, u) # Range: [-pi, pi] + + # Convert to HSV (stable mapping): + # H = angle [0, 1] + # S = 1 (constant saturation for vivid colors) + # V = magnitude (normalized magnitude) + hue = (angle + np.pi) / (2 * np.pi) # [..., 1, H, W] in [0, 1] + saturation = torch.ones_like(magnitude) # S = 1 (constant) + value = magnitude # V = magnitude + + # Stack to HSV + hsv = torch.cat([hue, saturation, value], dim=-3) # [..., 3, H, W] + + # Convert HSV to RGB using torch implementation + motion_rgb = hsv_to_rgb_torch(hsv) + + if return_magnitude_angle: + return motion_rgb, magnitude, angle + else: + return motion_rgb + + +def _get_gaussian_kernel(kernel_size: int, sigma: 
float) -> torch.Tensor: + """Create a 2D Gaussian kernel for spatial smoothing.""" + x = torch.arange(kernel_size).float() - kernel_size // 2 + gauss = torch.exp(-x.pow(2) / (2 * sigma ** 2)) + kernel = gauss[:, None] * gauss[None, :] + kernel = kernel / kernel.sum() + return kernel.view(1, 1, kernel_size, kernel_size) + + +def hsv_to_rgb_torch(hsv: torch.Tensor) -> torch.Tensor: + """ + Convert HSV color space to RGB using PyTorch operations. + + Args: + hsv: HSV tensor [..., 3, H, W] with values in [0, 1] + hsv[..., 0, :, :] = Hue + hsv[..., 1, :, :] = Saturation + hsv[..., 2, :, :] = Value + + Returns: + rgb: RGB tensor [..., 3, H, W] with values in [0, 1] + """ + h = hsv[..., 0:1, :, :] # Hue + s = hsv[..., 1:2, :, :] # Saturation + v = hsv[..., 2:3, :, :] # Value + + h = h * 6.0 # Scale hue to [0, 6] + + # Compute RGB components based on hue sector + c = v * s + x = c * (1 - torch.abs((h % 2) - 1)) + m = v - c + + # Initialize RGB + rgb = torch.zeros_like(hsv) + + # Determine RGB values based on hue sector + mask = (h >= 0) & (h < 1) + rgb[..., 0:1, :, :] = torch.where(mask, c, rgb[..., 0:1, :, :]) + rgb[..., 1:2, :, :] = torch.where(mask, x, rgb[..., 1:2, :, :]) + + mask = (h >= 1) & (h < 2) + rgb[..., 0:1, :, :] = torch.where(mask, x, rgb[..., 0:1, :, :]) + rgb[..., 1:2, :, :] = torch.where(mask, c, rgb[..., 1:2, :, :]) + + mask = (h >= 2) & (h < 3) + rgb[..., 1:2, :, :] = torch.where(mask, c, rgb[..., 1:2, :, :]) + rgb[..., 2:3, :, :] = torch.where(mask, x, rgb[..., 2:3, :, :]) + + mask = (h >= 3) & (h < 4) + rgb[..., 1:2, :, :] = torch.where(mask, x, rgb[..., 1:2, :, :]) + rgb[..., 2:3, :, :] = torch.where(mask, c, rgb[..., 2:3, :, :]) + + mask = (h >= 4) & (h < 5) + rgb[..., 0:1, :, :] = torch.where(mask, x, rgb[..., 0:1, :, :]) + rgb[..., 2:3, :, :] = torch.where(mask, c, rgb[..., 2:3, :, :]) + + mask = (h >= 5) & (h < 6) + rgb[..., 0:1, :, :] = torch.where(mask, c, rgb[..., 0:1, :, :]) + rgb[..., 2:3, :, :] = torch.where(mask, x, rgb[..., 2:3, :, :]) + + # Add m to all components + rgb = rgb + m + + return rgb + + +def rgb_to_flow(motion_rgb: torch.Tensor, + height: int, + width: int, + normalization_factor: float = 0.15) -> torch.Tensor: + """ + Convert RGB motion representation back to optical flow (u, v). + Inverse of flow_to_motion_rgb. + + Args: + motion_rgb: RGB motion tensor [..., 3, H, W] in range [0, 1] + height: Original video height + width: Original video width + normalization_factor: Scaling factor used in forward conversion + + Returns: + flow: Optical flow tensor [..., 2, H, W] + """ + # Convert RGB to HSV + hsv = rgb_to_hsv_torch(motion_rgb) + + hue = hsv[..., 0:1, :, :] # [..., 1, H, W] in [0, 1] + magnitude = hsv[..., 2:3, :, :] # [..., 1, H, W] in [0, 1] + + # Convert hue back to angle + angle = hue * 2 * np.pi - np.pi # [..., 1, H, W] in [-pi, pi] + + # Denormalize magnitude + diagonal = np.sqrt(height ** 2 + width ** 2) + magnitude_raw = magnitude * normalization_factor * diagonal + + # Convert polar to cartesian + u = magnitude_raw * torch.cos(angle) + v = magnitude_raw * torch.sin(angle) + + flow = torch.cat([u, v], dim=-3) + + return flow + + +def rgb_to_hsv_torch(rgb: torch.Tensor) -> torch.Tensor: + """ + Convert RGB to HSV color space using PyTorch operations. 
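+    This is the inverse of hsv_to_rgb_torch above; rgb_to_flow uses it to recover
+    the (u, v) flow components from an RGB motion frame.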
+ + Args: + rgb: RGB tensor [..., 3, H, W] with values in [0, 1] + + Returns: + hsv: HSV tensor [..., 3, H, W] with values in [0, 1] + """ + r = rgb[..., 0:1, :, :] + g = rgb[..., 1:2, :, :] + b = rgb[..., 2:3, :, :] + + max_rgb, max_idx = rgb.max(dim=-3, keepdim=True) + min_rgb, _ = rgb.min(dim=-3, keepdim=True) + + delta = max_rgb - min_rgb + + # Hue calculation + hue = torch.zeros_like(max_rgb) + + mask = (max_idx == 0) & (delta != 0) + hue = torch.where(mask, ((g - b) / delta) % 6, hue) + + mask = (max_idx == 1) & (delta != 0) + hue = torch.where(mask, ((b - r) / delta) + 2, hue) + + mask = (max_idx == 2) & (delta != 0) + hue = torch.where(mask, ((r - g) / delta) + 4, hue) + + hue = hue / 6.0 # Normalize to [0, 1] + + # Saturation calculation + saturation = torch.where(max_rgb != 0, delta / max_rgb, torch.zeros_like(max_rgb)) + + # Value calculation + value = max_rgb + + hsv = torch.cat([hue, saturation, value], dim=-3) + + return hsv + + +def save_motion_video(motion_rgb: torch.Tensor, + output_path: str, + fps: int = 30): + """ + Save RGB motion representation as a video file using high-quality encoding. + + Args: + motion_rgb: Motion RGB tensor [T, 3, H, W] or [B, T, 3, H, W] + output_path: Output video file path + fps: Frames per second + """ + if motion_rgb.ndim == 5: + # Take first batch element + motion_rgb = motion_rgb[0] + + T, C, H, W = motion_rgb.shape + + # Convert to numpy and scale to [0, 255] + motion_np = (motion_rgb.cpu().numpy().transpose(0, 2, 3, 1) * 255).astype(np.uint8) + + # Save frames to temporary directory and use ffmpeg for high-quality encoding + import tempfile + import subprocess + + temp_dir = tempfile.mkdtemp() + + try: + # Save individual frames as PNG (lossless) + for t in range(T): + frame = cv2.cvtColor(motion_np[t], cv2.COLOR_RGB2BGR) + frame_path = os.path.join(temp_dir, f'frame_{t:06d}.png') + cv2.imwrite(frame_path, frame) + + # Use ffmpeg to create high-quality video + # -pix_fmt yuv420p: compatible with most players + # -crf 17: high quality (0=lossless, 23=default, 51=worst) + # -preset slow: better compression + ffmpeg_cmd = [ + 'ffmpeg', '-y', + '-framerate', str(fps), + '-i', os.path.join(temp_dir, 'frame_%06d.png'), + '-c:v', 'libx264', + '-pix_fmt', 'yuv420p', + '-crf', '17', + '-preset', 'medium', + output_path + ] + + subprocess.run(ffmpeg_cmd, check=True, capture_output=True) + print(f"Motion video saved to {output_path}") + + finally: + # Clean up temporary files + import shutil + shutil.rmtree(temp_dir) + + + +def precompute_motion_dataset(video_dir: str, + output_dir: str, + device: str = 'cuda', + video_ext: str = '.mp4', + use_videojam: bool = True, + sigma: float = 0.15, + deadzone_px: float = 0.0): + """ + Precompute motion RGB videos for an entire dataset. + + IMPORTANT: Extract flow from GROUND TRUTH videos (person removed, correct physics). + NOT from input videos with the person still present! + + The GT videos show the desired outcome (e.g., guitar falling after person removed). + The optical flow from these GT videos teaches the model what realistic motion looks like. 
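+
+    Example (illustrative directory names):
+        precompute_motion_dataset("data/gt_videos", "data/motion_rgb",
+                                  device="cuda", use_videojam=True)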
+ + Args: + video_dir: Directory containing GT videos (person removed, correct physics) + output_dir: Directory to save motion RGB videos + device: Device to run RAFT on + video_ext: Video file extension + use_videojam: If True, use VideoJAM normalization (recommended for training) + If False, use fixed-pixel visualization normalization + sigma: VideoJAM normalization constant (default: 0.15) + deadzone_px: Magnitude threshold to suppress noise (default: 0.05 px) + """ + os.makedirs(output_dir, exist_ok=True) + + flow_extractor = RAFTFlowExtractor(device=device) + video_files = sorted(Path(video_dir).glob(f'*{video_ext}')) + + print(f"Found {len(video_files)} videos to process") + print(f"Using VideoJAM normalization: {use_videojam}") + + for video_path in video_files: + print(f"\nProcessing {video_path.name}...") + + # Load video + cap = cv2.VideoCapture(str(video_path)) + + # Get FPS from input video + fps = cap.get(cv2.CAP_PROP_FPS) + if fps == 0: + print(f"Warning: Could not read FPS from {video_path.name}, defaulting to 30") + fps = 30 + else: + print(f"Input video FPS: {fps:.2f}") + + frames = [] + while True: + ret, frame = cap.read() + if not ret: + break + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frames.append(frame) + cap.release() + + # Convert to tensor [T, H, W, C] -> [T, C, H, W] + frames = np.stack(frames) + frames_tensor = torch.from_numpy(frames).permute(0, 3, 1, 2).float() / 255.0 + + if use_videojam: + # Use VideoJAM motion extraction (handles flow + normalization + temporal alignment) + motion_rgb = flow_extractor.extract_videojam_motion( + frames_tensor, + sigma=sigma, + deadzone_px=deadzone_px + ) # [T, 3, H, W] + else: + # Legacy visualization mode + flows = flow_extractor.extract_video_flow(frames_tensor) # [T-1, 2, H, W] + motion_rgb = flow_to_motion_rgb(flows) # [T-1, 3, H, W] + motion_rgb = torch.cat([motion_rgb[0:1], motion_rgb], dim=0) # [T, 3, H, W] + + # Save motion video with matching FPS + output_path = os.path.join(output_dir, video_path.name) + save_motion_video(motion_rgb, output_path, fps=int(fps)) + + print(f"\nMotion dataset saved to {output_dir}") + + +def extract_videojam_motion_from_video(video_path: str, + output_path: str, + device: str = 'cuda', + sigma: float = 0.15, + deadzone_px: float = 0.0, + fps: int = None, + target_size: int = None, + target_resolution: int = None) -> torch.Tensor: + """ + Standalone function to extract VideoJAM motion from a single video file. + + IMPORTANT: For training, set target_resolution to match your training resolution! + This scales flow vectors to match the brightness you'd get at target resolution. + + Args: + video_path: Path to input video + output_path: Path to save motion RGB video + device: Device to run RAFT on + sigma: VideoJAM normalization constant (default: 0.15) + deadzone_px: Magnitude threshold to suppress noise (default: 0.0 px) + fps: Frame rate for output video (if None, matches input video FPS) + target_size: DEPRECATED - use target_resolution instead + target_resolution: Training resolution (e.g., 256). Scales flow vectors to match + brightness at this resolution while extracting at native resolution. + RECOMMENDED: Set to 256 for 256×256 training + + Returns: + motion_rgb: RGB motion tensor [T, 3, H, W] + """ + # Backwards compatibility: if target_size is provided but not target_resolution, use it + if target_size is not None and target_resolution is None: + print(f"⚠️ WARNING: target_size is deprecated. 
Use target_resolution instead.") + print(f" Setting target_resolution={target_size} for flow vector scaling.") + target_resolution = target_size + + # Load video + cap = cv2.VideoCapture(video_path) + + # Get FPS from input video if not specified + if fps is None: + fps = cap.get(cv2.CAP_PROP_FPS) + if fps == 0: + print(f"Warning: Could not read FPS from video, defaulting to 30") + fps = 30 + else: + print(f"Detected input video FPS: {fps:.2f}") + + frames = [] + while True: + ret, frame = cap.read() + if not ret: + break + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frames.append(frame) + cap.release() + + if len(frames) == 0: + raise ValueError(f"No frames loaded from {video_path}") + + print(f"Loaded {len(frames)} frames from {video_path}") + + # Convert to tensor [T, H, W, C] -> [T, C, H, W] + frames = np.stack(frames) + frames_tensor = torch.from_numpy(frames).permute(0, 3, 1, 2).float() / 255.0 + + h, w = frames_tensor.shape[2:] + print(f"Video resolution: {w}x{h}") + print(f"Extracting flow at native resolution for maximum accuracy...") + + if target_resolution is not None: + print(f"✓ Flow vectors will be scaled for target_resolution={target_resolution}") + print(f" This matches the brightness you'd get from native {target_resolution}x{target_resolution} extraction") + + # Extract VideoJAM motion at native resolution + flow_extractor = RAFTFlowExtractor(device=device) + motion_rgb = flow_extractor.extract_videojam_motion( + frames_tensor, + sigma=sigma, + deadzone_px=deadzone_px, + target_resolution=target_resolution + ) # [T, 3, H, W] + + # Save motion video + save_motion_video(motion_rgb, output_path, fps=fps) + + return motion_rgb + + +if __name__ == "__main__": + """ + Example usage for precomputing motion dataset. + """ + import argparse + + parser = argparse.ArgumentParser(description='Precompute motion RGB videos from GT videos') + parser.add_argument('--video_dir', type=str, required=True, + help='Directory containing GT videos') + parser.add_argument('--output_dir', type=str, required=True, + help='Directory to save motion RGB videos') + parser.add_argument('--device', type=str, default='cuda', + help='Device to run RAFT on (cuda or cpu)') + parser.add_argument('--video_ext', type=str, default='.mp4', + help='Video file extension') + + args = parser.parse_args() + + precompute_motion_dataset( + video_dir=args.video_dir, + output_dir=args.output_dir, + device=args.device, + video_ext=args.video_ext + ) diff --git a/videox_fun/utils/utils.py b/videox_fun/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4a912943d46361d81d477aa32edf83461409ab65 --- /dev/null +++ b/videox_fun/utils/utils.py @@ -0,0 +1,595 @@ +import os +import sys +import glob +import json +import gc +import imageio +from loguru import logger +import inspect +import numpy as np +import torch +import torch.nn.functional as F +import torchvision +import cv2 +from einops import rearrange, repeat +from PIL import Image +import mediapy as media +import skimage +import matplotlib + +from videox_fun.data.dataset_image_video import get_random_mask + + +def filter_kwargs(cls, kwargs): + sig = inspect.signature(cls.__init__) + valid_params = set(sig.parameters.keys()) - {'self', 'cls'} + filtered_kwargs = {k: v for k, v in kwargs.items() if k in valid_params} + return filtered_kwargs + +def get_width_and_height_from_image_and_base_resolution(image, base_resolution): + target_pixels = int(base_resolution) * int(base_resolution) + original_width, original_height = 
Image.open(image).size + ratio = (target_pixels / (original_width * original_height)) ** 0.5 + width_slider = round(original_width * ratio) + height_slider = round(original_height * ratio) + return height_slider, width_slider + +def color_transfer(sc, dc): + """ + Transfer color distribution from of sc, referred to dc. + + Args: + sc (numpy.ndarray): input image to be transfered. + dc (numpy.ndarray): reference image + + Returns: + numpy.ndarray: Transferred color distribution on the sc. + """ + + def get_mean_and_std(img): + x_mean, x_std = cv2.meanStdDev(img) + x_mean = np.hstack(np.around(x_mean, 2)) + x_std = np.hstack(np.around(x_std, 2)) + return x_mean, x_std + + sc = cv2.cvtColor(sc, cv2.COLOR_RGB2LAB) + s_mean, s_std = get_mean_and_std(sc) + dc = cv2.cvtColor(dc, cv2.COLOR_RGB2LAB) + t_mean, t_std = get_mean_and_std(dc) + img_n = ((sc - s_mean) * (t_std / s_std)) + t_mean + np.putmask(img_n, img_n > 255, 255) + np.putmask(img_n, img_n < 0, 0) + dst = cv2.cvtColor(cv2.convertScaleAbs(img_n), cv2.COLOR_LAB2RGB) + return dst + +def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=12, imageio_backend=True, color_transfer_post_process=False): + videos = rearrange(videos, "b c t h w -> t b c h w") + outputs = [] + for x in videos: + x = torchvision.utils.make_grid(x, nrow=n_rows) + x = x.transpose(0, 1).transpose(1, 2).squeeze(-1) + if rescale: + x = (x + 1.0) / 2.0 # -1,1 -> 0,1 + x = (x * 255).cpu().float().numpy().astype(np.uint8) + outputs.append(Image.fromarray(x)) + + if color_transfer_post_process: + for i in range(1, len(outputs)): + outputs[i] = Image.fromarray(color_transfer(np.uint8(outputs[i]), np.uint8(outputs[0]))) + + os.makedirs(os.path.dirname(path), exist_ok=True) + if imageio_backend: + if path.endswith("mp4"): + imageio.mimsave(path, outputs, fps=fps) + else: + imageio.mimsave(path, outputs, duration=(1000 * 1/fps)) + else: + if path.endswith("mp4"): + path = path.replace('.mp4', '.gif') + outputs[0].save(path, format='GIF', append_images=outputs, save_all=True, duration=100, loop=0) + + +def save_inout_row(input_video, input_mask, output_video, video_path, fps=16, visualize_masked_video=False, visualize_error=True): + input_video = rearrange(input_video[0], "c t h w -> t h w c") + input_mask = rearrange(input_mask[0], "c t h w -> t h w c") + input_mask = repeat(input_mask, "t h w c -> t h w (repeat c)", repeat=3) + input_mask = 1 - input_mask + output_video = rearrange(output_video[0], "c t h w -> t h w c") + min_len = min(len(input_video), len(output_video), len(input_mask)) + input_video = input_video[:min_len] + input_mask = input_mask[:min_len] + output_video = output_video[:min_len] + + row = [input_video.cpu().float().numpy(), input_mask.cpu().float().numpy(),] + if visualize_masked_video: + row += [(input_mask * input_video).cpu().float().numpy()] + row += [output_video.cpu().float().numpy()] + + if visualize_error: + err = torch.abs(input_video - output_video).mean(-1).cpu().float().numpy() + vis_err = apply_colormap(err) + row += [vis_err] + + row = np.concatenate(row, 2) + media.write_video(video_path, row, fps=fps) + + +def get_image_to_video_latent(validation_image_start, validation_image_end, video_length, sample_size): + if validation_image_start is not None and validation_image_end is not None: + if type(validation_image_start) is str and os.path.isfile(validation_image_start): + image_start = clip_image = Image.open(validation_image_start).convert("RGB") + image_start = image_start.resize([sample_size[1], 
sample_size[0]]) + clip_image = clip_image.resize([sample_size[1], sample_size[0]]) + else: + image_start = clip_image = validation_image_start + image_start = [_image_start.resize([sample_size[1], sample_size[0]]) for _image_start in image_start] + clip_image = [_clip_image.resize([sample_size[1], sample_size[0]]) for _clip_image in clip_image] + + if type(validation_image_end) is str and os.path.isfile(validation_image_end): + image_end = Image.open(validation_image_end).convert("RGB") + image_end = image_end.resize([sample_size[1], sample_size[0]]) + else: + image_end = validation_image_end + image_end = [_image_end.resize([sample_size[1], sample_size[0]]) for _image_end in image_end] + + if type(image_start) is list: + clip_image = clip_image[0] + start_video = torch.cat( + [torch.from_numpy(np.array(_image_start)).permute(2, 0, 1).unsqueeze(1).unsqueeze(0) for _image_start in image_start], + dim=2 + ) + input_video = torch.tile(start_video[:, :, :1], [1, 1, video_length, 1, 1]) + input_video[:, :, :len(image_start)] = start_video + + input_video_mask = torch.zeros_like(input_video[:, :1]) + input_video_mask[:, :, len(image_start):] = 255 + else: + input_video = torch.tile( + torch.from_numpy(np.array(image_start)).permute(2, 0, 1).unsqueeze(1).unsqueeze(0), + [1, 1, video_length, 1, 1] + ) + input_video_mask = torch.zeros_like(input_video[:, :1]) + input_video_mask[:, :, 1:] = 255 + + if type(image_end) is list: + image_end = [_image_end.resize(image_start[0].size if type(image_start) is list else image_start.size) for _image_end in image_end] + end_video = torch.cat( + [torch.from_numpy(np.array(_image_end)).permute(2, 0, 1).unsqueeze(1).unsqueeze(0) for _image_end in image_end], + dim=2 + ) + input_video[:, :, -len(end_video):] = end_video + + input_video_mask[:, :, -len(image_end):] = 0 + else: + image_end = image_end.resize(image_start[0].size if type(image_start) is list else image_start.size) + input_video[:, :, -1:] = torch.from_numpy(np.array(image_end)).permute(2, 0, 1).unsqueeze(1).unsqueeze(0) + input_video_mask[:, :, -1:] = 0 + + input_video = input_video / 255 + + elif validation_image_start is not None: + if type(validation_image_start) is str and os.path.isfile(validation_image_start): + image_start = clip_image = Image.open(validation_image_start).convert("RGB") + image_start = image_start.resize([sample_size[1], sample_size[0]]) + clip_image = clip_image.resize([sample_size[1], sample_size[0]]) + else: + image_start = clip_image = validation_image_start + image_start = [_image_start.resize([sample_size[1], sample_size[0]]) for _image_start in image_start] + clip_image = [_clip_image.resize([sample_size[1], sample_size[0]]) for _clip_image in clip_image] + image_end = None + + if type(image_start) is list: + clip_image = clip_image[0] + start_video = torch.cat( + [torch.from_numpy(np.array(_image_start)).permute(2, 0, 1).unsqueeze(1).unsqueeze(0) for _image_start in image_start], + dim=2 + ) + input_video = torch.tile(start_video[:, :, :1], [1, 1, video_length, 1, 1]) + input_video[:, :, :len(image_start)] = start_video + input_video = input_video / 255 + + input_video_mask = torch.zeros_like(input_video[:, :1]) + input_video_mask[:, :, len(image_start):] = 255 + else: + input_video = torch.tile( + torch.from_numpy(np.array(image_start)).permute(2, 0, 1).unsqueeze(1).unsqueeze(0), + [1, 1, video_length, 1, 1] + ) / 255 + input_video_mask = torch.zeros_like(input_video[:, :1]) + input_video_mask[:, :, 1:, ] = 255 + else: + image_start = None + image_end = None + 
input_video = torch.zeros([1, 3, video_length, sample_size[0], sample_size[1]]) + input_video_mask = torch.ones([1, 1, video_length, sample_size[0], sample_size[1]]) * 255 + clip_image = None + + del image_start + del image_end + gc.collect() + + return input_video, input_video_mask, clip_image + +def get_video_to_video_latent(input_video_path, video_length, sample_size, fps=None, validation_video_mask=None, ref_image=None): + if isinstance(input_video_path, str): + input_video = media.read_video(input_video_path) + else: + input_video, input_video_mask = None, None + + input_video = torch.from_numpy(np.array(input_video))[:video_length] + input_video = input_video.permute([3, 0, 1, 2]).float() / 255 # (c, t, h, w) + input_video = F.interpolate(input_video, sample_size, mode='area').unsqueeze(0) # (1, c, t, h, w) + + if validation_video_mask is not None: + if ( + validation_video_mask.endswith(".jpg") or + validation_video_mask.endswith(".jpeg") or + validation_video_mask.endswith(".png") + ): + validation_video_mask = Image.open(validation_video_mask).convert('L').resize((sample_size[1], sample_size[0])) + input_video_mask = np.where(np.array(validation_video_mask) < 240, 0, 255) + + input_video_mask = torch.from_numpy(np.array(input_video_mask)).unsqueeze(0).unsqueeze(-1).permute([3, 0, 1, 2]).unsqueeze(0) + input_video_mask = torch.tile(input_video_mask, [1, 1, input_video.size()[2], 1, 1]) + input_video_mask = input_video_mask.to(input_video.device, input_video.dtype) + elif validation_video_mask.endswith(".mp4"): + validation_video_mask = media.read_video(validation_video_mask)[:video_length] + if len(validation_video_mask.shape) == 4: # (t, h, w, c) + validation_video_mask = validation_video_mask[..., 0] # (t, h, w) + input_video_mask = torch.from_numpy(validation_video_mask).unsqueeze(0) # (1, t, h, w) + input_video_mask = F.interpolate(input_video_mask.float(), sample_size, mode='area') + input_video_mask = torch.where(input_video_mask < 240, 0, 255).unsqueeze(0) # (1, 1, t, h, w) + + input_video_mask = dilate_video_mask(input_video_mask) + + input_video_mask = input_video_mask.to(input_video.device, input_video.dtype) + + else: + raise NotImplementedError(f"Not supported validation_video_mask format {validation_video_mask}") + + if ref_image is not None: + if isinstance(ref_image, str): + clip_image = Image.open(ref_image).convert("RGB") + else: + clip_image = Image.fromarray(np.array(ref_image, np.uint8)) + else: + clip_image = None + + if ref_image is not None: + if isinstance(ref_image, str): + ref_image = Image.open(ref_image).convert("RGB") + ref_image = ref_image.resize((sample_size[1], sample_size[0])) + ref_image = torch.from_numpy(np.array(ref_image)) + ref_image = ref_image.unsqueeze(0).permute([3, 0, 1, 2]).unsqueeze(0) / 255 + else: + ref_image = torch.from_numpy(np.array(ref_image)) + ref_image = ref_image.unsqueeze(0).permute([3, 0, 1, 2]).unsqueeze(0) / 255 + return input_video, input_video_mask, ref_image, clip_image + + +def read_mask_video_binary(mask_path, sample_size, video_length, dilate_width=11): + video_mask = media.read_video(mask_path)[:video_length] + if len(video_mask.shape) == 4: # (t, h, w, c) + video_mask = video_mask[..., 0] # (t, h, w) + video_mask = torch.from_numpy(video_mask).unsqueeze(0) # (1, t, h, w) + video_mask = F.interpolate(video_mask.float(), sample_size, mode='area') + video_mask = torch.where(video_mask < 240, 0, 255).unsqueeze(0) # (1, 1, t, h, w) + if dilate_width > 0: + video_mask = dilate_video_mask(video_mask, width=dilate_width) 
+ return video_mask + + +def temporal_padding(video, min_length=85, max_length=197, dim=2): + length = video.size(dim) + + min_len = (length // 4) * 4 + 1 + if min_len < length: + min_len += 4 + if (min_len // 4) % 2 == 0: + min_len += 4 + target_length = min(min_len, max_length) + target_length = max(min_length, target_length) + + logger.debug(f'video size: {video.shape}') + if dim == 0: + video = video[:target_length] + elif dim == 1: + video = video[:, :target_length] + elif dim == 2: + video = video[:, :, :target_length] + elif dim == 3: + video = video[:, :, :, :target_length] + else: + raise NotImplementedError + logger.debug(f'making video length: {target_length}, padding length: {target_length - length}') + while video.size(dim) < target_length: + video_flipped = torch.flip(video, [dim]) + video = torch.cat([video, video_flipped], dim=dim) + if dim == 0: + video = video[:target_length] + elif dim == 1: + video = video[:, :target_length] + elif dim == 2: + video = video[:, :, :target_length] + elif dim == 3: + video = video[:, :, :, :target_length] + else: + raise NotImplementedError + logger.debug(f'return video size: {video.shape}') + return video + + +def get_video_mask_input( + input_video_name, + sample_size, + keep_fg_ids=[-1], + max_video_length=49, + temporal_window_size=49, + data_rootdir="datasets/test/", + use_trimask=False, + use_quadmask=False, + use_fixed_bbox=False, + dilate_width=11, + apply_temporal_padding=True, + ): + input_video_path = os.path.join(data_rootdir, input_video_name, "input_video.mp4") + mask_paths = sorted(list(glob.glob(os.path.join(data_rootdir, input_video_name, 'mask_*.mp4')))) + prompt = json.load(open(os.path.join(data_rootdir, input_video_name, "prompt.json")))['bg'] + + input_video = media.read_video(input_video_path) + clip_image = Image.fromarray(np.array(input_video[0])) + + input_video = torch.from_numpy(np.array(input_video))[:max_video_length] + input_video = input_video.permute([3, 0, 1, 2]).float() / 255 # (c, t, h, w) + input_video = F.interpolate(input_video, sample_size, mode='area').unsqueeze(0) # (1, c, t, h, w) + + masks_to_remove = [] + masks_to_keep = [] + if mask_paths: + for fg_id, mask_path in enumerate(mask_paths): + if -1 in keep_fg_ids or fg_id not in keep_fg_ids: + masks_to_remove.append(mask_path) + else: + masks_to_keep.append(mask_path) + input_mask = None + if use_trimask: + for mask_path in masks_to_keep: + mask_i = read_mask_video_binary(mask_path, sample_size, max_video_length, dilate_width=dilate_width) + if input_mask is None: + input_mask = mask_i + else: + input_mask = torch.where(mask_i > 127, 255, input_mask) + if input_mask is not None: + input_mask = torch.where(input_mask > 127, 0, 127) # mask region --> 0 (keep), background --> 127 (neutral) + + for mask_path in masks_to_remove: + mask_i = read_mask_video_binary(mask_path, sample_size, max_video_length, dilate_width=dilate_width) + if input_mask is None: + if use_trimask: + input_mask = torch.where(mask_i > 127, 255, 127) + else: + input_mask = mask_i + else: + input_mask = torch.where(mask_i > 127, 255, input_mask) + else: # already has trimask/quadmask video ready + # Look for mask files (can be trimask or quadmask) + mask_files = sorted(list(glob.glob(os.path.join(data_rootdir, input_video_name, 'mask*.mp4')))) + if not mask_files: + mask_files = sorted(list(glob.glob(os.path.join(data_rootdir, input_video_name, 'quadmask_*.mp4')))) + + if (use_trimask or use_quadmask) and mask_files: + input_mask = 
torch.from_numpy(media.read_video(mask_files[0])).float()[:max_video_length] + if len(input_mask.shape) == 4: input_mask = input_mask[..., 0] + input_mask = F.interpolate(input_mask.unsqueeze(0), sample_size, mode='area').unsqueeze(0) # (1, 1, t, h, w) + + # Apply mask quantization based on mode + if use_quadmask: + # Quadmask mode: preserve 4 values [0, 63, 127, 255] + input_mask = torch.where(input_mask <= 31, 0, input_mask) + input_mask = torch.where((input_mask > 31) * (input_mask <= 95), 63, input_mask) + input_mask = torch.where((input_mask > 95) * (input_mask <= 191), 127, input_mask) + input_mask = torch.where(input_mask > 191, 255, input_mask) + input_mask = 255 - input_mask + logger.debug(f'[QUADMASK INFERENCE] Using 4-value quadmask: [0, 63, 127, 255]') + else: + # Trimask mode: 3 values [0, 127, 255] + input_mask = torch.where(input_mask > 192, 255, input_mask) + input_mask = torch.where((input_mask <= 192) * (input_mask >= 64), 128, input_mask) + input_mask = torch.where(input_mask < 64, 0, input_mask) + input_mask = 255 - input_mask + logger.debug(f'[TRIMASK INFERENCE] Using 3-value trimask: [0, 127, 255]') + else: + logger.error(f'Masks not found in {os.path.join(data_rootdir, input_video_name)}') + sys.exit(1) + + if use_fixed_bbox and not use_trimask: + logger.debug('Using fixed bbox') + input_mask = mask_to_fixed_bbox(input_mask) + + input_mask = input_mask.to(input_video.device, input_video.dtype) + if apply_temporal_padding: + input_video = temporal_padding(input_video, min_length=temporal_window_size, max_length=max_video_length) + input_mask = temporal_padding(input_mask, min_length=temporal_window_size, max_length=max_video_length) + input_mask = input_mask / 255. + logger.debug('dataloading mask', input_mask.min(), input_mask.max(), input_mask.dtype, input_mask.shape) + return input_video, input_mask, prompt, clip_image + + +def get_video_mask_validation( + input_video_name, + sample_size, + max_video_length=49, + temporal_window_size=49, + data_rootdir="datasets/test/", + use_trimask=False, + use_fixed_bbox=False, + dilate_width=11, + caption_path="datasets/vidgen1m/VidGen_1M_video_caption.json", + ): + caption_list = json.load(open(caption_path, 'r')) + prompt = None + for caption_item in caption_list: + if caption_item["vid"] == input_video_name.split('.')[0]: + prompt = caption_item["caption"] + break + assert prompt is not None + + input_video_path = os.path.join(data_rootdir, input_video_name) + input_video = media.read_video(input_video_path) + input_video = torch.from_numpy(np.array(input_video))[:max_video_length] + input_video = input_video.permute([3, 0, 1, 2]).float() / 255 # (c, t, h, w) + input_video = F.interpolate(input_video, sample_size, mode='area').unsqueeze(0) # (1, c, t, h, w) + + input_video = temporal_padding(input_video, min_length=temporal_window_size, max_length=max_video_length) + input_mask = get_random_mask((input_video.size(2), input_video.size(1), input_video.size(3), input_video.size(4))) + input_mask = input_mask.to(input_video.device, input_video.dtype) + input_mask = input_mask.permute(1, 0, 2, 3).unsqueeze(0) + return input_video, input_mask, prompt + + +def get_video( + input_video_path, + sample_size, + max_video_length=49, + temporal_window_size=49, + ): + input_video = media.read_video(input_video_path) + input_video = torch.from_numpy(np.array(input_video))[:max_video_length] + input_video = input_video.permute([3, 0, 1, 2]).float() / 255 # (c, t, h, w) + input_video = F.interpolate(input_video, sample_size, 
mode='area').unsqueeze(0) # (1, c, t, h, w) + + input_video = temporal_padding(input_video, min_length=temporal_window_size, max_length=max_video_length) + return input_video + + +def dilate_video_mask(video_mask, width=11): + + is_tensor = torch.is_tensor(video_mask) + if is_tensor: + video_mask = video_mask[0, 0].numpy() # (t, h, w) + if video_mask.max() > 127: + video_mask = video_mask.astype(np.uint8) + elif video_mask.max() <= 1.0: + video_mask = (video_mask * 255).astype(np.uint8) + is_dim4 = len(video_mask.shape) == 4 + if is_dim4: + video_mask = video_mask[..., -1] + + dilated_video_mask = [] + for mask in video_mask: + dilated_mask = skimage.morphology.binary_dilation(mask, footprint=np.ones((width, width))) + dilated_mask = np.where(dilated_mask, 255, 0) + dilated_video_mask.append(dilated_mask) + dilated_video_mask = np.stack(dilated_video_mask) + + if is_dim4: + dilated_video_mask = dilated_video_mask[..., None] + if is_tensor: + dilated_video_mask = torch.from_numpy(dilated_video_mask).unsqueeze(0).unsqueeze(0) + return dilated_video_mask + + +def erode_video_mask(video_mask, width=5): + + is_tensor = torch.is_tensor(video_mask) + if is_tensor: + video_mask = video_mask[0, 0].numpy() # (t, h, w) + if video_mask.max() > 127: + video_mask = video_mask.astype(np.uint8) + elif video_mask.max() <= 1.0: + video_mask = (video_mask * 255).astype(np.uint8) + is_dim4 = len(video_mask.shape) == 4 + if is_dim4: + video_mask = video_mask[..., -1] + + eroded_video_mask = [] + for mask in video_mask: + eroded_mask = skimage.morphology.binary_erosion(mask, footprint=np.ones((width, width))) + eroded_mask = np.where(eroded_mask, 255, 0) + eroded_video_mask.append(eroded_mask) + eroded_video_mask = np.stack(eroded_video_mask) + + if is_dim4: + eroded_video_mask = eroded_video_mask[..., None] + if is_tensor: + eroded_video_mask = torch.from_numpy(eroded_video_mask).unsqueeze(0).unsqueeze(0) + return eroded_video_mask + + +def mask_to_bbox(video_mask): + is_tensor = torch.is_tensor(video_mask) + if is_tensor: + video_mask = video_mask[0, 0].numpy() # (t, h, w) + if video_mask.max() > 127: + video_mask = video_mask.astype(np.uint8) + elif video_mask.max() <= 1.0: + video_mask = (video_mask * 255).astype(np.uint8) + is_dim4 = len(video_mask.shape) == 4 + if is_dim4: + video_mask = video_mask[..., -1] + + bbox_masks = [] + for mask in video_mask: + bbox_mask = np.zeros_like(mask) + t, b, l, r = 0, mask.shape[0] - 1, 0, mask.shape[1] - 1 + while(mask[t].sum() == 0): t += 1 + while(mask[b].sum() == 0): b -= 1 + while(mask[:, l].sum() == 0): l += 1 + while(mask[:, r].sum() == 0): r -= 1 + bbox_mask[t:b, l:r] = 255 + bbox_masks.append(bbox_mask) + bbox_masks = np.stack(bbox_masks) + if is_dim4: + bbox_masks = bbox_masks[..., None] + if is_tensor: + bbox_masks = torch.from_numpy(bbox_masks).unsqueeze(0).unsqueeze(0) + return bbox_masks + + +def mask_to_fixed_bbox(video_mask): + is_tensor = torch.is_tensor(video_mask) + if is_tensor: + video_mask = video_mask[0, 0].numpy() # (t, h, w) + if video_mask.max() > 127: + video_mask = video_mask.astype(np.uint8) + elif video_mask.max() <= 1.0: + video_mask = (video_mask * 255).astype(np.uint8) + is_dim4 = len(video_mask.shape) == 4 + if is_dim4: + video_mask = video_mask[..., -1] + + bbox_masks = [] + + # for mask in video_mask: + mask = video_mask + bbox_mask = np.zeros_like(mask) + t, b, l, r = 0, mask.shape[1] - 1, 0, mask.shape[2] - 1 + while(mask[:, t].sum() == 0): t += 1 + while(mask[:, b].sum() == 0): b -= 1 + while(mask[:, :, l].sum() == 0): l += 1 + 
while(mask[:, :, r].sum() == 0): r -= 1 + bbox_mask[:, t:b, l:r] = 255 + # bbox_masks.append(bbox_mask) + # bbox_masks = np.stack(bbox_masks) + bbox_masks = bbox_mask + if is_dim4: + bbox_masks = bbox_masks[..., None] + if is_tensor: + bbox_masks = torch.from_numpy(bbox_masks).unsqueeze(0).unsqueeze(0) + return bbox_masks + + +def apply_colormap(video): + if len(video.shape) == 4: + video = video.mean(-1) + if video.max() >= 2.0: + video = video.astype(float) / 255. + + video_colored = [] + cmap = matplotlib.colormaps['turbo'] + for frame in video: + frame = cmap(frame)[..., :3] + video_colored.append(frame) + video_colored = np.stack(video_colored) + return video_colored +
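+
+# Usage sketch (illustrative variable names): color-map a per-pixel error video,
+# mirroring the error-visualization path in save_inout_row above.
+#
+#   err = np.abs(gt_frames - pred_frames).mean(-1)   # (t, h, w), floats in [0, 1]
+#   err_rgb = apply_colormap(err)                     # (t, h, w, 3), 'turbo' colormap
+#   media.write_video("error_vis.mp4", err_rgb, fps=12)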