ethanchern commited on
Commit
873b6ec
·
0 Parent(s):
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +46 -0
  3. .pre-commit-config.yaml +53 -0
  4. README.md +191 -0
  5. example/assets/image.png +3 -0
  6. example/assets/prompt.txt +7 -0
  7. example/base/config.json +16 -0
  8. example/base/run.sh +29 -0
  9. example/distill/config.json +17 -0
  10. example/distill/run.sh +29 -0
  11. example/sr_1080p/config.json +20 -0
  12. example/sr_1080p/run.sh +33 -0
  13. example/sr_540p/config.json +20 -0
  14. example/sr_540p/run.sh +32 -0
  15. inference/common/__init__.py +39 -0
  16. inference/common/arch.py +35 -0
  17. inference/common/config.py +283 -0
  18. inference/common/cpu_offload_wrapper.py +186 -0
  19. inference/common/sequence_schema.py +33 -0
  20. inference/infra/__init__.py +37 -0
  21. inference/infra/checkpoint/__init__.py +20 -0
  22. inference/infra/checkpoint/load_model_checkpoint.py +99 -0
  23. inference/infra/distributed/__init__.py +28 -0
  24. inference/infra/distributed/init_dist_env.py +62 -0
  25. inference/infra/distributed/parallel_state.py +659 -0
  26. inference/infra/distributed/utils.py +47 -0
  27. inference/infra/parallelism/__init__.py +20 -0
  28. inference/infra/parallelism/all_to_all_primitive.py +142 -0
  29. inference/infra/parallelism/gather_scatter_primitive.py +217 -0
  30. inference/infra/parallelism/ulysses_scheduler.py +143 -0
  31. inference/model/dit/__init__.py +18 -0
  32. inference/model/dit/dit_model.py +42 -0
  33. inference/model/dit/dit_module.py +950 -0
  34. inference/model/sa_audio/__init__.py +25 -0
  35. inference/model/sa_audio/sa_audio_model.py +116 -0
  36. inference/model/sa_audio/sa_audio_module.py +478 -0
  37. inference/model/t5_gemma/__init__.py +3 -0
  38. inference/model/t5_gemma/t5_gemma_model.py +43 -0
  39. inference/model/turbo_vaed/__init__.py +4 -0
  40. inference/model/turbo_vaed/turbo_vaed_model.py +33 -0
  41. inference/model/turbo_vaed/turbo_vaed_module.py +1039 -0
  42. inference/model/vae2_2/__init__.py +3 -0
  43. inference/model/vae2_2/vae2_2_model.py +17 -0
  44. inference/model/vae2_2/vae2_2_module.py +1086 -0
  45. inference/pipeline/__init__.py +20 -0
  46. inference/pipeline/data_proxy.py +390 -0
  47. inference/pipeline/entry.py +96 -0
  48. inference/pipeline/pipeline.py +108 -0
  49. inference/pipeline/prompt_process.py +60 -0
  50. inference/pipeline/scheduler_unipc.py +832 -0
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ tmp*
2
+ depyf
3
+ torch_compile_cache
4
+
5
+ __pycache__
6
+ *.so
7
+ build
8
+ .coverage_*
9
+ *.egg-info
10
+ *~
11
+ slurm*
12
+ logs
13
+ .vscode
14
+ nsys*
15
+ tmp/*
16
+ .mypy_cache
17
+ output
18
+ *.pyc
19
+ *.log
20
+ .idea
21
+ *.pt
22
+ *.png
23
+ *.jpg
24
+ *.jpeg
25
+ *.gif
26
+ *.mp3
27
+ *.mp4
28
+ *.pickle
29
+ *.nsys-rep
30
+ *.html
31
+ *.mov
32
+ *.safetensors
33
+ *.json
34
+
35
+ # Keep example media assets tracked.
36
+ !example/assets/*.png
37
+ !example/assets/*.mp4
38
+ !example/**/*.json
39
+
40
+ proj*
41
+ .venv
42
+ var
43
+ tags
44
+ fx_graph*.pdf
45
+ /clean_repo.py
46
+ /rm_caches.sh
.pre-commit-config.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exclude: \.patch$
2
+ repos:
3
+ - repo: https://github.com/pre-commit/pre-commit-hooks
4
+ rev: v4.4.0
5
+ hooks:
6
+ - id: check-added-large-files
7
+ args:
8
+ - --maxkb=30720
9
+ - id: check-merge-conflict
10
+ - id: check-symlinks
11
+ - id: detect-private-key
12
+ files: (?!.*third_party)^.*$ | (?!.*book)^.*$
13
+ - id: end-of-file-fixer
14
+ - id: trailing-whitespace
15
+ - id: requirements-txt-fixer
16
+ - id: sort-simple-yaml
17
+ - repo: https://github.com/Lucas-C/pre-commit-hooks.git
18
+ rev: v1.5.1
19
+ hooks:
20
+ - id: remove-crlf
21
+ files: (?!.*third_party)^.*$ | (?!.*book)^.*$
22
+ - id: remove-tabs
23
+ name: Tabs remover (C++)
24
+ files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|xpu|kps)$
25
+ args: [--whitespaces-count, '2']
26
+ - id: remove-tabs
27
+ name: Tabs remover (Python)
28
+ files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
29
+ args: [--whitespaces-count, '4']
30
+ - repo: https://github.com/psf/black.git
31
+ rev: 23.3.0
32
+ hooks:
33
+ - id: black
34
+ args: [--line-length=127, --skip-string-normalization, --skip-magic-trailing-comma]
35
+ files: (.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
36
+ - repo: https://github.com/pre-commit/mirrors-isort
37
+ rev: v5.10.1
38
+ hooks:
39
+ - id: isort
40
+ args: [--profile=black, --line-length=127, --multi-line=3, --force-grid-wrap=0, --src-path=infra, --src-path=pipeline, --src-path=model]
41
+ files: \.py$
42
+ - repo: https://github.com/PyCQA/autoflake
43
+ rev: v2.3.1
44
+ hooks:
45
+ - id: autoflake
46
+ args: [--remove-all-unused-imports, --remove-unused-variables, --in-place, --ignore-init-module-imports, --ignore-pass-after-docstring]
47
+ files: \.py$
48
+ - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks.git
49
+ rev: v2.9.0
50
+ hooks:
51
+ - id: pretty-format-yaml
52
+ args: [--autofix, --indent, '4']
53
+ additional_dependencies: [setuptools]
README.md ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+
3
+ # daVinci-MagiHuman
4
+
5
+ ### Speed by Simplicity: A Single-Stream Architecture for Fast Audio-Video Generative Foundation Model
6
+
7
+ <p align="center">
8
+ <a href="https://www.sjtu.edu.cn/">SII-GAIR</a> &nbsp;&amp;&nbsp; <a href="https://sand.ai">Sand.ai</a>
9
+ </p>
10
+
11
+ [![Paper](https://img.shields.io/badge/Paper-PDF-red)](https://github.com/GAIR-NLP/daVinci-MagiHuman/blob/main/assets/daVinci_MagiHuman.pdf)
12
+ [![Demo](https://img.shields.io/badge/%F0%9F%A4%97%20Demo-HuggingFace-orange)](https://huggingface.co/spaces/SII-GAIR/daVinci-MagiHuman)
13
+ [![Models](https://img.shields.io/badge/%F0%9F%A4%97%20Models-HuggingFace-yellow)](https://huggingface.co/GAIR-NLP/daVinci-MagiHuman)
14
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
15
+ [![Python](https://img.shields.io/badge/Python-3.12%2B-blue.svg)](https://www.python.org/)
16
+ [![PyTorch](https://img.shields.io/badge/PyTorch-2.9%2B-ee4c2c.svg)](https://pytorch.org/)
17
+
18
+ </div>
19
+
20
+ ## Highlights
21
+
22
+ - **Single-Stream Transformer** — A unified 15B-parameter, 40-layer Transformer that jointly processes text, video, and audio via self-attention only. No cross-attention, no multi-stream complexity.
23
+ - **Exceptional Human-Centric Quality** — Expressive facial performance, natural speech-expression coordination, realistic body motion, and accurate audio-video synchronization.
24
+ - **Multilingual** — Supports Chinese (Mandarin & Cantonese), English, Japanese, Korean, German, and French.
25
+ - **Blazing Fast Inference** — Generates a 5-second 256p video in **2 seconds** and a 5-second 1080p video in **38 seconds** on a single H100 GPU.
26
+ - **State-of-the-Art Results** — Achieves **80.0%** win rate vs Ovi 1.1 and **60.9%** vs LTX 2.3 in pairwise human evaluation over 2,000 comparisons.
27
+ - **Fully Open Source** — We release the complete model stack: base model, distilled model, super-resolution model, and inference code.
28
+
29
+ ## Demo
30
+
31
+ <!--
32
+ To add demo videos:
33
+ 1. Open a GitHub issue on this repo
34
+ 2. Drag & drop your .mp4 files into the issue comment box
35
+ 3. Copy the generated URLs and paste them below
36
+
37
+ Example:
38
+ https://github.com/user-attachments/assets/xxxx-xxxx
39
+ -->
40
+
41
+ https://github.com/user-attachments/assets/PLACEHOLDER_VIDEO_1
42
+
43
+ https://github.com/user-attachments/assets/PLACEHOLDER_VIDEO_2
44
+
45
+ https://github.com/user-attachments/assets/PLACEHOLDER_VIDEO_3
46
+
47
+ ## Architecture
48
+
49
+ <div align="center">
50
+ <img src="assets/architecture.png" width="90%">
51
+ </div>
52
+
53
+ daVinci-MagiHuman uses a single-stream Transformer that takes text tokens, a reference image latent, and noisy video and audio tokens as input, and jointly denoises the video and audio within a unified token sequence.
54
+
55
+ Key design choices:
56
+
57
+ | Component | Description |
58
+ |---|---|
59
+ | **Sandwich Architecture** | First and last 4 layers use modality-specific projections; middle 32 layers share parameters across modalities |
60
+ | **Timestep-Free Denoising** | No explicit timestep embeddings — the model infers the denoising state directly from input latents |
61
+ | **Per-Head Gating** | Learned scalar gates with sigmoid activation on each attention head for training stability |
62
+ | **Unified Conditioning** | Denoising and reference signals handled through a minimal unified interface — no dedicated conditioning branches |
63
+
64
+ ## Performance
65
+
66
+ ### Quantitative Quality Benchmark
67
+
68
+ | Model | Visual Quality ↑ | Text Alignment ↑ | Physical Consistency ↑ | WER ↓ |
69
+ |---|:---:|:---:|:---:|:---:|
70
+ | OVI 1.1 | 4.73 | 4.10 | 4.41 | 40.45% |
71
+ | LTX 2.3 | 4.76 | 4.12 | **4.56** | 19.23% |
72
+ | **daVinci-MagiHuman** | **4.80** | **4.18** | 4.52 | **14.60%** |
73
+
74
+ ### Human Evaluation (2,000 Pairwise Comparisons)
75
+
76
+ | Matchup | daVinci-MagiHuman Win | Tie | Opponent Win |
77
+ |---|:---:|:---:|:---:|
78
+ | vs Ovi 1.1 | **80.0%** | 8.2% | 11.8% |
79
+ | vs LTX 2.3 | **60.9%** | 17.2% | 21.9% |
80
+
81
+ ### Inference Speed (Single H100 GPU, 5-second video)
82
+
83
+ | Resolution | Base (s) | Super-Res (s) | Decode (s) | **Total (s)** |
84
+ |---|:---:|:---:|:---:|:---:|
85
+ | 256p | 1.6 | — | 0.4 | **2.0** |
86
+ | 540p | 1.6 | 5.1 | 1.3 | **8.0** |
87
+ | 1080p | 1.6 | 31.0 | 5.8 | **38.4** |
88
+
89
+ ## Efficient Inference Techniques
90
+
91
+ - **Latent-Space Super-Resolution** — Two-stage pipeline: generate at low resolution, then refine in latent space (not pixel space), avoiding an extra VAE decode-encode round trip.
92
+ - **Turbo VAE Decoder** — A lightweight re-trained decoder that substantially reduces decoding overhead.
93
+ - **Full-Graph Compilation** — [MagiCompiler](https://github.com/sandai/MagiCompiler) fuses operators across Transformer layers for ~1.2x speedup.
94
+ - **Distillation** — DMD-2 distillation enables generation with only 8 denoising steps (no CFG), without sacrificing quality.
95
+
96
+ ## Getting Started
97
+
98
+ ### Option 1: Docker (Recommended)
99
+
100
+ ```bash
101
+ # Pull the MagiCompiler Docker image
102
+ docker pull sandai/magi-compiler:latest
103
+
104
+ # Launch container
105
+ docker run -it --gpus all \
106
+ -v /path/to/models:/models \
107
+ sandai/magi-compiler:latest bash
108
+
109
+ # Install MagiCompiler
110
+ git clone https://github.com/sandai/MagiCompiler
111
+ cd MagiCompiler
112
+ pip install -e . --no-build-isolation --config-settings editable_mode=compat
113
+ cd ..
114
+
115
+ # Clone daVinci-MagiHuman
116
+ git clone https://github.com/GAIR-NLP/daVinci-MagiHuman
117
+ cd daVinci-MagiHuman
118
+ ```
119
+
120
+ ### Option 2: Conda
121
+
122
+ ```bash
123
+ # Create environment
124
+ conda create -n davinci python=3.12
125
+ conda activate davinci
126
+
127
+ # Install PyTorch
128
+ pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0
129
+
130
+ # Install Flash Attention (Hopper)
131
+ git clone https://github.com/Dao-AILab/flash-attention
132
+ cd flash-attention/hopper && python setup.py install && cd ../..
133
+
134
+ # Install MagiCompiler
135
+ git clone https://github.com/sandai/MagiCompiler
136
+ cd MagiCompiler
137
+ pip install -e . --no-build-isolation --config-settings editable_mode=compat
138
+ cd ..
139
+
140
+ # Clone and install daVinci-MagiHuman
141
+ git clone https://github.com/GAIR-NLP/daVinci-MagiHuman
142
+ cd daVinci-MagiHuman
143
+ pip install -r requirements.txt
144
+ ```
145
+
146
+ ### Download Model Checkpoints
147
+
148
+ Download the complete model stack from [HuggingFace](https://huggingface.co/GAIR-NLP/daVinci-MagiHuman) and update the paths in the config files under `example/`.
149
+
150
+ ## Usage
151
+
152
+ Before running, update the checkpoint paths in the config files (`example/*/config.json`) to point to your local model directory.
153
+
154
+ **Base Model (256p)**
155
+ ```bash
156
+ bash example/base/run.sh
157
+ ```
158
+
159
+ **Distilled Model (256p, 8 steps, no CFG)**
160
+ ```bash
161
+ bash example/distill/run.sh
162
+ ```
163
+
164
+ **Super-Resolution to 540p**
165
+ ```bash
166
+ bash example/sr_540p/run.sh
167
+ ```
168
+
169
+ **Super-Resolution to 1080p**
170
+ ```bash
171
+ bash example/sr_1080p/run.sh
172
+ ```
173
+
174
+ ## Citation
175
+
176
+ ```bibtex
177
+ @misc{davinci-magihuman-2025,
178
+ title = {Speed by Simplicity: A Single-Stream Architecture for Fast Audio-Video Generative Foundation Model},
179
+ author = {SII-GAIR and Sand.ai},
180
+ year = {2025},
181
+ url = {https://github.com/GAIR-NLP/daVinci-MagiHuman}
182
+ }
183
+ ```
184
+
185
+ ## Acknowledgements
186
+
187
+ daVinci-MagiHuman builds upon several outstanding open-source projects, including [Wan2.2](https://github.com/Wan-Video/Wan2.2), [Flash Attention](https://github.com/Dao-AILab/flash-attention), and [Turbo-VAED](https://github.com/zou-group/turbo-vaed). We thank the broader open-source community for making this work possible.
188
+
189
+ ## License
190
+
191
+ This project is released under the [Apache License 2.0](https://opensource.org/licenses/Apache-2.0).
example/assets/image.png ADDED

Git LFS Details

  • SHA256: 0659ddf2d52dea107c8437889d850400929901676916ba3c5fe5feab4b116f65
  • Pointer size: 132 Bytes
  • Size of remote file: 1.01 MB
example/assets/prompt.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ "A man with dark hair and glasses, wearing a green button-up shirt and black gloves, stands behind a counter with a pan, gesturing with his left hand, while a blonde woman with her hair in a bun, dressed in a white shirt, holds a microphone to her mouth, looking intently at the pan. The scene is set outdoors under a bright, overcast sky, with decorative palm tree cutouts and a light green vintage Volkswagen bus visible in the background, suggesting a relaxed, possibly tropical, cooking demonstration. The overall emotional disposition is one of focused engagement and professional presentation. The camera maintains a static medium shot, capturing both individuals from the waist up, with a shallow depth of field that keeps them sharp while blurring the background elements. The lighting is bright and even, typical of outdoor daylight, with soft shadows. The color grading is natural and vibrant, reflecting the outdoor setting. The man, with a slight smile, explains in a clear, steady, and informative tone, ""Pulver mit dran gemacht, gibt's ja auch als Paste, aber als Pulver ist das hier ein bisschen..."" as he gestures towards the pan with his left hand, his right hand resting on the counter. The woman listens attentively, her eyebrows slightly raised, her mouth slightly open in an expression of curiosity and concentration, her gaze fixed on the pan.
2
+
3
+ Dialogue:
4
+ <Man in green shirt, German>: ""Pulver mit dran gemacht, gibt's ja auch als Paste, aber als Pulver ist das hier ein bisschen...""
5
+
6
+ Background Sound:
7
+ <No prominent background sound effects>"
example/base/config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "engine_config": {
3
+ "load": "/home/niubility2/hongyu/open_source_ckpt/base",
4
+ "cp_size": 1
5
+ },
6
+ "evaluation_config": {
7
+ "cfg_number": 2,
8
+ "num_inference_steps": 32,
9
+ "audio_model_path": "/home/niubility2/hongyu/open_source_ckpt/audio",
10
+ "txt_model_path": "/home/niubility2/hongyu/open_source_ckpt/t5/t5gemma-9b-9b-ul2",
11
+ "vae_model_path": "/home/niubility2/hongyu/open_source_ckpt/wan_vae/Wan2.2-TI2V-5B",
12
+ "use_turbo_vae": true,
13
+ "student_config_path": "/home/niubility2/hongyu/open_source_ckpt/turbo_vae/TurboV3-Wan22-TinyShallow_7_7.json",
14
+ "student_ckpt_path": "/home/niubility2/hongyu/open_source_ckpt/turbo_vae/checkpoint-340000.ckpt"
15
+ }
16
+ }
example/base/run.sh ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch the base (256p) model example via torchrun on a single node.

set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$PROJECT_ROOT"

# Rendezvous / topology settings, all overridable from the environment.
export MASTER_ADDR="${MASTER_ADDR:-localhost}"
export MASTER_PORT="${MASTER_PORT:-6009}"
export NNODES="${NNODES:-1}"
export NODE_RANK="${NODE_RANK:-0}"
export GPUS_PER_NODE="${GPUS_PER_NODE:-1}"
export WORLD_SIZE="$((GPUS_PER_NODE * NNODES))"

export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
export NCCL_ALGO="${NCCL_ALGO:-^NVLS}"
export PYTHONPATH="${PROJECT_ROOT}:${PYTHONPATH:-}"

DISTRIBUTED_ARGS="--nnodes=${NNODES} --node_rank=${NODE_RANK} --nproc_per_node=${GPUS_PER_NODE} --rdzv-backend=c10d --rdzv-endpoint=${MASTER_ADDR}:${MASTER_PORT}"

# Capture one timestamp so the output directory and log file names always
# match, even if two separate $(date) calls would straddle a second boundary.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"

torchrun ${DISTRIBUTED_ARGS} inference/pipeline/entry.py \
    --config-load-path example/base/config.json \
    --prompt "$(<example/assets/prompt.txt)" \
    --image_path example/assets/image.png \
    --seconds 10 \
    --br_width 448 \
    --br_height 256 \
    --output_path "output_example_base_${TIMESTAMP}" \
    2>&1 | tee "log_example_base_${TIMESTAMP}.log"
example/distill/config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "engine_config": {
3
+ "load": "/home/niubility2/hongyu/open_source_ckpt/distill",
4
+ "distill": true,
5
+ "cp_size": 1
6
+ },
7
+ "evaluation_config": {
8
+ "cfg_number": 1,
9
+ "num_inference_steps": 8,
10
+ "audio_model_path": "/home/niubility2/hongyu/open_source_ckpt/audio",
11
+ "txt_model_path": "/home/niubility2/hongyu/open_source_ckpt/t5/t5gemma-9b-9b-ul2",
12
+ "vae_model_path": "/home/niubility2/hongyu/open_source_ckpt/wan_vae/Wan2.2-TI2V-5B",
13
+ "use_turbo_vae": true,
14
+ "student_config_path": "/home/niubility2/hongyu/open_source_ckpt/turbo_vae/TurboV3-Wan22-TinyShallow_7_7.json",
15
+ "student_ckpt_path": "/home/niubility2/hongyu/open_source_ckpt/turbo_vae/checkpoint-340000.ckpt"
16
+ }
17
+ }
example/distill/run.sh ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch the distilled (256p, 8-step, no-CFG) model example via torchrun.

set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$PROJECT_ROOT"

# Rendezvous / topology settings, all overridable from the environment.
export MASTER_ADDR="${MASTER_ADDR:-localhost}"
export MASTER_PORT="${MASTER_PORT:-6010}"
export NNODES="${NNODES:-1}"
export NODE_RANK="${NODE_RANK:-0}"
export GPUS_PER_NODE="${GPUS_PER_NODE:-1}"
export WORLD_SIZE="$((GPUS_PER_NODE * NNODES))"

export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
export NCCL_ALGO="${NCCL_ALGO:-^NVLS}"
export PYTHONPATH="${PROJECT_ROOT}:${PYTHONPATH:-}"

DISTRIBUTED_ARGS="--nnodes=${NNODES} --node_rank=${NODE_RANK} --nproc_per_node=${GPUS_PER_NODE} --rdzv-backend=c10d --rdzv-endpoint=${MASTER_ADDR}:${MASTER_PORT}"

# Capture one timestamp so the output directory and log file names always
# match, even if two separate $(date) calls would straddle a second boundary.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"

torchrun ${DISTRIBUTED_ARGS} inference/pipeline/entry.py \
    --config-load-path example/distill/config.json \
    --prompt "$(<example/assets/prompt.txt)" \
    --image_path example/assets/image.png \
    --seconds 10 \
    --br_width 448 \
    --br_height 256 \
    --output_path "output_example_distill_${TIMESTAMP}" \
    2>&1 | tee "log_example_distill_${TIMESTAMP}.log"
example/sr_1080p/config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "engine_config": {
3
+ "load": "/home/niubility2/hongyu/open_source_ckpt/base",
4
+ "cp_size": 1
5
+ },
6
+ "evaluation_config": {
7
+ "cfg_number": 2,
8
+ "num_inference_steps": 32,
9
+ "audio_model_path": "/home/niubility2/hongyu/open_source_ckpt/audio",
10
+ "txt_model_path": "/home/niubility2/hongyu/open_source_ckpt/t5/t5gemma-9b-9b-ul2",
11
+ "vae_model_path": "/home/niubility2/hongyu/open_source_ckpt/wan_vae/Wan2.2-TI2V-5B",
12
+ "use_sr_model": true,
13
+ "sr_model_path": "/home/niubility2/hongyu/open_source_ckpt/1080p_sr",
14
+ "sr_num_inference_steps": 5,
15
+ "sr_cfg_number": 1,
16
+ "use_turbo_vae": true,
17
+ "student_config_path": "/home/niubility2/hongyu/open_source_ckpt/turbo_vae/TurboV3-Wan22-TinyShallow_7_7.json",
18
+ "student_ckpt_path": "/home/niubility2/hongyu/open_source_ckpt/turbo_vae/checkpoint-340000.ckpt"
19
+ }
20
+ }
example/sr_1080p/run.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch the base model plus 1080p latent super-resolution example via torchrun.

set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$PROJECT_ROOT"

# Rendezvous / topology settings, all overridable from the environment.
export MASTER_ADDR="${MASTER_ADDR:-localhost}"
export MASTER_PORT="${MASTER_PORT:-6012}"
export NNODES="${NNODES:-1}"
export NODE_RANK="${NODE_RANK:-0}"
export GPUS_PER_NODE="${GPUS_PER_NODE:-1}"
export WORLD_SIZE="$((GPUS_PER_NODE * NNODES))"

export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
export NCCL_ALGO="${NCCL_ALGO:-^NVLS}"
export PYTHONPATH="${PROJECT_ROOT}:${PYTHONPATH:-}"
export SR2_1080="${SR2_1080:-true}"
export CPU_OFFLOAD="${CPU_OFFLOAD:-true}"

DISTRIBUTED_ARGS="--nnodes=${NNODES} --node_rank=${NODE_RANK} --nproc_per_node=${GPUS_PER_NODE} --rdzv-backend=c10d --rdzv-endpoint=${MASTER_ADDR}:${MASTER_PORT}"

# Capture one timestamp so the output directory and log file names always
# match, even if two separate $(date) calls would straddle a second boundary.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"

torchrun ${DISTRIBUTED_ARGS} inference/pipeline/entry.py \
    --config-load-path example/sr_1080p/config.json \
    --prompt "$(<example/assets/prompt.txt)" \
    --image_path example/assets/image.png \
    --seconds 10 \
    --br_width 448 \
    --br_height 256 \
    --output_path "output_example_sr_1080p_${TIMESTAMP}" \
    --sr_width 1920 \
    --sr_height 1088 \
    2>&1 | tee "log_example_sr_1080p_${TIMESTAMP}.log"
example/sr_540p/config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "engine_config": {
3
+ "load": "/home/niubility2/hongyu/open_source_ckpt/base",
4
+ "cp_size": 1
5
+ },
6
+ "evaluation_config": {
7
+ "cfg_number": 2,
8
+ "num_inference_steps": 32,
9
+ "audio_model_path": "/home/niubility2/hongyu/open_source_ckpt/audio",
10
+ "txt_model_path": "/home/niubility2/hongyu/open_source_ckpt/t5/t5gemma-9b-9b-ul2",
11
+ "vae_model_path": "/home/niubility2/hongyu/open_source_ckpt/wan_vae/Wan2.2-TI2V-5B",
12
+ "use_sr_model": true,
13
+ "sr_model_path": "/home/niubility2/hongyu/open_source_ckpt/540p_sr",
14
+ "sr_num_inference_steps": 5,
15
+ "sr_cfg_number": 1,
16
+ "use_turbo_vae": true,
17
+ "student_config_path": "/home/niubility2/hongyu/open_source_ckpt/turbo_vae/TurboV3-Wan22-TinyShallow_7_7.json",
18
+ "student_ckpt_path": "/home/niubility2/hongyu/open_source_ckpt/turbo_vae/checkpoint-340000.ckpt"
19
+ }
20
+ }
example/sr_540p/run.sh ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch the base model plus 540p latent super-resolution example via torchrun.

set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$PROJECT_ROOT"

# Rendezvous / topology settings, all overridable from the environment.
export MASTER_ADDR="${MASTER_ADDR:-localhost}"
export MASTER_PORT="${MASTER_PORT:-6011}"
export NNODES="${NNODES:-1}"
export NODE_RANK="${NODE_RANK:-0}"
export GPUS_PER_NODE="${GPUS_PER_NODE:-1}"
export WORLD_SIZE="$((GPUS_PER_NODE * NNODES))"

export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"
export NCCL_ALGO="${NCCL_ALGO:-^NVLS}"
export PYTHONPATH="${PROJECT_ROOT}:${PYTHONPATH:-}"
# Overridable default (was hard-coded), matching sr_1080p/run.sh.
export CPU_OFFLOAD="${CPU_OFFLOAD:-true}"

DISTRIBUTED_ARGS="--nnodes=${NNODES} --node_rank=${NODE_RANK} --nproc_per_node=${GPUS_PER_NODE} --rdzv-backend=c10d --rdzv-endpoint=${MASTER_ADDR}:${MASTER_PORT}"

# Capture one timestamp so the output directory and log file names always
# match, even if two separate $(date) calls would straddle a second boundary.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"

torchrun ${DISTRIBUTED_ARGS} inference/pipeline/entry.py \
    --config-load-path example/sr_540p/config.json \
    --prompt "$(<example/assets/prompt.txt)" \
    --image_path example/assets/image.png \
    --seconds 10 \
    --br_width 448 \
    --br_height 256 \
    --output_path "output_example_sr_540p_${TIMESTAMP}" \
    --sr_width 896 \
    --sr_height 512 \
    2>&1 | tee "log_example_sr_540p_${TIMESTAMP}.log"
inference/common/__init__.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .arch import get_arch_memory, is_hopper_arch
16
+ from .config import (
17
+ DataProxyConfig,
18
+ EngineConfig,
19
+ EvaluationConfig,
20
+ parse_config,
21
+ )
22
+ from .cpu_offload_wrapper import CPUOffloadWrapper
23
+ from .sequence_schema import Modality, VarlenHandler
24
+
25
+ __all__ = [
26
+ # arch
27
+ "get_arch_memory",
28
+ "is_hopper_arch",
29
+ # config
30
+ "EngineConfig",
31
+ "DataProxyConfig",
32
+ "EvaluationConfig",
33
+ "parse_config",
34
+ # cpu offload wrapper
35
+ "CPUOffloadWrapper",
36
+ # sequence schema
37
+ "Modality",
38
+ "VarlenHandler",
39
+ ]
inference/common/arch.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+
17
+
18
def is_hopper_arch() -> bool:
    """Return True when the current CUDA device is Hopper-class (SM 9.x).

    Guards on CUDA availability (mirroring ``get_arch_memory``) so that
    CPU-only hosts get a safe ``False`` instead of a RuntimeError from
    ``torch.cuda.get_device_capability``.
    """
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability()
    return major == 9
20
+
21
+
22
def get_arch_memory(unit: str = "GB"):
    """Return the total memory of the current CUDA device.

    Args:
        unit: Target unit, one of ``"B"``, ``"KB"``, ``"MB"``, ``"GB"``
            (case-insensitive; binary multiples of 1024).

    Returns:
        Total device memory as a float in the requested unit, or ``0``
        when CUDA is unavailable.

    Raises:
        ValueError: If ``unit`` is not a recognized unit. Validation runs
            before the availability check so a bad argument fails fast
            even on CPU-only hosts.
    """
    divisors = {"B": 1, "KB": 1024, "MB": 1024**2, "GB": 1024**3}
    try:
        divisor = divisors[unit.upper()]
    except KeyError:
        raise ValueError(f"Invalid unit: {unit}") from None
    if not torch.cuda.is_available():
        return 0
    total_bytes = torch.cuda.get_device_properties(torch.cuda.current_device()).total_memory
    return float(total_bytes) / divisor
inference/common/config.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import copy
17
+ import json
18
+ import os
19
+ import sys
20
+ from pathlib import Path
21
+ from typing import Literal, Tuple
22
+
23
+ import torch
24
+ from inference.utils import env_is_true, print_rank_0
25
+ from pydantic import BaseModel, ConfigDict, Field, field_serializer, field_validator, model_validator
26
+ from pydantic_settings import (
27
+ BaseSettings,
28
+ CliSettingsSource,
29
+ JsonConfigSettingsSource,
30
+ PydanticBaseSettingsSource,
31
+ SettingsConfigDict,
32
+ )
33
+
34
+
35
class EngineConfig(BaseModel):
    """Engine-level settings: seeding, checkpoint loading, and the
    torch.distributed / model-parallelism layout used at inference time."""

    # Basic settings
    seed: int = Field(1234, description="Random seed used for python, numpy, pytorch, and cuda.")
    load: str | None = Field(None, description="Directory containing a model checkpoint.")

    # Parallelism strategy
    distributed_backend: Literal["nccl", "gloo"] = Field("nccl", description="Distributed backend. Choices: ['nccl', 'gloo'].")
    distributed_timeout_minutes: int = Field(10, description="Timeout minutes for torch.distributed.")
    sequence_parallel: bool = Field(False, description="Enable sequence parallel optimization.")
    tp_size: int = Field(1, description="Degree of tensor model parallelism.")
    pp_size: int = Field(1, description="Degree of pipeline model parallelism.")
    cp_size: int = Field(1, description="Degree of context parallelism.")
    dp_size: int = Field(1, description="Degree of data parallelism.")
48
+
49
+
50
class ModelConfig(BaseModel):
    """Model configuration class defining various parameters for video generation model"""

    # arbitrary_types_allowed lets fields hold torch.dtype values;
    # protected_namespaces=() silences pydantic's "model_" prefix warning.
    model_config = ConfigDict(arbitrary_types_allowed=True, protected_namespaces=())

    num_layers: int = Field(default=40, description="Number of Transformer layers")
    hidden_size: int = Field(default=5120, description="Hidden size of the Transformer model")
    head_dim: int = Field(default=128, description="Dimension per attention head")
    num_query_groups: int = Field(default=8, description="Number of query groups for grouped-query attention")
    video_in_channels: int = Field(default=48 * 4, description="Number of video input channels after patch embedding")
    audio_in_channels: int = Field(default=64, description="Number of audio input channels")
    text_in_channels: int = Field(default=3584, description="Number of text input channels")
    checkpoint_qk_layernorm_rope: bool = Field(default=False, description="Enable checkpointing for QK layernorm + RoPE")
    params_dtype: torch.dtype | str = Field(default=torch.float32, description="Parameter dtype")
    tread_config: dict = Field(
        default=dict(
            # Token routing is applied after the forward of layers 0-1
            # (start_layer_idx=2) and before the forward of layers 26-29
            # (end_layer_idx=25).
            selection_rate=0.5, start_layer_idx=2, end_layer_idx=25
        ),
        description="TReAD (Token Routing and Early Drop) configuration",
    )
    mm_layers: list[int] = Field(default=[0, 1, 2, 3, 36, 37, 38, 39], description="Indices of multimodal fusion layers")
    local_attn_layers: list[int] = Field(default=[], description="Indices of local attention layers")
    enable_attn_gating: bool = Field(default=True, description="Enable attention gating")
    activation_type: str = Field(default="swiglu7", description="Activation type")
    gelu7_layers: list[int] = Field(default=[0, 1, 2, 3], description="Indices of gelu7 layers")

    # Add computed fields
    num_heads_q: int = Field(default=0, description="Number of query heads (calculated from hidden_size // head_dim)")
    num_heads_kv: int = Field(default=0, description="Number of key-value heads (calculated from num_query_groups)")
    post_norm_layers: list[int] = Field(default=[], description="Indices of post norm layers")

    @field_serializer("params_dtype")
    def serialize_dtype(self, value: torch.dtype | str) -> str:
        # Serialize e.g. torch.bfloat16 as "torch.bfloat16" so configs round-trip to JSON.
        return str(value)

    @field_validator("params_dtype", mode="before")
    @classmethod
    def validate_dtype(cls, value) -> torch.dtype:
        """Coerce a dtype given as a string (e.g. "bfloat16" or "torch.bfloat16")
        back into a torch.dtype; pass real torch.dtype values through unchanged."""
        if isinstance(value, torch.dtype):
            return value
        if isinstance(value, str):
            if value == "torch.float32" or value == "float32":
                return torch.float32
            elif value == "torch.float16" or value == "float16":
                return torch.float16
            elif value == "torch.bfloat16" or value == "bfloat16":
                return torch.bfloat16
        raise ValueError(f"Unknown torch.dtype string: '{value}'")
98
+
99
+
100
class DataProxyConfig(BaseModel):
    """Configuration for the data proxy: patching granularity, RoPE handling,
    and per-modality positional offsets."""

    t_patch_size: int = Field(default=1, description="Patch size for time dimension")
    patch_size: int = Field(default=2, description="Patch size for spatial dimensions")
    frame_receptive_field: int = Field(default=11, description="Frame receptive field")
    spatial_rope_interpolation: Literal["inter", "extra"] = Field(
        default="extra", description="Spatial rope interpolation method."
    )
    ref_audio_offset: int = Field(default=1000, description="Offset for reference audio.")
    text_offset: int = Field(default=0, description="Offset for text.")
    coords_style: Literal["v1", "v2"] = Field(default="v2", description="Coords style.")
110
+
111
+
112
class EvaluationConfig(BaseModel):
    """Evaluation configuration class defining parameters for model evaluation and inference"""

    # protected_namespaces=() silences pydantic's "model_" prefix warning.
    model_config = ConfigDict(protected_namespaces=())

    data_proxy_config: DataProxyConfig = Field(default=DataProxyConfig(), description="Data proxy configuration")

    fps: int = Field(default=25, description="Frames per second for video generation")
    num_inference_steps: int = Field(default=32, description="Number of denoising steps during inference")
    video_txt_guidance_scale: float = Field(default=5.0, description="Video text guidance scale for text conditioning")
    audio_txt_guidance_scale: float = Field(default=5.0, description="Audio text guidance scale for text conditioning")
    txt_encoder_type: Literal["t5_gemma"] = Field(default="t5_gemma", description="Text encoder type.")
    t5_gemma_target_length: int = Field(default=640, description="Target length for T5-Gemma encoder.")
    support_ref_audio: bool = Field(default=True, description="Whether to support the ref_audio feature")
    shift: float = Field(default=5.0, description="Temporal shift parameter for video generation")
    exp_name: str = Field(default="exp_debug", description="Experiment name with evaluation suffix")
    audio_model_path: str = Field(default="", description="Path to the pretrained audio model")
    txt_model_path: str = Field(default="", description="Path to the pretrained txt model")
    vae_model_path: str = Field(default="", description="Path to the pretrained vae model")
    vae_stride: Tuple[int, int, int] = Field(default=(4, 16, 16), description="VAE stride in format (time, height, width)")
    z_dim: int = Field(default=48, description="Dimension of z space.")
    patch_size: Tuple[int, int, int] = Field(default=(1, 2, 2), description="Patch size in format (time, height, width)")
    cfg_number: int = Field(default=2, description="Classifier-free guidance number")
    sr_cfg_number: int = Field(default=2, description="SR Classifier-free guidance number")

    # flops recording
    enable_flops_recording: bool = Field(default=False, description="Whether to enable flops recording")

    # super resolution model configuration
    use_sr_model: bool = Field(default=False, description="Whether to use the super resolution model")
    sr_model_path: str = Field(default="", description="Path to the pretrained super resolution model")
    sr_num_inference_steps: int = Field(default=5, description="Number of denoising steps during super resolution inference")
    noise_value: int = Field(default=220, description="Noise value for the super resolution model")
    sr_video_txt_guidance_scale: float = Field(
        default=3.5, description="Super resolution video text guidance scale for text conditioning"
    )
    use_cfg_trick: bool = Field(default=True, description="Whether to use the cfg trick")
    cfg_trick_start_frame: int = Field(default=13, description="Start frame for the cfg trick")
    cfg_trick_value: float = Field(default=2.0, description="Value for the cfg trick")
    using_sde_flag: bool = Field(default=False, description="Whether to use the sde flag")
    sr_audio_noise_scale: float = Field(default=0.7, description="Noise scale for the super resolution audio")

    # turbo-vae config
    use_turbo_vae: bool = Field(default=True, description="Whether to use the turbo-vae")
    student_config_path: str = Field(default="", description="Path to the student config")
    student_ckpt_path: str = Field(default="", description="Path to the student checkpoint")
158
+
159
+
160
class MagiPipelineConfig(BaseSettings):
    """Top-level pipeline settings aggregating engine, model, and evaluation configs.

    Values are resolved from environment variables, CLI flags, and an optional
    ``--config-load-path`` JSON file; the precedence is defined by
    ``settings_customise_sources`` (earlier sources win).
    """

    engine_config: EngineConfig = Field(description="Engine configuration.", default_factory=EngineConfig)
    arch_config: ModelConfig = Field(default=ModelConfig(), description="Model configuration.")
    evaluation_config: EvaluationConfig = Field(default=EvaluationConfig(), description="Evaluation configuration.")
    sr_arch_config: ModelConfig = Field(default=ModelConfig(), description="Super resolution model configuration.")
    model_config = SettingsConfigDict(cli_parse_args=True, cli_ignore_unknown_args=True, cli_implicit_flags=True)

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        # Peek at argv only for --config-load-path; all other flags are left to
        # pydantic's own CLI source (parse_known_args ignores them here).
        parser = argparse.ArgumentParser(allow_abbrev=False)
        parser.add_argument("--config-load-path", type=str, default=None, help="Path to load the config.json from")
        args, _ = parser.parse_known_args()
        config_load_path = args.config_load_path
        # Source order defines precedence: env > CLI > JSON file > the rest.
        sources = [env_settings, CliSettingsSource(settings_cls, cli_parse_args=True, cli_ignore_unknown_args=True)]
        if config_load_path:
            sources.append(JsonConfigSettingsSource(settings_cls, json_file=config_load_path))

        sources.extend([init_settings, dotenv_settings, file_secret_settings])
        return tuple(sources)

    def save_to_json(self, json_path: str, indent: int = 4):
        """Write this config to *json_path*, creating parent dirs as needed."""
        path = Path(json_path)
        path.parent.mkdir(parents=True, exist_ok=True)
        # Uses __str__, so the saved file matches the logged representation.
        path.write_text(self.__str__(indent=indent))

    def __str__(self, indent: int = 4):
        data = self.model_dump(mode="json")
        formatted = json.dumps(data, indent=indent, ensure_ascii=False, sort_keys=False)
        class_name = self.__class__.__name__
        # NOTE(review): stripping the double quotes makes this output (and the
        # file written by save_to_json) NOT valid JSON -- confirm that nothing
        # tries to json.load the saved file.
        return f"{class_name}:\n{formatted}".replace('"', "")

    def __repr__(self, indent: int = 4):
        return self.__str__(indent=indent)

    @model_validator(mode="after")
    def validate_engine_config(self):
        """Derive dp_size from WORLD_SIZE and assert the parallel sizes factorize it."""
        world_size = int(os.getenv("WORLD_SIZE", "1"))
        self.engine_config.dp_size = world_size // (
            self.engine_config.tp_size * self.engine_config.pp_size * self.engine_config.cp_size
        )

        assert world_size % self.engine_config.tp_size == 0
        tp_pp_size = self.engine_config.tp_size * self.engine_config.pp_size
        assert world_size % tp_pp_size == 0
        tp_pp_cp_size = tp_pp_size * self.engine_config.cp_size
        assert world_size % tp_pp_cp_size == 0
        assert world_size == self.engine_config.dp_size * tp_pp_cp_size

        # Sequence parallelism only applies together with tensor parallelism.
        if self.engine_config.tp_size == 1:
            self.engine_config.sequence_parallel = False

        return self

    @model_validator(mode="after")
    def post_override_config(self):
        """Fill derived head counts and build the SR arch config from the base one."""
        self.arch_config.num_heads_q = self.arch_config.hidden_size // self.arch_config.head_dim
        self.arch_config.num_heads_kv = self.arch_config.num_query_groups

        # SR model starts as a deep copy of the base arch config.
        self.sr_arch_config = copy.deepcopy(self.arch_config)
        if env_is_true("SR2_1080"):
            # NOTE(review): this second deepcopy is redundant -- it duplicates
            # the unconditional copy just above.
            self.sr_arch_config = copy.deepcopy(self.arch_config)
            # fmt: off
            self.sr_arch_config.local_attn_layers = [
                0, 1, 2,
                4, 5, 6,
                8, 9, 10,
                12, 13, 14,
                16, 17, 18,
                20, 21, 22,
                24, 25, 26,
                28, 29, 30,
                32, 33, 34,
                35, 36, 37,
                38, 39,
            ]
            # fmt: on
            self.evaluation_config.sr_video_txt_guidance_scale = 3.5

        return self
246
+
247
+
248
def prevent_unsupported_list_syntax():
    """
    Check sys.argv before Pydantic parsing to prevent using unsupported list syntax.

    Raises:
        ValueError: if an option flag is followed by two or more space-separated
            values (e.g. ``--layers 1 2 3``), a list syntax that Pydantic's CLI
            parsing does not support.
    """
    args = sys.argv[1:]
    for i, arg in enumerate(args):
        # Only option flags can take list values; skipping non-dash tokens avoids
        # false positives on positional arguments and on the values themselves
        # (e.g. "prog a b c" must not raise).
        if not arg.startswith("-"):
            continue
        if i + 2 < len(args):
            value1, value2 = args[i + 1], args[i + 2]
            if not value1.startswith("-") and not value2.startswith("-"):
                error_msg = (
                    f"\n\nError: Detected list parameter '{arg}' using unsupported command line syntax.\n"
                    f"Error pattern: '{arg} {value1} {value2} ...'\n\n"
                    "Pydantic (or related libraries) do not support passing lists with space-separated multiple values.\n"
                    "Please use one of the following supported formats:\n\n"
                    f"1. JSON style: {arg} '[{value1},{value2},...]'\n"
                    f"2. Argparse style: {arg} {value1} {arg} {value2}\n"
                    f"3. Lazy style: {arg} {value1},{value2}\n"
                )
                raise ValueError(error_msg)
267
+
268
+
269
def parse_config(verbose: bool = False) -> MagiPipelineConfig:
    """Build the pipeline config, optionally persisting it and printing it.

    Args:
        verbose: When True, print the resolved config on rank 0.

    Returns:
        The fully resolved MagiPipelineConfig instance.
    """
    save_parser = argparse.ArgumentParser(description="Load and optionally save config", allow_abbrev=False)
    save_parser.add_argument("--config-save-path", type=str, default=None, help="Path to save the config.json to")
    known, _ = save_parser.parse_known_args()

    # Guard against "--flag a b c" list syntax before pydantic parses argv.
    prevent_unsupported_list_syntax()
    pipeline_config = MagiPipelineConfig()

    save_path = known.config_save_path
    if save_path is not None:
        pipeline_config.save_to_json(save_path)

    if verbose:
        print_rank_0(pipeline_config)

    return pipeline_config
inference/common/cpu_offload_wrapper.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Any, Callable, Dict, Tuple
16
+
17
+ import torch
18
+
19
+
20
class CPUOffloadWrapper:
    """Transparent proxy that can keep a model's weights on CPU and hop them to
    GPU only for the duration of each compute call.

    Attribute access and method calls are forwarded to the wrapped model;
    compute methods are wrapped so that, when ``is_cpu_offload`` and
    ``is_running_on_gpu`` are both set, the model moves to CUDA for the call
    and its original CPU tensors are re-attached afterwards. The wrapper
    itself is immutable (see ``__setattr__``).
    """

    def __init__(self, model: Any, is_cpu_offload: bool = False, is_running_on_gpu: bool = True):
        # object.__setattr__ is required because this class forbids normal
        # attribute assignment (see __setattr__ below).
        object.__setattr__(self, "model", model)
        object.__setattr__(self, "is_cpu_offload", is_cpu_offload)
        object.__setattr__(self, "is_running_on_gpu", is_running_on_gpu)

        cpu_device = torch.device("cpu")
        cuda_device = torch.device("cuda")
        object.__setattr__(self, "cpu_device", cpu_device)
        object.__setattr__(self, "cuda_device", cuda_device)

        # Initialize placement location
        if is_cpu_offload:
            self.model.to(cpu_device)
        else:
            self.model.to(cuda_device)

        # Whitelist non-compute methods that shouldn't trigger device hops (pass-through only; no device switch)
        object.__setattr__(
            self,
            "_non_compute_methods",
            {
                "to",
                "cpu",
                "cuda",
                "eval",
                "train",
                "state_dict",
                "load_state_dict",
                "parameters",
                "named_parameters",
                "buffers",
                "named_buffers",
                "modules",
                "named_modules",
                "children",
                "named_children",
                "register_forward_hook",
                "register_forward_pre_hook",
                "register_full_backward_hook",
                "zero_grad",
                "share_memory",
                "half",
                "float",
                "bfloat16",
            },
        )

    # Get current primary device (for external reads)
    @property
    def device(self) -> torch.device:
        # nn.Module: use the first parameter's device. Arbitrary objects: probe
        # attributes for a tensor or sub-module. Falls back to the CUDA device
        # when nothing device-bearing is found.
        if isinstance(self.model, torch.nn.Module):
            return next(self.model.parameters()).device
        else:
            for k, v in self.model.__dict__.items():
                if isinstance(v, torch.Tensor):
                    return v.device
                elif isinstance(v, torch.nn.Module):
                    return next(v.parameters()).device
            return self.cuda_device

    def _backup_cpu_state(self) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], Dict[str, Any]]:
        """Snapshot references to the current parameter/buffer storages.

        Returns ``(param_backup, buffer_backup, other_tensor_backup)`` keyed by
        dotted names. Only references to the ``.data`` tensors are stored, so
        the originals can be re-attached after a temporary ``.to(cuda)`` move
        without copying back.
        """
        # Backup module parameters and buffers
        module_param_backup = {}
        module_buffer_backup = {}
        other_backup = {}

        def save_module_state(mod: torch.nn.Module, prefix: str):
            for name, param in mod.named_parameters():
                if param is not None:
                    full_key = prefix + name
                    module_param_backup[full_key] = param.data
            for name, buffer in mod.named_buffers():
                if buffer is not None:
                    full_key = prefix + name
                    module_buffer_backup[full_key] = buffer.data

        if isinstance(self.model, torch.nn.Module):
            save_module_state(self.model, "")
        else:
            # Non-Module wrapper objects: back up sub-modules and loose tensors
            # found directly on the instance __dict__.
            for name, attr_val in self.model.__dict__.items():
                if isinstance(attr_val, torch.nn.Module):
                    save_module_state(attr_val, name + ".")
                elif isinstance(attr_val, torch.Tensor):
                    other_backup[name] = attr_val

        return module_param_backup, module_buffer_backup, other_backup

    def _restore_cpu_state(self, backups: Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor], Dict[str, Any]]):
        """Re-attach the tensors captured by ``_backup_cpu_state``."""
        # Restore module parameters and buffers
        module_param_backup, module_buffer_backup, other_backup = backups

        def restore_module_state(mod: torch.nn.Module, prefix: str):
            for name, param in mod.named_parameters():
                full_key = prefix + name
                if full_key in module_param_backup:
                    param.data = module_param_backup[full_key]

            for name, buffer in mod.named_buffers():
                full_key = prefix + name
                if full_key in module_buffer_backup:
                    buffer.data = module_buffer_backup[full_key]

        if isinstance(self.model, torch.nn.Module):
            restore_module_state(self.model, "")
        else:
            for name, attr_val in self.model.__dict__.items():
                if isinstance(attr_val, torch.nn.Module):
                    restore_module_state(attr_val, name + ".")

        if not isinstance(self.model, torch.nn.Module):
            for name, val in other_backup.items():
                setattr(self.model, name, val)

    # Unified on/offload executor
    def _run_with_optional_offload(self, func: Callable[..., Any], *args, **kwargs):
        if self.is_cpu_offload and self.is_running_on_gpu:
            # Offload path: move model to GPU for the call, then re-attach the
            # saved CPU tensors so the GPU copies can be freed.
            # NOTE(review): tensor args are NOT moved to CUDA on this path --
            # presumably callers already pass CUDA tensors; confirm.
            backups = self._backup_cpu_state()
            self.model.to(self.cuda_device)
            try:
                return func(*args, **kwargs)
            finally:
                if torch.cuda.is_available():
                    torch.cuda.synchronize()
                self._restore_cpu_state(backups)
        else:
            # Make sure model and args are on the same device
            args = [
                arg.to(self.device) if isinstance(arg, torch.Tensor) and arg.device != self.device else arg for arg in args
            ]
            kwargs = {
                k: v.to(self.device) if isinstance(v, torch.Tensor) and v.device != self.device else v
                for k, v in kwargs.items()
            }
            return func(*args, **kwargs)

    # Direct call (equivalent to forward)
    def __call__(self, *args, **kwargs):
        return self._run_with_optional_offload(self.model.__call__, *args, **kwargs)

    # Explicit forward; some code calls model.forward(...)
    def forward(self, *args, **kwargs):
        return self._run_with_optional_offload(self.model.forward, *args, **kwargs)

    # Key: passthrough all attrs/methods. For callables, wrap with on/offload; for non-compute methods, pass-through only with no device switch.
    def __getattr__(self, name: str):
        # Only invoked when normal lookup fails, i.e. for names not set via
        # object.__setattr__ above -- so self.model here resolves directly.
        # Fetch attribute from the wrapped model first
        attr = getattr(self.model, name)

        # Wrap methods (except in whitelist)
        if callable(attr) and name not in self._non_compute_methods:

            def _wrapped(*args, **kwargs):
                return self._run_with_optional_offload(attr, *args, **kwargs)

            return _wrapped

        return attr

    def __dir__(self):
        # Merge the wrapper's own names with the wrapped model's for introspection.
        return sorted(set(list(super().__dir__()) + dir(self.model)))

    def __setattr__(self, name: str, value: Any):
        # All state is installed in __init__ via object.__setattr__; anything
        # else would silently shadow the wrapped model's attributes.
        raise AttributeError("CPUOffloadWrapper is immutable")

    def __repr__(self) -> str:
        return f"CPUOffloadWrapper(is_cpu_offload={self.is_cpu_offload}, is_running_on_gpu={self.is_running_on_gpu}, model={repr(self.model)})"
inference/common/sequence_schema.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass
16
+ from enum import IntEnum
17
+
18
+ import torch
19
+
20
+
21
class Modality(IntEnum):
    """Integer tags identifying the modality of a token segment.

    Values are fixed (VIDEO=0, AUDIO=1, TEXT=2) because they are used as
    integer codes, not just symbolic names.
    """

    VIDEO = 0
    AUDIO = 1
    TEXT = 2
25
+
26
+
27
@dataclass
class VarlenHandler:
    """Metadata bundle for variable-length (packed) attention.

    Mirrors the cu_seqlens/max_seqlen arguments of varlen attention kernels.
    NOTE(review): assumes cu_seqlens_* are 1-D cumulative-length integer
    tensors of size (num_sequences + 1) -- confirm against callers.
    """

    cu_seqlens_q: torch.Tensor  # cumulative sequence lengths for queries
    cu_seqlens_k: torch.Tensor  # cumulative sequence lengths for keys
    max_seqlen_q: int  # maximum query sequence length
    max_seqlen_k: int  # maximum key sequence length
33
+
inference/infra/__init__.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+
17
+ from inference.common import parse_config
18
+ from inference.infra.distributed import get_dp_rank, initialize_distributed
19
+ from inference.utils import print_rank_0, set_random_seed
20
+
21
+
22
def initialize_infra():
    """Initialize the inference infrastructure: distributed env, config, and RNG seed.

    Requires CUDA. Order matters: the distributed environment is set up first,
    then the config is parsed, and finally the random seed is derived from it.
    """
    assert torch.cuda.is_available(), "Infra requires CUDA environment."

    # Initialize distributed environment
    initialize_distributed()

    # Initialize config
    config = parse_config(verbose=True)

    # Initialize random seed
    # Offset by 10 * dp_rank so each data-parallel rank seeds differently.
    set_random_seed(config.engine_config.seed + 10 * get_dp_rank())

    print_rank_0("Infra successfully initialized")
35
+
36
+
37
+ __all__ = ["initialize_infra"]
inference/infra/checkpoint/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .load_model_checkpoint import load_model_checkpoint
16
+
17
+ __all__ = [
18
+ # checkpoint loader
19
+ "load_model_checkpoint",
20
+ ]
inference/infra/checkpoint/load_model_checkpoint.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import io
16
+ import json
17
+ import os
18
+ import subprocess
19
+ from concurrent.futures import ThreadPoolExecutor
20
+
21
+ from inference.common import EngineConfig
22
+ from inference.utils import print_rank_0
23
+ from safetensors.torch import load as load_from_bytes
24
+ from safetensors.torch import load_file
25
+ from tqdm.auto import tqdm
26
+
27
+
28
def _load_shard(shard_path, param_names, num_threads=None):
    """Load the requested parameters from a single safetensors shard.

    If a zstd-compressed sibling (``<shard_path>.zst``) exists, it is
    decompressed via the external ``zstd`` binary and loaded from memory;
    otherwise the plain shard file is loaded directly.

    Args:
        shard_path: Path to the ``.safetensors`` shard file.
        param_names: Names of the parameters to extract from this shard.
        num_threads: Optional decompression parallelism passed as ``zstd -T``.

    Returns:
        Dict mapping each requested parameter name to its tensor.

    Raises:
        RuntimeError: If zstd decompression exits with a non-zero status.
    """
    zstd_path = shard_path + ".zst"
    if os.path.exists(zstd_path):
        cmd = ["zstd", "-d"]
        if num_threads:
            cmd.extend(["-T", str(num_threads)])  # set parallelism

        process = subprocess.Popen(cmd + ["-c", zstd_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=-1)
        # communicate() drains stdout and stderr concurrently, avoiding the
        # pipe-buffer deadlock that sequentially reading stdout-then-stderr can
        # hit, and reads stdout to EOF in one shot (the previous manual
        # read-loop after a full .read() was dead code).
        decompressed_data, stderr_data = process.communicate()

        if process.returncode != 0:
            raise RuntimeError(f"Decompression failed: {stderr_data.decode()}")

        # Feed the bytes straight to safetensors; no BytesIO round-trip needed.
        weights = load_from_bytes(decompressed_data)
    else:
        weights = load_file(shard_path)

    return {name: weights[name] for name in param_names}
56
+
57
+
58
def load_sharded_safetensors_parallel_with_progress(checkpoint_dir):
    """Load a (possibly sharded) safetensors checkpoint into one state dict.

    If ``model.safetensors.index.json`` is absent, the checkpoint is assumed to
    be a single ``model.safetensors`` file. Otherwise shards are loaded in
    parallel threads with a progress bar.

    Args:
        checkpoint_dir: Directory containing the checkpoint files.

    Returns:
        Dict mapping parameter names to tensors.
    """
    index_path = os.path.join(checkpoint_dir, "model.safetensors.index.json")
    if not os.path.exists(index_path):
        # Single-file checkpoint: no index, load the lone shard directly.
        return load_file(os.path.join(checkpoint_dir, "model.safetensors"))

    with open(index_path, "r") as f:
        index = json.load(f)

    # Group parameters by the shard file that stores them.
    shard_map = {}
    for param_name, shard_file in index["weight_map"].items():
        shard_path = os.path.join(checkpoint_dir, shard_file)
        shard_map.setdefault(shard_path, []).append(param_name)

    # Load shards in parallel with a progress bar.
    state_dict = {}
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(_load_shard, shard_path, param_names) for shard_path, param_names in shard_map.items()]
        for future in tqdm(futures, desc="Loading shards", total=len(futures)):
            state_dict.update(future.result())

    return state_dict
89
+
90
+
91
def load_model_checkpoint(model, engine_config: EngineConfig):
    """Load safetensors weights from ``engine_config.load`` into *model*.

    Loading is non-strict: missing and unexpected keys are logged, not fatal.

    Returns:
        The same model instance, with weights loaded.
    """
    print_rank_0("Loading checkpoint with safetensors format from pretrained_folder")
    checkpoint_state = load_sharded_safetensors_parallel_with_progress(engine_config.load)

    missing_keys, unexpected_keys = model.load_state_dict(checkpoint_state, strict=False)

    print_rank_0(f"Load Weight Missing Keys: {missing_keys}")
    print_rank_0(f"Load Weight Unexpected Keys: {unexpected_keys}")
    print_rank_0("Load checkpoint successfully")
    return model
inference/infra/distributed/__init__.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .parallel_state import get_cp_group, get_cp_rank, get_cp_world_size, get_dp_rank, get_pp_rank, get_tp_rank
16
+ from .init_dist_env import initialize_distributed
17
+
18
+ __all__ = [
19
+ # distributed init
20
+ "initialize_distributed",
21
+ # parallel state
22
+ "get_cp_group",
23
+ "get_cp_world_size",
24
+ "get_tp_rank",
25
+ "get_pp_rank",
26
+ "get_dp_rank",
27
+ "get_cp_rank",
28
+ ]
inference/infra/distributed/init_dist_env.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ from datetime import timedelta
17
+
18
+ import torch
19
+
20
+ from inference.common import parse_config
21
+
22
+ from .parallel_state import initialize_model_parallel, model_parallel_is_initialized
23
+ from inference.utils import print_rank_0
24
+
25
+
26
def initialize_distributed():
    """Initialize torch.distributed and core model parallel."""
    config = parse_config()

    device_count = torch.cuda.device_count()
    if torch.distributed.is_initialized():
        # NOTE(review): print_rank_0 presumably already restricts output to
        # rank 0, which would make this explicit rank check redundant -- confirm.
        if torch.distributed.get_rank() == 0:
            print_rank_0("> torch distributed already initialized, skipping initialization ...")
    else:
        # Rank/world size come from the launcher's environment (torchrun-style).
        rank = int(os.getenv("RANK", "0"))
        world_size = int(os.getenv("WORLD_SIZE", "1"))
        if rank == 0:
            print_rank_0("> initializing torch distributed ...")
        # Manually set the device ids.
        if device_count > 0:
            # Map each global rank onto a local GPU round-robin BEFORE
            # init_process_group so the backend binds the right device.
            device = rank % device_count
            torch.cuda.set_device(device)
        # Call the init process
        torch.distributed.init_process_group(
            backend=config.engine_config.distributed_backend,
            world_size=world_size,
            rank=rank,
            timeout=timedelta(minutes=config.engine_config.distributed_timeout_minutes),
        )

    # Set the tp, pp and dp communicators.
    if device_count > 0:
        # Idempotent: skip if the model-parallel groups were already built.
        if model_parallel_is_initialized():
            return
        initialize_model_parallel(
            tp_size=config.engine_config.tp_size,
            pp_size=config.engine_config.pp_size,
            cp_size=config.engine_config.cp_size,
            nccl_communicator_config_path=None,
            distributed_timeout_minutes=config.engine_config.distributed_timeout_minutes,
            order="tp-cp-pp-dp",
        )
inference/infra/distributed/parallel_state.py ADDED
@@ -0,0 +1,659 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Model and data parallel groups."""
17
+
18
+ import warnings
19
+ from datetime import timedelta
20
+ from typing import List, Optional
21
+
22
+ import torch
23
+
24
# Module-level process-group state. These are presumably populated by this
# module's initialize_model_parallel() (imported by init_dist_env) and read
# through the accessor functions -- confirm against the rest of this file.

# Intra-layer model parallel group that the current rank belongs to.
_TENSOR_MODEL_PARALLEL_GROUP = None
# Tensor parallel group information with context parallel combined.
_TENSOR_MODEL_PARALLEL_GROUP_WITH_CP = None
_TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP = None
# Inter-layer model parallel group that the current rank belongs to.
_PIPELINE_MODEL_PARALLEL_GROUP = None
# Model parallel group (both intra- and pipeline) that the current rank belongs to.
_MODEL_PARALLEL_GROUP = None
# Data parallel group that the current rank belongs to.
_DATA_PARALLEL_GROUP = None
# tensor model parallel group and data parallel group combined
# used for fp8 and moe training
_TENSOR_AND_DATA_PARALLEL_GROUP = None

# A list of global ranks for each pipeline group to ease calculation of the source
# rank when broadcasting from the first or last pipeline stage.
_PIPELINE_GLOBAL_RANKS = None

# A list of global ranks for each data parallel group to ease calculation of the source
# rank when broadcasting weights from src to all other data parallel ranks
_DATA_PARALLEL_GLOBAL_RANKS = None

# A list of global ranks for each tensor model parallel group to ease calculation of
# the first local rank in the tensor model parallel group
_TENSOR_MODEL_PARALLEL_GLOBAL_RANKS = None

# Context parallel group that the current rank belongs to
_CONTEXT_PARALLEL_GROUP = None
# A list of global ranks for each context parallel group to ease calculation of the
# destination rank when exchanging KV/dKV between context parallel_ranks
_CONTEXT_PARALLEL_GLOBAL_RANKS = None

_CONTEXT_PARALLEL_EXTRA_GROUP = None

# Data parallel group information with context parallel combined.
_DATA_PARALLEL_GROUP_WITH_CP = None
_DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = None

# combined parallel group of TP, DP, and CP used for fp8
_TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = None
65
+
66
+
67
+ def _get_nccl_options(pg_name, nccl_comm_cfgs):
68
+ """Set the NCCL process group options.
69
+
70
+ Args:
71
+ pg_name (str): process group name
72
+ nccl_comm_cfgs (dict): nccl communicator configurations
73
+
74
+ When an option (e.g., max_ctas) is not found in the config, use the NCCL default setting.
75
+ """
76
+ if pg_name in nccl_comm_cfgs:
77
+ nccl_options = torch.distributed.ProcessGroupNCCL.Options()
78
+ nccl_options.config.cga_cluster_size = nccl_comm_cfgs[pg_name].get("cga_cluster_size", 4)
79
+ nccl_options.config.max_ctas = nccl_comm_cfgs[pg_name].get("max_ctas", 32)
80
+ nccl_options.config.min_ctas = nccl_comm_cfgs[pg_name].get("min_ctas", 1)
81
+ return nccl_options
82
+ else:
83
+ return None
84
+
85
+
86
def generate_masked_orthogonal_rank_groups(world_size: int, parallel_size: List[int], mask: List[bool]) -> List[List[int]]:
    r"""Generate orthogonal parallel groups based on the parallel size and mask.

    Arguments:
        world_size (int): world size

        parallel_size (List[int]):
            The parallel size of each orthogonal parallel type. For example, if
            tensor_parallel_size = 2, pipeline_model_parallel_group = 3, data_parallel_size = 4,
            and the parallel mapping order is tp-pp-dp, then the parallel_size = [2, 3, 4].

        mask (List[bool]):
            The mask controls which parallel methods the generated groups represent. If mask[i] is
            True, it means the generated group contains the i-th parallelism method. For example,
            if parallel_size = [tp_size, pp_size, dp_size], and mask = [True, False , True], then
            the generated group is the `tp-dp` group, if the mask = [False, True, False], then the
            generated group is the `pp` group.

    Algorithm:
        For orthogonal parallelism, such as tp/dp/pp/cp, the global_rank and
        local_rank satisfy the following equation:
            global_rank = tp_rank + dp_rank * tp_size + pp_rank * tp_size * dp_size (1)
                tp_rank \in [0, tp_size)
                dp_rank \in [0, dp_size)
                pp_rank \in [0, pp_size)

        If we want to get the `dp_group` (tp_size * pp_size groups of dp_size ranks each.
        For example, if the gpu size is 8 and order is 'tp-pp-dp', size is '2-2-2', and the
        dp_group here is [[0, 4], [1, 5], [2, 6], [3, 7]].)
        The tp_rank and pp_rank will be combined to form the `dp_group_index`.
            dp_group_index = tp_rank + pp_rank * tp_size (2)

        So, given that tp_rank and pp_rank satisfy equation (2), and dp_rank in
        range(0, dp_size), the ranks in dp_group[dp_group_index] satisfy the
        equation (1).

        This function solves this math problem.

    For example, if the parallel_size = [tp_size, dp_size, pp_size] = [2, 3, 4],
    and the mask = [False, True, False]. Then,
        dp_group_index(0) = tp_rank(0) + pp_rank(0) * 2
        dp_group_index(1) = tp_rank(1) + pp_rank(0) * 2
        ...
        dp_group_index(7) = tp_rank(1) + pp_rank(3) * 2

        dp_group[0] = 0 + range(0, 3) * 2 + 0 = [0, 2, 4]
        dp_group[1] = 1 + range(0, 3) * 2 + 0 = [1, 3, 5]
        ...
        dp_group[7] = 1 + range(0, 3) * 2 + 3 * 2 * 3 = [19, 21, 23]
    """

    def prefix_product(a: List[int], init=1) -> List[int]:
        # Exclusive prefix product, e.g. [2, 3, 4] -> [1, 2, 6, 24].
        # The result is one element longer than the input; element i is the
        # stride of dimension i in the flattened rank index.
        r = [init]
        for v in a:
            init = init * v
            r.append(init)
        return r

    def inner_product(a: List[int], b: List[int]) -> int:
        # Dot product of two equally sized integer lists.
        return sum([x * y for x, y in zip(a, b)])

    def decompose(index, shape, stride=None):
        """
        This function solves the math problem below:
            There is an equation:
                index = sum(idx[i] * stride[i])
            And given the value of index, stride.
            Return the idx.
        This function is used to get the tp/dp/pp rank
        from group_index and rank_in_group.
        """
        if stride is None:
            stride = prefix_product(shape)
        idx = [(index // d) % s for s, d in zip(shape, stride)]
        # stride is a prefix_product result. And the value of stride[-1]
        # is not used.
        assert (
            sum([x * y for x, y in zip(idx, stride[:-1])]) == index
        ), "idx {} with shape {} mismatch the return idx {}".format(index, shape, idx)
        return idx

    # Split the dimensions into those inside a group (masked) and those that
    # enumerate the groups (unmasked); keep the matching global strides.
    masked_shape = [s for s, m in zip(parallel_size, mask) if m]
    unmasked_shape = [s for s, m in zip(parallel_size, mask) if not m]

    global_stride = prefix_product(parallel_size)
    masked_stride = [d for d, m in zip(global_stride, mask) if m]
    unmasked_stride = [d for d, m in zip(global_stride, mask) if not m]

    group_size = prefix_product(masked_shape)[-1]
    num_of_group = world_size // group_size

    ranks = []
    for group_index in range(num_of_group):
        # Get the per-dimension indices within the unmasked dims for this group_index.
        decomposed_group_idx = decompose(group_index, unmasked_shape)
        rank = []
        for rank_in_group in range(group_size):
            # Get the per-dimension indices within the masked dims for rank_in_group,
            # then recombine both halves with their global strides (equation (1)).
            decomposed_rank_idx = decompose(rank_in_group, masked_shape)
            rank.append(
                inner_product(decomposed_rank_idx, masked_stride) + inner_product(decomposed_group_idx, unmasked_stride)
            )
        ranks.append(rank)
    return ranks
190
+
191
+
192
class RankGenerator(object):
    """Enumerates global-rank groups for each parallelism dimension.

    Configured with the tp/dp/pp/cp sizes and an order string such as
    ``"tp-cp-pp-dp"`` describing the rank layout (fastest-varying first).
    Dimensions of size 1 may be omitted from the order; they are appended
    automatically.
    """

    def __init__(self, tp: int, dp: int, pp: int, cp: int, order: str) -> None:
        self.tp = tp
        self.dp = dp
        self.pp = pp
        self.cp = cp
        self.world_size = tp * dp * pp * cp

        self.name_to_size = {"tp": self.tp, "pp": self.pp, "dp": self.dp, "cp": self.cp}
        order = order.lower()
        for name in self.name_to_size.keys():
            if name not in order:
                if self.name_to_size[name] != 1:
                    # A dimension larger than 1 must be placed explicitly by the caller.
                    raise RuntimeError(
                        f"The size of ({name}) is ({self.name_to_size[name]}), but you haven't specified the order ({order})."
                    )
                # Size-1 dimensions are harmless anywhere; append them so every
                # dimension is represented in the order string.
                order = order + "-" + name

        self.order = order
        self.ordered_size = [self.name_to_size[token] for token in order.split("-")]

    def get_mask(self, order: str, token: str):
        """Return a boolean mask over *order*'s dims selecting those named in *token*.

        Both arguments are hyphen-separated dimension names, e.g.
        ``get_mask("tp-cp-pp-dp", "tp-dp") -> [True, False, False, True]``.
        """
        dims_in_order = order.split("-")
        mask = [False] * len(dims_in_order)
        for dim_name in token.split("-"):
            mask[dims_in_order.index(dim_name)] = True
        return mask

    def get_ranks(self, token):
        """Get rank group by input token.

        Arguments:
            token (str):
                Specify the ranks type that want to get. If we want
                to obtain multiple parallel types, we can use a hyphen
                '-' to separate them. For example, if we want to obtain
                the TP_DP group, the token should be 'tp-dp'.
        """
        selected = self.get_mask(self.order, token)
        return generate_masked_orthogonal_rank_groups(self.world_size, self.ordered_size, selected)
234
+
235
+
236
def initialize_model_parallel(
    tp_size: int = 1,
    pp_size: int = 1,
    cp_size: int = 1,
    nccl_communicator_config_path: Optional[str] = None,
    distributed_timeout_minutes: int = 30,
    order: str = "tp-cp-pp-dp",
) -> None:
    """Initialize model data parallel groups.
    Borrow from: https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py

    Args:
        tp_size (int, default = 1):
            The number of GPUs to split individual tensors across.

        pp_size (int, default = 1):
            The number of tensor parallel GPU groups to split the
            Transformer layers across. For example, if tp_size is 4 and
            pp_size is 2, the model will be split into 2 groups of 4 GPUs.

        cp_size (int, default = 1):
            The number of tensor parallel GPU groups to split the
            network input sequence length across. Compute of attention
            module requires tokens of full sequence length, so GPUs
            in a context parallel group need to communicate with each
            other to exchange information of other sequence chunks.
            Each GPU and its counterparts in other tensor parallel
            groups compose a context parallel group.

            For example, assume we have 8 GPUs, if tensor model parallel
            size is 4 and context parallel size is 2, the network input
            will be split into two sequence chunks, which are processed
            by 2 different groups of 4 GPUs. One chunk is processed by
            GPU0-3, the other chunk is processed by GPU4-7. Four groups
            are built to do context parallel communications: [GPU0, GPU4],
            [GPU1, GPU5], [GPU2, GPU6], and [GPU3, GPU7].

            Context parallelism partitions sequence length, so it has no
            impact on weights, which means weights are duplicated among
            GPUs in a context parallel group. Hence, weight gradients
            all-reduce is required in backward. For simplicity, we piggyback
            GPUs of context parallelism on data parallel group for
            weight gradient all-reduce.

        nccl_communicator_config_path (str, default = None):
            Path to the yaml file of NCCL communicator configurations.
            `min_ctas`, `max_ctas`, and `cga_cluster_size` can be set
            for each communicator.

        distributed_timeout_minutes (int, default = 30): Timeout, in
            minutes, for operations executed against distributed
            process groups. See PyTorch documentation at
            https://pytorch.org/docs/stable/distributed.html for
            caveats.

        order (str, default = tp-cp-pp-dp):
            The rank initialization order of parallelism (fastest-varying
            dimension first).

    Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
    the model pipeline. The present function will
    create 8 tensor model-parallel groups, 4 pipeline model-parallel groups
    and 8 data-parallel groups as:
        8 data_parallel groups:
            [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15]
        8 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15]
        4 pipeline model-parallel groups:
            [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.

    NOTE: ``torch.distributed.new_group`` is a collective — every rank
    executes every ``new_group`` call below, and each rank keeps only the
    group(s) it belongs to.
    """
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size: int = torch.distributed.get_world_size()
    if world_size % (tp_size * pp_size * cp_size) != 0:
        raise RuntimeError(
            f"world_size ({world_size}) is not divisible by tp_size "
            f"({tp_size}) x pp_size ({pp_size}) "
            f"x cp_size ({cp_size})"
        )

    nccl_comm_cfgs = {}
    if nccl_communicator_config_path is not None:
        try:
            import yaml
        except ImportError:
            raise RuntimeError("Cannot import `yaml`. Setting custom nccl communicator configs " "requires the yaml package.")

        with open(nccl_communicator_config_path, "r") as stream:
            nccl_comm_cfgs = yaml.safe_load(stream)

    # Data-parallel size is whatever remains after tp/pp/cp are carved out.
    dp_size: int = world_size // (tp_size * pp_size * cp_size)
    rank = torch.distributed.get_rank()
    rank_generator = RankGenerator(tp=tp_size, dp=dp_size, pp=pp_size, cp=cp_size, order=order)
    timeout = timedelta(minutes=distributed_timeout_minutes)

    # Build the data-parallel groups.
    global _DATA_PARALLEL_GROUP
    global _DATA_PARALLEL_GLOBAL_RANKS
    global _DATA_PARALLEL_GROUP_WITH_CP
    global _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP
    assert _DATA_PARALLEL_GROUP is None, "data parallel group is already initialized"

    for ranks in rank_generator.get_ranks("dp"):
        group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=_get_nccl_options("dp", nccl_comm_cfgs))
        if rank in ranks:
            _DATA_PARALLEL_GROUP = group
            _DATA_PARALLEL_GLOBAL_RANKS = ranks
    for ranks_with_cp in rank_generator.get_ranks("dp-cp"):
        group_with_cp = torch.distributed.new_group(
            ranks_with_cp, timeout=timeout, pg_options=_get_nccl_options("dp_cp", nccl_comm_cfgs)
        )
        if rank in ranks_with_cp:
            _DATA_PARALLEL_GROUP_WITH_CP = group_with_cp
            _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = ranks_with_cp

    # Build the context-parallel groups.
    global _CONTEXT_PARALLEL_GROUP
    global _CONTEXT_PARALLEL_GLOBAL_RANKS
    assert _CONTEXT_PARALLEL_GROUP is None, "context parallel group is already initialized"
    for ranks in rank_generator.get_ranks("cp"):
        group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=_get_nccl_options("cp", nccl_comm_cfgs))
        if rank in ranks:
            _CONTEXT_PARALLEL_GROUP = group
            _CONTEXT_PARALLEL_GLOBAL_RANKS = ranks

    # Build the model-parallel groups (tp x pp combined).
    global _MODEL_PARALLEL_GROUP
    assert _MODEL_PARALLEL_GROUP is None, "model parallel group is already initialized"
    for ranks in rank_generator.get_ranks("tp-pp"):
        group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=_get_nccl_options("mp", nccl_comm_cfgs))
        if rank in ranks:
            _MODEL_PARALLEL_GROUP = group

    # Build the tensor model-parallel groups.
    global _TENSOR_MODEL_PARALLEL_GROUP
    global _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS
    assert _TENSOR_MODEL_PARALLEL_GROUP is None, "tensor model parallel group is already initialized"
    for ranks in rank_generator.get_ranks("tp"):
        group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=_get_nccl_options("tp", nccl_comm_cfgs))
        if rank in ranks:
            _TENSOR_MODEL_PARALLEL_GROUP = group
            _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS = ranks

    # Build the tensor + context parallel groups.
    global _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP
    global _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP
    assert (
        _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP is None
    ), "tensor model parallel group with context parallel is already initialized"
    for ranks in rank_generator.get_ranks("tp-cp"):
        group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=_get_nccl_options("tp_cp", nccl_comm_cfgs))
        if rank in ranks:
            _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP = group
            _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP = ranks

    # Build the pipeline model-parallel groups
    global _PIPELINE_MODEL_PARALLEL_GROUP
    global _PIPELINE_GLOBAL_RANKS
    assert _PIPELINE_MODEL_PARALLEL_GROUP is None, "pipeline model parallel group is already initialized"
    for ranks in rank_generator.get_ranks("pp"):
        group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=_get_nccl_options("pp", nccl_comm_cfgs))
        if rank in ranks:
            _PIPELINE_MODEL_PARALLEL_GROUP = group
            _PIPELINE_GLOBAL_RANKS = ranks

    # Build the tensor + data parallel groups (with and without cp), used for fp8/moe.
    global _TENSOR_AND_DATA_PARALLEL_GROUP
    global _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP
    assert _TENSOR_AND_DATA_PARALLEL_GROUP is None, "Tensor + data parallel group is already initialized"
    for ranks in rank_generator.get_ranks("tp-cp-dp"):
        group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=_get_nccl_options("tp_cp_dp", nccl_comm_cfgs))
        if rank in ranks:
            _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = group
    for ranks in rank_generator.get_ranks("tp-dp"):
        group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=_get_nccl_options("tp_dp", nccl_comm_cfgs))
        if rank in ranks:
            _TENSOR_AND_DATA_PARALLEL_GROUP = group
420
+
421
+
422
def is_initialized():
    """Useful for code segments that may be accessed with or without mpu initialization"""
    initialized = _DATA_PARALLEL_GROUP is not None
    return initialized
425
+
426
+
427
def is_unitialized() -> bool:
    """Check whether the parallel state is still uninitialized.

    Deprecated (note the historical misspelling, kept for backward
    compatibility). Use ``is_initialized`` instead.
    """
    warnings.warn("is_unitialized is deprecated, use is_initialized instead", DeprecationWarning)
    return not is_initialized()
435
+
436
+
437
def model_parallel_is_initialized():
    """Check if model and data parallel groups are initialized."""
    required_groups = (
        _TENSOR_MODEL_PARALLEL_GROUP,
        _PIPELINE_MODEL_PARALLEL_GROUP,
        _DATA_PARALLEL_GROUP,
    )
    return all(group is not None for group in required_groups)
442
+
443
+
444
def get_model_parallel_group():
    """Return the combined (tensor + pipeline) model-parallel group of this rank."""
    group = _MODEL_PARALLEL_GROUP
    assert group is not None, "model parallel group is not initialized"
    return group
448
+
449
+
450
def get_tp_group(check_initialized=True, with_context_parallel=False):
    """Return the tensor-model-parallel group of the calling rank.

    Args:
        check_initialized: also assert the plain TP group exists up front.
        with_context_parallel: return the combined TP x CP group instead of
            the plain TP group.
    """
    if check_initialized:
        assert _TENSOR_MODEL_PARALLEL_GROUP is not None, "tensor model parallel group is not initialized"
    if not with_context_parallel:
        assert _TENSOR_MODEL_PARALLEL_GROUP is not None, "tensor model parallel group is not initialized"
        return _TENSOR_MODEL_PARALLEL_GROUP
    assert (
        _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP is not None
    ), "tensor model parallel group with context parallel combined is not initialized"
    return _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP
462
+
463
+
464
def get_pp_group():
    """Return the pipeline-model-parallel group of the calling rank."""
    group = _PIPELINE_MODEL_PARALLEL_GROUP
    assert group is not None, "pipeline_model parallel group is not initialized"
    return group
468
+
469
+
470
def get_dp_group(with_context_parallel=False):
    """Return the data-parallel group of the calling rank.

    When ``with_context_parallel`` is True, the combined DP x CP group
    (used e.g. for weight-gradient all-reduce) is returned instead.
    """
    if not with_context_parallel:
        assert _DATA_PARALLEL_GROUP is not None, "data parallel group is not initialized"
        return _DATA_PARALLEL_GROUP
    assert (
        _DATA_PARALLEL_GROUP_WITH_CP is not None
    ), "data parallel group with context parallel combined is not initialized"
    return _DATA_PARALLEL_GROUP_WITH_CP
480
+
481
+
482
def get_cp_group(check_initialized=True):
    """Return the context-parallel group of the calling rank.

    With ``check_initialized=False`` the (possibly None) group is returned
    without asserting.
    """
    group = _CONTEXT_PARALLEL_GROUP
    if check_initialized:
        assert group is not None, "context parallel group is not initialized"
    return group
487
+
488
+
489
def get_cp_extra_group(check_initialized=True):
    """Return the extra context-parallel group (may be None when the check is disabled)."""
    extra_group = _CONTEXT_PARALLEL_EXTRA_GROUP
    if check_initialized:
        assert extra_group is not None, "context parallel extra group is not initialized"
    return extra_group
493
+
494
+
495
def get_tp_world_size(with_context_parallel=False):
    """Return world size for the tensor model parallel group (or TP x CP group)."""
    group = get_tp_group(with_context_parallel=with_context_parallel)
    return torch.distributed.get_world_size(group=group)
498
+
499
+
500
def get_pp_world_size():
    """Return world size for the pipeline model parallel group (number of stages)."""
    pp_group = get_pp_group()
    return torch.distributed.get_world_size(group=pp_group)
503
+
504
+
505
def get_tp_rank(with_context_parallel=False):
    """Return this rank's index within the tensor model parallel group (or TP x CP group)."""
    group = get_tp_group(with_context_parallel=with_context_parallel)
    return torch.distributed.get_rank(group=group)
508
+
509
+
510
def get_pp_rank():
    """Return this rank's pipeline stage index."""
    pp_group = get_pp_group()
    return torch.distributed.get_rank(group=pp_group)
513
+
514
+
515
def is_pipeline_first_stage():
    """Return True if in the first pipeline model-parallel stage, False otherwise."""
    stage = get_pp_rank()
    return stage == 0
518
+
519
+
520
def is_pipeline_last_stage():
    """Return True if in the last pipeline model-parallel stage, False otherwise."""
    stage = get_pp_rank()
    return stage == get_pp_world_size() - 1
523
+
524
+
525
def get_tensor_model_parallel_src_rank(with_context_parallel=False):
    """Return the global rank of the first local rank in the caller's tensor
    model parallel group (or the combined TP x CP group when requested)."""
    assert _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS is not None, "Tensor model parallel group is not initialized"
    if not with_context_parallel:
        return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS[0]
    assert (
        _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP is not None
    ), "Tensor model parallel group with context parallel combined is not initialized"
    return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP[0]
536
+
537
+
538
def get_tensor_model_parallel_ranks(with_context_parallel=False):
    """Return all global ranks of the caller's tensor model parallel group
    (or the combined TP x CP group when requested)."""
    if not with_context_parallel:
        assert _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS is not None, "Tensor model parallel group is not initialized"
        return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS
    assert (
        _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP is not None
    ), "Tensor model parallel group with context parallel combined is not initialized"
    return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP
548
+
549
+
550
def get_tensor_model_parallel_last_rank(with_context_parallel=False):
    """Calculate the global rank corresponding to the last local rank
    in the tensor model parallel group (or the combined TP x CP group
    when ``with_context_parallel`` is True)."""
    # The plain TP rank list must exist even when the CP-combined variant is used.
    assert _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS is not None, "Tensor model parallel group is not initialized"
    if with_context_parallel:
        assert (
            _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP is not None
        ), "Tensor model parallel group with context parallel combined is not initialized"
        return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP[-1]
    else:
        return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS[-1]
561
+
562
+
563
def get_pipeline_model_parallel_first_rank():
    """Return the global rank of pipeline stage 0 in the caller's pipeline group."""
    pipeline_ranks = _PIPELINE_GLOBAL_RANKS
    assert pipeline_ranks is not None, "Pipeline parallel group is not initialized"
    return pipeline_ranks[0]
568
+
569
+
570
def get_pipeline_model_parallel_last_rank():
    """Return the global rank of the final pipeline stage in the caller's pipeline group."""
    pipeline_ranks = _PIPELINE_GLOBAL_RANKS
    assert pipeline_ranks is not None, "Pipeline parallel group is not initialized"
    return pipeline_ranks[get_pp_world_size() - 1]
576
+
577
+
578
def get_pipeline_model_parallel_next_rank():
    """Return the global rank of the following pipeline stage (wraps past the last)."""
    assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
    next_stage = (get_pp_rank() + 1) % get_pp_world_size()
    return _PIPELINE_GLOBAL_RANKS[next_stage]
584
+
585
+
586
def get_pipeline_model_parallel_prev_rank():
    """Return the global rank of the preceding pipeline stage (wraps before the first)."""
    assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized"
    prev_stage = (get_pp_rank() - 1) % get_pp_world_size()
    return _PIPELINE_GLOBAL_RANKS[prev_stage]
592
+
593
+
594
def get_dp_world_size(with_context_parallel=False):
    """World size of the data-parallel group (or DP x CP group).

    Returns 0 when torch.distributed is unavailable or not initialized.
    """
    if not (torch.distributed.is_available() and torch.distributed.is_initialized()):
        return 0
    return torch.distributed.get_world_size(group=get_dp_group(with_context_parallel=with_context_parallel))
600
+
601
+
602
def get_dp_rank(with_context_parallel=False):
    """This rank's index within the data-parallel group (or DP x CP group).

    Returns 0 when torch.distributed is unavailable or not initialized.
    """
    if not (torch.distributed.is_available() and torch.distributed.is_initialized()):
        return 0
    return torch.distributed.get_rank(group=get_dp_group(with_context_parallel=with_context_parallel))
608
+
609
+
610
def get_cp_world_size():
    """World size of the context-parallel group.

    Returns 0 when torch.distributed is unavailable or not initialized.
    """
    if not (torch.distributed.is_available() and torch.distributed.is_initialized()):
        return 0
    return torch.distributed.get_world_size(group=get_cp_group())
616
+
617
+
618
def get_cp_rank():
    """This rank's index within the context-parallel group.

    Returns 0 when torch.distributed is unavailable or not initialized.
    """
    if not (torch.distributed.is_available() and torch.distributed.is_initialized()):
        return 0
    return torch.distributed.get_rank(group=get_cp_group())
624
+
625
+
626
def destroy_model_parallel():
    """Set the groups to none.

    Resets every module-level process-group and rank-list global so the
    parallel state can be initialized again.
    """
    global _MODEL_PARALLEL_GROUP, _TENSOR_MODEL_PARALLEL_GROUP, _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP
    global _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP, _PIPELINE_MODEL_PARALLEL_GROUP, _DATA_PARALLEL_GROUP
    global _TENSOR_AND_DATA_PARALLEL_GROUP, _PIPELINE_GLOBAL_RANKS, _DATA_PARALLEL_GLOBAL_RANKS
    global _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS, _CONTEXT_PARALLEL_GROUP, _CONTEXT_PARALLEL_GLOBAL_RANKS
    global _CONTEXT_PARALLEL_EXTRA_GROUP, _DATA_PARALLEL_GROUP_WITH_CP, _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP
    global _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP

    _MODEL_PARALLEL_GROUP = None
    _TENSOR_MODEL_PARALLEL_GROUP = None
    _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP = None
    _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP = None
    _PIPELINE_MODEL_PARALLEL_GROUP = None
    _DATA_PARALLEL_GROUP = None
    _TENSOR_AND_DATA_PARALLEL_GROUP = None
    _PIPELINE_GLOBAL_RANKS = None
    _DATA_PARALLEL_GLOBAL_RANKS = None
    _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS = None
    _CONTEXT_PARALLEL_GROUP = None
    _CONTEXT_PARALLEL_GLOBAL_RANKS = None
    _CONTEXT_PARALLEL_EXTRA_GROUP = None
    _DATA_PARALLEL_GROUP_WITH_CP = None
    _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = None
    _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = None
inference/infra/distributed/utils.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+
17
+ from .parallel_state import get_tp_rank, get_tp_world_size
18
+
19
+
20
def is_last_rank():
    """Return True iff this process is the highest-numbered global rank."""
    world_size = torch.distributed.get_world_size()
    return torch.distributed.get_rank() == world_size - 1
22
+
23
+
24
def is_last_tp_cp_rank():
    """Return True iff this rank is the last one within its combined TP x CP group."""
    tp_cp_rank = get_tp_rank(with_context_parallel=True)
    tp_cp_size = get_tp_world_size(with_context_parallel=True)
    return tp_cp_rank == tp_cp_size - 1
26
+
27
+
28
def get_world_size():
    """Return the global world size, or 1 when torch.distributed is unavailable or uninitialized."""
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        return torch.distributed.get_world_size()
    return 1
34
+
35
+
36
def get_device(local_rank=None):
    """Return the ``torch.device`` matching the active torch.distributed backend.

    Args:
        local_rank: optional CUDA device index; only used with the NCCL backend.

    Returns:
        ``cuda`` / ``cuda:<local_rank>`` for the NCCL backend, ``cpu`` for gloo.

    Raises:
        RuntimeError: if the active backend is neither nccl nor gloo.
    """
    backend = torch.distributed.get_backend()
    if backend == "nccl":
        if local_rank is None:
            device = torch.device("cuda")
        else:
            device = torch.device(f"cuda:{local_rank}")
    elif backend == "gloo":
        device = torch.device("cpu")
    else:
        # Same exception type as before, but now with a diagnostic message
        # instead of a bare `raise RuntimeError`.
        raise RuntimeError(f"Unsupported torch.distributed backend: {backend!r}")
    return device
inference/infra/parallelism/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .ulysses_scheduler import ulysses_scheduler
16
+
17
+ __all__ = [
18
+ # context parallel
19
+ "ulysses_scheduler",
20
+ ]
inference/infra/parallelism/all_to_all_primitive.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List, Tuple, Union
16
+
17
+ import torch
18
+ import torch.distributed as dist
19
+ from einops import rearrange
20
+
21
+ from inference.utils import divide
22
+
23
+
24
class FakeHandle:
    """Drop-in stand-in for a ``torch.distributed`` async work handle.

    Returned by the all-to-all helpers when no communication actually takes
    place (``group`` is None or has a single rank), so callers can
    unconditionally call ``wait()``.
    """

    def wait(self):
        """No-op: there is no pending communication to wait for."""
        return None
30
+
31
+
32
def scatter_head_gather_seqlen(
    tensor: torch.Tensor, split_sizes: List[int] = None, group: dist.ProcessGroup = None, async_op: bool = True
) -> Union[torch.Tensor, Tuple[torch.Tensor, Union[dist.Work, FakeHandle]]]:
    """
    All-to-all that scatters the head dimension and gathers the sequence dimension:
        input:  (seq_len, cp * hn, hd)
        output: (seq_len * cp, hn, hd)
    NOTE: seq_len of input maybe not equal across ranks, which depends on split_sizes[rank]

    Args:
        tensor: local activation of shape (seq_len, cp * hn, hd); must be contiguous.
        split_sizes: per-rank output sequence lengths; defaults to equal splits
            of ``tensor.shape[0]``.
        group: process group to communicate over; when None or of size 1 no
            communication happens and the input tensor is returned unchanged.
        async_op: if True return ``(output, handle)`` — call ``handle.wait()``
            before using ``output``; if False return just ``output``.
    """
    if group is None or dist.get_world_size(group) == 1:
        # Degenerate case: nothing to communicate. Mirror the return shape of the
        # communicating path — a (tensor, handle) pair only when async_op is set.
        # (Previously this always returned a tuple, which was inconsistent with
        # the bare-tensor return of the async_op=False path below.)
        return (tensor, FakeHandle()) if async_op else tensor
    group_world_size = dist.get_world_size(group)
    if split_sizes is None:
        split_sizes = [tensor.shape[0]] * group_world_size

    _, hn, _ = tensor.shape
    # When there are fewer heads than ranks (and hn divides the group size),
    # replicate heads along dim 1 so every rank receives at least one head.
    if group_world_size % hn == 0 and group_world_size != hn:
        tensor = torch.repeat_interleave(tensor, repeats=divide(group_world_size, hn), dim=1).contiguous()
    assert tensor.is_contiguous()
    input_split_sizes = [tensor.shape[0]] * group_world_size
    # Pull the cp factor of the head dim to the front so that each contiguous
    # chunk of the buffer is the slice destined for one peer rank.
    input_tensor = rearrange(tensor, "seq (cp hn) hd -> (cp seq) hn hd", cp=group_world_size).contiguous()
    output = torch.empty([sum(split_sizes), *input_tensor.shape[1:]], device=input_tensor.device, dtype=input_tensor.dtype)
    if async_op:
        handle = dist.all_to_all_single(
            output,
            input_tensor,
            output_split_sizes=split_sizes,
            input_split_sizes=input_split_sizes,
            group=group,
            async_op=True,
        )
        return output, handle
    dist.all_to_all_single(
        output,
        input_tensor,
        output_split_sizes=split_sizes,
        input_split_sizes=input_split_sizes,
        group=group,
        async_op=False,
    )
    return output
64
+
65
+
66
def scatter_seqlen_gather_head(
    tensor: torch.Tensor, split_sizes: List[int] = None, group: dist.ProcessGroup = None, async_op: bool = True
) -> Union[torch.Tensor, Tuple[torch.Tensor, Union[dist.Work, FakeHandle]]]:
    """
    Scatter seq_len and gather head_number, for example:
        input: (seq_len * cp, hn, hd)
        output: (seq_len, cp * hn, hd)
    NOTE: seq_len of output maybe not equal, which depends on split_sizes[rank]
    NOTE: rearrange the tensor after communication: (cp, seq, hn, hd) -> (seq, cp * hn, hd)

    Args:
        tensor: Local shard shaped (seq_len * cp, hn, hd).
        split_sizes: Per-rank sequence lengths; defaults to an even split of dim 0.
        group: Process group; ``None`` or world size 1 skips communication.
        async_op: When True return ``(output, handle)``; otherwise return ``output``.
    """
    if group is None or dist.get_world_size(group) == 1:
        # BUGFIX: `return tensor, FakeHandle() if async_op else tensor` parsed
        # as `return tensor, (FakeHandle() if async_op else tensor)`, so
        # async_op=False yielded the tuple (tensor, tensor). Parenthesize the
        # intended tuple so the conditional applies to the whole return value.
        return (tensor, FakeHandle()) if async_op else tensor
    group_world_size = dist.get_world_size(group)
    if split_sizes is None:
        assert (
            tensor.shape[0] % group_world_size == 0
        ), f"tensor.shape[0] {tensor.shape[0]} % group_world_size {group_world_size} != 0"
        split_sizes = [tensor.shape[0] // group_world_size] * group_world_size
    assert tensor.is_contiguous()
    assert tensor.dim() == 3, f"tensor must be 3D, but got {tensor.dim()}D"
    output = torch.empty(
        [group_world_size * split_sizes[dist.get_rank(group)], *tensor.shape[1:]], device=tensor.device, dtype=tensor.dtype
    )
    output_split_sizes = [split_sizes[dist.get_rank(group)]] * group_world_size
    if async_op:
        handle = dist.all_to_all_single(
            output, tensor, output_split_sizes=output_split_sizes, input_split_sizes=split_sizes, group=group, async_op=True
        )
        return output, handle
    else:
        dist.all_to_all_single(
            output, tensor, output_split_sizes=output_split_sizes, input_split_sizes=split_sizes, group=group, async_op=False
        )
        return output
100
+
101
+
102
def batch_scatter_head_gather_seqlen(
    inputs: List[torch.Tensor], split_sizes: List[int] = None, group: dist.ProcessGroup = None
) -> List[torch.Tensor]:
    """
    Batch scatter head_number and gather seq_len, for example:
        inputs[i] input: (seq_len_i, cp * hn_i, hd)
        outputs[i] output: (seq_len_i * cp, hn_i, hd)
    NOTE: seq_len of inputs maybe not equal across ranks, which depends on split_sizes[rank]
    NOTE: fuse along head dim before communication, and split back after

    Args:
        inputs: Tensors to exchange, each shaped (seq_len, cp * hn_i, hd).
        split_sizes: Per-rank sequence lengths; defaults to equal lengths.
        group: Process group; ``None`` or world size 1 returns ``inputs`` unchanged.
    """
    if group is None or dist.get_world_size(group) == 1:
        return inputs
    rank = dist.get_rank(group)
    group_world_size = dist.get_world_size(group)
    if split_sizes is None:
        split_sizes = [inputs[0].shape[0]] * group_world_size
    assert all(
        inp.shape[0] == split_sizes[rank] for inp in inputs
    ), f"inputs[0].shape[0] {inputs[0].shape[0]} != split_sizes[rank] {split_sizes[rank]}"
    assert all(inp.dim() == 3 for inp in inputs), f"inputs[0].dim() {inputs[0].dim()} != 3"

    # BUGFIX: build the head-replicated/permuted tensors in a fresh list
    # instead of assigning back into `inputs`, which mutated the caller's
    # list in place.
    permuted = []
    for inp in inputs:
        _, hn, _ = inp.shape
        if group_world_size % hn == 0 and group_world_size != hn:
            # Fewer heads than ranks: replicate heads so every rank receives one.
            inp = torch.repeat_interleave(inp, repeats=divide(group_world_size, hn), dim=1)
        permuted.append(rearrange(inp, "seq (cp hn) hd -> (cp seq) hn hd", cp=group_world_size).contiguous())

    # Fuse along the head dim so all tensors share a single all-to-all.
    head_split_number = [p.shape[1] for p in permuted]
    fused_input = torch.cat(permuted, dim=1).contiguous()
    input_split_sizes = [fused_input.shape[0] // group_world_size] * group_world_size

    fused_output = torch.empty([sum(split_sizes), *fused_input.shape[1:]], device=fused_input.device, dtype=fused_input.dtype)
    dist.all_to_all_single(
        fused_output,
        fused_input,
        output_split_sizes=split_sizes,
        input_split_sizes=input_split_sizes,
        group=group,
        async_op=False,
    )
    # BUGFIX: torch.split returns a tuple; convert so the declared List
    # return type actually holds.
    return list(torch.split(fused_output, head_split_number, dim=1))
inference/infra/parallelism/gather_scatter_primitive.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from functools import partial
16
+ from typing import List, Union
17
+
18
+ import torch
19
+ import torch.distributed as dist
20
+ from torch.utils._pytree import tree_map
21
+
22
+
23
class Metadata:
    """Compact descriptor of one tensor's dtype, element count, rank, and shape.

    Used by the gather primitives in this module to describe tensors
    exchanged between ranks without shipping the tensor data itself.
    """

    def __init__(self, dtype: torch.dtype, numel: int, ndim: int, shape: List[int]):
        self.dtype = dtype  # element dtype of the described tensor
        self.numel = numel  # total number of elements
        self.ndim = ndim  # number of dimensions (== len(shape))
        self.shape = shape  # per-dimension sizes

    def __repr__(self):
        return f"Metadata(dtype={self.dtype}, numel={self.numel}, ndim={self.ndim}, shape={self.shape})"
32
+
33
+
34
def _gather_metadata(tensor_list: List[torch.Tensor], group: dist.ProcessGroup) -> List[List[Metadata]]:
    """All-gather per-tensor metadata (dtype, numel, shape) from every rank in ``group``.

    Each tensor is encoded as an int32 record ``[dtype_code, numel, ndim, *shape]``;
    the records are concatenated per rank, exchanged with two all_gathers
    (lengths first, then the variable-length payloads), and decoded back into
    ``Metadata`` objects.

    Returns:
        One ``List[Metadata]`` per rank, in rank order within ``group``.
    """
    # NOTE: the original dead `dist.get_rank(group)` statement was removed --
    # its result was never used.
    world_size = dist.get_world_size(group)

    # Sanity-check that this process is bound to the expected CUDA device.
    local_rank = torch.distributed.get_rank() % torch.cuda.device_count()
    assert (
        local_rank == torch.cuda.current_device()
    ), f"local_rank {local_rank} != current_device {torch.cuda.current_device()}"
    device = tensor_list[0].device if len(tensor_list) > 0 else torch.device("cuda")

    # ========== Step 1: flatten local tensor list ==========

    # Metadata: [dtype_code, numel, ndim, *shape]
    local_metadata = []

    dtype_map = {torch.float32: 0, torch.float16: 1, torch.bfloat16: 2, torch.int32: 3, torch.int64: 4, torch.uint8: 5}
    reverse_dtype_map = {v: k for k, v in dtype_map.items()}

    for t in tensor_list:
        dtype_code = dtype_map[t.dtype]
        shape = list(t.shape)
        numel = t.numel()
        local_metadata.append(torch.tensor([dtype_code, numel, len(shape)] + shape, dtype=torch.int32, device=device))

    if local_metadata:
        local_metadata_tensor = torch.cat(local_metadata)
    else:
        # This rank contributes no tensors; send an empty payload (no padding).
        local_metadata_tensor = torch.empty(0, dtype=torch.int32, device=device)
    local_metadata_tensor = local_metadata_tensor.contiguous()
    local_metadata_len = torch.tensor([local_metadata_tensor.numel()], dtype=torch.int32, device=device)

    # ========== Step 2: all_gather metadata lengths ==========
    metadata_lens = [torch.empty_like(local_metadata_len) for _ in range(world_size)]
    dist.all_gather(metadata_lens, local_metadata_len, group)

    # ========== Step 3: all_gather variable-length metadata payloads ==========
    metadata_lists = [torch.empty(m.item(), dtype=torch.int32, device=device) for m in metadata_lens]
    dist.all_gather(metadata_lists, local_metadata_tensor, group)

    # ========== Step 4: decode metadata and reconstruct tensor list ==========
    result = []
    for metadata_list in metadata_lists:
        offset = 0
        # Fresh name per rank: avoids shadowing the `local_metadata` send
        # buffer built in Step 1.
        rank_metadata = []
        while offset < metadata_list.numel():
            dtype_code = metadata_list[offset].item()
            numel = metadata_list[offset + 1].item()
            ndim = metadata_list[offset + 2].item()
            shape = metadata_list[offset + 3 : offset + 3 + ndim].tolist()
            offset += 3 + ndim

            rank_metadata.append(Metadata(reverse_dtype_map[dtype_code], numel, ndim, shape))
        result.append(rank_metadata)

    return result
89
+
90
+
91
def _get_dtype_and_assert_consistency(metadata_lists: List[List[Metadata]]):
    """Return the single dtype shared by all gathered tensors, asserting uniqueness."""
    dtype_set = {meta.dtype for metadata_list in metadata_lists for meta in metadata_list}
    assert len(dtype_set) == 1, f"Metadata lists are not consistent: {dtype_set}"
    return dtype_set.pop()
98
+
99
+
100
def _get_numel_for_each_rank(metadata_lists: List[List[Metadata]]) -> List[int]:
    """Total element count contributed by each rank (one sum per rank, in rank order)."""
    totals = []
    for metadata_list in metadata_lists:
        totals.append(sum(meta.numel for meta in metadata_list))
    return totals
102
+
103
+
104
def gather_arbitrary_tensor_list(tensor_list: List[torch.Tensor], group: dist.ProcessGroup) -> List[torch.Tensor]:
    """
    Magic gather primitive. Provide the following features:
    1. Support tensor list with different length for each rank.
    2. Support arbitrary Tensor, which means the Tensor can have different shapes but same dtype.
    3. Support empty tensor_list in some ranks without padding.

    Args:
        tensor_list: A list of tensors to gather.
        group: The process group to use.

    Returns:
        A flat list containing every rank's tensors, in rank order.
    """
    # NOTE: the original dead `dist.get_rank(group)` statement was removed --
    # its result was never used.
    world_size = dist.get_world_size(group)

    # Sanity-check that this process is bound to the expected CUDA device.
    local_rank = torch.distributed.get_rank() % torch.cuda.device_count()
    assert (
        local_rank == torch.cuda.current_device()
    ), f"local_rank {local_rank} != current_device {torch.cuda.current_device()}"
    device = tensor_list[0].device if len(tensor_list) > 0 else torch.device("cuda")

    # Step 1: Gather metadata
    metadata_lists = _gather_metadata(tensor_list, group)
    tensor_dtype = _get_dtype_and_assert_consistency(metadata_lists)

    # Step 2: Flatten local tensors into a single 1D buffer
    if tensor_list:
        flat_tensor = torch.cat([t.flatten() for t in tensor_list], dim=0).contiguous()
    else:
        flat_tensor = torch.empty(0, dtype=tensor_dtype, device=device)  # dummy, will be ignored

    # Step 3: Gather lengths from metadata
    all_numels_int = _get_numel_for_each_rank(metadata_lists)

    # Step 4: Allocate buffers and gather flat tensor data
    output_flat_tensors = [torch.empty(numel, dtype=tensor_dtype, device=device) for numel in all_numels_int]
    dist.all_gather(output_flat_tensors, flat_tensor, group)

    # Step 5: Reconstruct individual tensors using metadata
    gathered_tensors = []
    for rank in range(world_size):
        flat = output_flat_tensors[rank]
        offset = 0
        # BUGFIX: do not skip ranks whose flat buffer has zero elements -- a
        # rank may legitimately contribute zero-element tensors, which the old
        # `if flat.numel() == 0: continue` guard silently dropped. When the
        # rank's metadata list is empty this loop is a no-op anyway.
        for meta in metadata_lists[rank]:
            t = flat[offset : offset + meta.numel].view(meta.shape).to(meta.dtype)
            offset += meta.numel
            gathered_tensors.append(t)

    return gathered_tensors
162
+
163
+
164
def _scatter_to_context_parallel_region(input: torch.Tensor, split_sizes: List[int], group: dist.ProcessGroup = None):
    """Keep only this rank's dim-0 slice of ``input``, as given by ``split_sizes``."""
    rank = dist.get_rank(group)
    # This rank's slice starts after the shards owned by all lower ranks.
    start = sum(split_sizes[:rank])
    end = start + split_sizes[rank]
    return input[start:end].contiguous()
172
+
173
+
174
+ def scatter_to_context_parallel_region(
175
+ inputs: Union[torch.Tensor, List[torch.Tensor]], split_sizes: List[int] = None, group: dist.ProcessGroup = None
176
+ ):
177
+ """Split the tensor along its first dimension and keep the
178
+ corresponding slice."""
179
+ if group is None or torch.distributed.get_world_size(group) == 1:
180
+ return inputs
181
+
182
+ if split_sizes is None:
183
+ assert (
184
+ inputs.shape[0] % dist.get_world_size(group) == 0
185
+ ), f"inputs.shape[0] {inputs.shape[0]} % dist.get_world_size(group) {dist.get_world_size(group)} != 0"
186
+ split_sizes = [inputs.shape[0] // dist.get_world_size(group)] * dist.get_world_size(group)
187
+
188
+ partial_func = partial(_scatter_to_context_parallel_region, split_sizes=split_sizes, group=group)
189
+ return tree_map(partial_func, inputs)
190
+
191
+
192
def _gather_from_context_parallel_region(
    input: Union[torch.Tensor, List[torch.Tensor]], split_sizes: List[int], group: dist.ProcessGroup = None
):
    """All-gather dim-0 shards of ``input`` from every rank and return the full tensor.

    ``split_sizes[r]`` is the dim-0 length contributed by rank ``r``.
    """
    input = input.contiguous()
    dim_size = list(input.size())
    dim_size[0] = sum(split_sizes)

    # Allocate the full result once and all_gather directly into contiguous
    # per-rank views of it; the views share storage with `output`, so the
    # buffer is fully populated by the collective itself.
    output = torch.empty(dim_size, dtype=input.dtype, device=input.device)
    outputs = list(torch.split(output, split_sizes, dim=0))
    torch.distributed.all_gather(outputs, input, group=group)

    # PERF: the previous `torch.concat(outputs)` re-copied data the all_gather
    # above had already written in place; return the filled buffer directly.
    return output
205
+
206
+
207
+ def gather_from_context_parallel_region(
208
+ inputs: Union[torch.Tensor, List[torch.Tensor]], split_sizes: List[int] = None, group: dist.ProcessGroup = None
209
+ ):
210
+ """Gather tensors and concatinate along the first dimension."""
211
+ if group is None or torch.distributed.get_world_size(group) == 1:
212
+ return inputs
213
+
214
+ if split_sizes is None:
215
+ split_sizes = [inputs.shape[0] * dist.get_world_size(group)]
216
+ partial_func = partial(_gather_from_context_parallel_region, split_sizes=split_sizes, group=group)
217
+ return tree_map(partial_func, inputs)
inference/infra/parallelism/ulysses_scheduler.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Generic, List, Optional, TypeVar
16
+
17
+ import torch
18
+ from torch.utils._pytree import tree_map
19
+
20
+ from inference.infra.distributed import get_cp_group, get_cp_world_size
21
+
22
+ from .gather_scatter_primitive import gather_from_context_parallel_region, scatter_to_context_parallel_region
23
+
24
T = TypeVar("T")


class UlyssesScheduler(Generic[T]):
    """Ulysses-style context-parallel dispatcher.

    Splits tensors along the sequence (first) dimension across the context
    parallel group on entry, and gathers/concatenates them back on exit.
    Works on arbitrary nested pytrees of tensors; all tensors dispatched in
    one round must share the same sequence length, and the computed split
    sizes are checked for consistency across calls within a round.
    """

    def __init__(self):
        """Initialize with no active dispatch round."""
        # Per-rank sequence split sizes of the in-flight round; None when idle.
        self._cp_split_sizes: Optional[List[int]] = None

    @property
    def cp_split_sizes(self):
        """Split sizes of the active dispatch round (None when idle)."""
        return self._cp_split_sizes

    def _dispatch(self, x: torch.Tensor) -> torch.Tensor:
        """Split one tensor along dim 0 across the CP group, as evenly as possible.

        The first ``seq_len % world_size`` ranks receive one extra token each.
        Raises AssertionError if the computed sizes differ from an earlier
        call in the same round (inconsistent sequence lengths).
        """
        world = get_cp_world_size()
        base, extra = divmod(x.shape[0], world)
        cp_split_sizes = [base + 1] * extra + [base] * (world - extra)
        if self._cp_split_sizes is not None:
            assert (
                self._cp_split_sizes == cp_split_sizes
            ), f"cp_split_sizes changed from {self._cp_split_sizes} to {cp_split_sizes}"
        self._cp_split_sizes = cp_split_sizes
        return scatter_to_context_parallel_region(x, cp_split_sizes, group=get_cp_group())

    def _undispatch(self, x: torch.Tensor) -> torch.Tensor:
        """Gather one tensor's shards from the CP group back into the full sequence."""
        return gather_from_context_parallel_region(x, self._cp_split_sizes, group=get_cp_group())

    def dispatch(self, tensors: T) -> T:
        """Dispatch every tensor leaf of a nested structure to the CP region.

        The structure of ``tensors`` is preserved; only tensor leaves are split.
        """
        return tree_map(self._dispatch, tensors)

    def undispatch(self, tensors: T) -> T:
        """Gather every tensor leaf back from the CP region and end the round.

        Resets the recorded split sizes so the next dispatch starts fresh.
        """
        gathered = tree_map(self._undispatch, tensors)
        self._cp_split_sizes = None
        return gathered


_ULYSSES_SCHEDULER = UlyssesScheduler()


def ulysses_scheduler() -> UlyssesScheduler:
    """Return the process-wide UlyssesScheduler singleton."""
    assert _ULYSSES_SCHEDULER is not None, "ulysses scheduler is not initialized"
    return _ULYSSES_SCHEDULER
inference/model/dit/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .dit_model import get_dit
16
+ from .dit_module import DiTModel
17
+
18
+ __all__ = ["DiTModel", "get_dit"]
inference/model/dit/dit_model.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import gc
16
+
17
+ import torch
18
+ from inference.infra.checkpoint import load_model_checkpoint
19
+ from inference.infra.distributed import get_cp_rank, get_pp_rank, get_tp_rank
20
+ from inference.utils import print_mem_info_rank_0, print_model_size, print_rank_0
21
+
22
+ from .dit_module import DiTModel
23
+
24
+
25
def get_dit(model_config, engine_config):
    """Build the DiT model, load its checkpoint, and prepare it for inference.

    Constructs the model on the host, loads weights per ``engine_config``,
    moves it to the current CUDA device in eval mode, and reclaims any
    memory left over from checkpoint loading.
    """
    model = DiTModel(model_config=model_config)

    print_rank_0("Build dit model successfully")
    print_rank_0(model)
    rank_prefix = f"(tp, cp, pp) rank ({get_tp_rank()}, {get_cp_rank()}, {get_pp_rank()}): "
    print_model_size(model, prefix=rank_prefix, print_func=print_rank_0)

    model = load_model_checkpoint(model, engine_config)
    model.cuda(torch.cuda.current_device())
    model.eval()
    print_mem_info_rank_0("Load model successfully")

    # Reclaim host and device memory released during checkpoint loading.
    gc.collect()
    torch.cuda.empty_cache()
    return model
inference/model/dit/dit_module.py ADDED
@@ -0,0 +1,950 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import importlib
16
+ from dataclasses import dataclass
17
+ from enum import Enum
18
+ from typing import Any, Callable, List, Optional, Tuple
19
+
20
+ import torch
21
+ import torch.nn as nn
22
+ from einops import rearrange, repeat
23
+ from inference.common import Modality, VarlenHandler, is_hopper_arch
24
+ from inference.infra.parallelism import ulysses_scheduler
25
+ from magi_compiler import magi_compile
26
+ from magi_compiler.api import magi_register_custom_op
27
+ from magi_compiler.config import CompileConfig
28
+ from torch import Tensor
29
+ from torch.nn import Parameter
30
+
31
+
32
@dataclass
class FFAHandler:
    """Precomputed argument bundle for a variable-length attention kernel.

    NOTE(review): field semantics are inferred from names and typical varlen
    attention APIs -- confirm against the kernel that consumes this handler.
    """

    # Per-segment query index ranges (presumably [start, end) pairs -- verify).
    q_ranges: torch.Tensor
    # Per-segment key index ranges.
    k_ranges: torch.Tensor
    # Maximum query segment length across the batch.
    max_seqlen_q: int
    # Maximum key segment length across the batch.
    max_seqlen_k: int
    # Per-segment attention-type selector (e.g. causal vs. full -- confirm).
    attn_type_map: torch.Tensor
    # Scale applied inside the attention softmax.
    softmax_scale: float
40
+
41
+
42
# Define the MLP activation type
class MLPActivationType(Enum):
    """Closed set of activation functions supported by the MLP block."""

    SWIGLU7 = "swiglu7"
    GELU7 = "gelu7"


def swiglu7(x, alpha: float = 1.702, limit: float = 7.0, out_dtype: Optional[torch.dtype] = None):
    """Clamped SwiGLU over interleaved (gate, linear) channel pairs.

    Even channels are the gate, odd channels the linear half; computation
    runs in float32 and is cast back to ``out_dtype`` (input dtype by default).
    """
    if out_dtype is None:
        out_dtype = x.dtype
    xf = x.to(torch.float32)
    gate, linear = xf[..., ::2], xf[..., 1::2]
    # Clamp: gate from above only, linear symmetrically.
    gate = gate.clamp(max=limit)
    linear = linear.clamp(min=-limit, max=limit)
    gated = gate * torch.sigmoid(alpha * gate)
    # Note we add an extra bias of 1 to the linear layer (from GPT-OSS)
    return (gated * (linear + 1)).to(out_dtype)


def gelu7(x, alpha: float = 1.702, limit: float = 7.0, out_dtype: Optional[torch.dtype] = None):
    """Clamped sigmoid-approximated GELU, computed in float32.

    Input is clamped from above at ``limit`` before the x * sigmoid(alpha*x)
    gating; result is cast back to ``out_dtype`` (input dtype by default).
    """
    if out_dtype is None:
        out_dtype = x.dtype
    xf = x.to(torch.float32).clamp(max=limit)
    return (xf * torch.sigmoid(alpha * xf)).to(out_dtype)


def create_activation_func(activation_type: MLPActivationType) -> Callable:
    """Map an ``MLPActivationType`` to its callable implementation."""
    if activation_type is MLPActivationType.SWIGLU7:
        return swiglu7
    if activation_type is MLPActivationType.GELU7:
        return gelu7
    raise ValueError(f"Unknown activation type: {activation_type}")
81
+
82
+
83
class ModalityDispatcher:
    """Groups token rows by modality so each modality is one contiguous chunk.

    On construction the row-to-modality mapping is sorted once. ``permute``
    moves rows into modality-grouped order, ``dispatch`` splits them into one
    tensor per modality, ``undispatch`` concatenates them back, and
    ``inv_permute`` restores the original row order.
    """

    permuted_modality_mapping: torch.Tensor
    group_size: torch.Tensor
    group_size_cpu: list[int]
    num_modalities: int

    def __init__(self, modality_mapping: torch.Tensor, num_modalities: int):
        """Precompute permutations and per-modality group sizes (runs once)."""
        self.modality_mapping = modality_mapping
        self.num_modalities = num_modalities

        self.permuted_modality_mapping = self._precompute_permute_mapping(modality_mapping)

        # Row count of each modality group, on-device and as a plain list
        # (the list form is needed for torch.split in dispatch()).
        self.group_size = torch.bincount(self.permuted_modality_mapping, minlength=num_modalities).to(torch.int32)
        self.group_size_cpu: list[int] = [int(v) for v in self.group_size.to("cpu").tolist()]

    def _precompute_permute_mapping(self, modality_mapping):
        """Compute forward/inverse permutations; returns the sorted modality ids."""
        # argsort is O(N log N); the inverse permutation is argsort of the forward one.
        self.permute_mapping = torch.argsort(modality_mapping)
        self.inv_permute_mapping = torch.argsort(self.permute_mapping)
        return modality_mapping[self.permute_mapping]

    def dispatch(self, x: torch.Tensor) -> list[torch.Tensor]:
        """Split an already-permuted tensor into one chunk per modality (dim 0)."""
        return list(torch.split(x, self.group_size_cpu, dim=0))

    def undispatch(self, *processed_groups: list[torch.Tensor]) -> torch.Tensor:
        """Concatenate per-modality chunks back into one grouped tensor (dim 0)."""
        return torch.cat(processed_groups, dim=0)

    @staticmethod
    def permute(x: torch.Tensor, permute_mapping: torch.Tensor) -> torch.Tensor:
        """Reorder rows into modality-grouped order."""
        return x[permute_mapping]

    @staticmethod
    def inv_permute(x: torch.Tensor, inv_permute_mapping: torch.Tensor) -> torch.Tensor:
        """Restore rows to their original (pre-permute) order."""
        return x[inv_permute_mapping]
130
+
131
+
132
def freq_bands(
    num_bands: int, temperature: float = 10000.0, step: int = 2, device: Optional[torch.device] = None
) -> torch.Tensor:
    """Inverse-frequency bands: 1 / temperature**(k / num_bands) for k = 0, step, 2*step, ..."""
    exponents = torch.arange(0, num_bands, step, dtype=torch.int64, device=device).to(torch.float32) / num_bands
    return torch.reciprocal(temperature**exponents)
138
+
139
+
140
def rotate_half(x, interleaved=False):
    """Rotate channel pairs (x1, x2) -> (-x2, x1).

    Pairs are either the two contiguous halves of the last dim
    (``interleaved=False``) or even/odd interleaved lanes (``interleaved=True``).
    """
    if interleaved:
        even, odd = x[..., ::2], x[..., 1::2]
        # Stack pairwise then flatten: equivalent to einops
        # rearrange(stack((-odd, even), -1), "... d two -> ... (d two)").
        return torch.stack((-odd, even), dim=-1).flatten(start_dim=-2)
    first, second = x.chunk(2, dim=-1)
    return torch.cat((-second, first), dim=-1)


def apply_rotary_emb_torch(x, cos, sin, interleaved=False):
    """
    x: (batch_size, seqlen, nheads, headdim)
    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
    """
    ro_dim = cos.shape[-1] * 2
    assert ro_dim <= x.shape[-1]

    def _expand(t):
        # Duplicate each frequency to cover both lanes of a rotary pair, then
        # add a broadcast axis for heads. Torch equivalent of einops
        # "... d -> ... 1 (2 d)" (halves) / "... d -> ... 1 (d 2)" (interleaved).
        doubled = torch.repeat_interleave(t, 2, dim=-1) if interleaved else torch.cat((t, t), dim=-1)
        return doubled.unsqueeze(-2)

    cos_e, sin_e = _expand(cos), _expand(sin)
    rotary, passthrough = x[..., :ro_dim], x[..., ro_dim:]
    rotated = rotary * cos_e + rotate_half(rotary, interleaved) * sin_e
    return torch.cat([rotated, passthrough], dim=-1)
159
+
160
+
161
class ElementWiseFourierEmbed(nn.Module):
    """Element-wise Fourier embedding over per-token (t, h, w) coordinates,
    rescaled to a reference resolution, used as the RoPE table."""

    def __init__(
        self,
        dim: int,
        max_res: int = 224,
        temperature: float = 10000.0,
        in_pixels: bool = True,
        linear_bands: bool = False,
        learnable: bool = False,
        device: torch.device = torch.device("cpu"),
        dtype: torch.dtype = torch.float32,
    ):
        """
        Args:
            dim: Output feature dimension, total channels, must be divisible by 6
            max_res: Max pixel-frequency resolution for pixel-domain bands
            temperature: Temperature in inverse-frequency mode
            in_pixels: True -> pixel-frequency bands, False -> inverse-frequency bands
            linear_bands: Whether pixel-frequency bands are linearly spaced
            learnable: Whether frequency bands are trainable

        NOTE(review): get_default_bands() allocates ``dim // 8`` bands, so
        forward() actually emits 6 * (dim // 8) channels — confirm the
        "divisible by 6" wording and the intended channel count against callers.
        """
        super().__init__()
        self.dim = dim
        self.in_pixels = in_pixels
        self.learnable = learnable
        self.temperature = temperature
        self.max_res = max_res
        self.linear_bands = linear_bands
        self.device = device
        self.dtype = dtype
        # Make frequency bands trainable or register as buffer
        bands = self.get_default_bands()
        if self.learnable:
            self.bands = nn.Parameter(bands)
        else:
            self.register_buffer("bands", bands)

    def forward(self, coords: torch.Tensor) -> torch.Tensor:
        """
        Args:
            coords: [L,9], column order (time, row, col, T, H, W, ref_T, ref_H, ref_W)
        Returns:
            emb: [L, 6 * (dim // 8)] element-wise Fourier embedding,
                 laid out as [sin(t,h,w bands) | cos(t,h,w bands)]
        """
        # Use slicing instead of unbind + stack to reduce intermediates
        coords_xyz = coords[:, :3]  # [L,3] -> (t, h, w)
        sizes = coords[:, 3:6]  # [L,3] -> (T, H, W)
        refs = coords[:, 6:9]  # [L,3] -> (ref_T, ref_H, ref_W)

        # Compute scale factors mapping each axis onto its reference resolution
        scales = (refs - 1) / (sizes - 1)  # [L,3]

        # NOTE: if both ref and size are 1, scale is fixed to 1; otherwise invalid
        scales[(refs == 1) & (sizes == 1)] = 1
        assert not scales.isnan().any(), "scales has nan"
        assert not scales.isinf().any(), "scales has inf"

        # Center alignment: apply to h,w only (not time)
        centers = (sizes - 1) / 2  # [L,3]
        centers[:, 0] = 0  # Do not center the time dimension
        coords_xyz = coords_xyz - centers  # [L,3]

        # Project to frequency bands in one shot: [L,3,B]
        proj = coords_xyz.unsqueeze(-1) * scales.unsqueeze(-1) * self.bands

        # Compute sin & cos and concatenate
        sin_proj = proj.sin()  # [L,3,B]
        cos_proj = proj.cos()

        # Concatenate along the coordinate axis then flatten: [L, 2*3*B]
        return torch.cat((sin_proj, cos_proj), dim=1).flatten(1)

    def reset_parameters(self):
        # Restore the (possibly learnable) bands to their default schedule.
        bands = self.get_default_bands()
        self.bands.copy_(bands)

    def get_default_bands(self):
        if self.in_pixels:
            raise NotImplementedError("in_pixels are not implemented yet")
        else:
            # dim // 8 inverse-frequency bands per coordinate axis (step=1).
            bands = freq_bands(self.dim // 8, temperature=self.temperature, step=1, device=self.device).to(self.dtype)
        return bands
242
+
243
+
244
class MultiModalityRMSNorm(nn.Module):
    """RMSNorm with one learned gain per modality ("expert").

    ``weight`` stores ``num_modality`` gains of size ``dim`` concatenated along
    dim 0; a stored gain ``w`` scales the normalized activation by ``w + 1``,
    so the zero initialization is an identity scaling.
    """

    __constants__ = ["dim", "eps", "num_modality"]
    dim: int
    eps: float
    num_modality: int

    def __init__(self, dim: int, eps: float = 1e-6, device: torch.device | None = None, num_modality: int = 1):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.num_modality = num_modality

        self.weight = torch.nn.Parameter(torch.zeros(dim * num_modality, device=device, dtype=torch.float32))
        # Bind the appropriate forward once so the per-call dispatch is free.
        if num_modality > 1:
            self.forward = self.forward_multi_experts
        else:
            self.forward = self.forward_single_expert

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.zeros_(self.weight)

    def rms(self, x: torch.Tensor) -> torch.Tensor:
        """Root-mean-square normalize ``x`` in float32 (gain not applied)."""
        t = x.float()
        return t * torch.rsqrt(torch.mean(t**2, dim=-1, keepdim=True) + self.eps)

    def forward_multi_experts(self, x: torch.Tensor, modality_dispatcher: ModalityDispatcher) -> torch.Tensor:
        original_dtype = x.dtype
        t = self.rms(x)

        # Apply each modality's gain to that modality's slice of the sequence.
        weight_chunked = self.weight.chunk(self.num_modality, dim=0)
        t_list = modality_dispatcher.dispatch(t)
        for i in range(self.num_modality):
            t_list[i] = t_list[i] * (weight_chunked[i] + 1)
        t = modality_dispatcher.undispatch(*t_list)

        return t.to(original_dtype)

    def forward_single_expert(self, x: torch.Tensor, modality_dispatcher: Optional[ModalityDispatcher] = None) -> torch.Tensor:
        # Same math as the multi-expert path, with a single shared gain;
        # ``modality_dispatcher`` is accepted for interface parity and ignored.
        return (self.rms(x) * (self.weight + 1)).to(x.dtype)
288
+
289
+
290
+ class _BF16ComputeLinear(torch.autograd.Function):
291
+ @staticmethod
292
+ def forward(
293
+ ctx,
294
+ input: torch.Tensor,
295
+ weight: torch.Tensor,
296
+ bias: Optional[torch.Tensor],
297
+ output_dtype: Optional[torch.dtype],
298
+ compute_dtype: torch.dtype = torch.bfloat16,
299
+ ):
300
+ # Convert input to specified input data type
301
+ input_cast = input.to(compute_dtype)
302
+ # Convert weight to computation data type
303
+ weight_cast = weight.to(compute_dtype)
304
+ # Perform linear operation
305
+ output = torch.matmul(input_cast, weight_cast.t())
306
+
307
+ # Add bias if present
308
+ if bias is not None:
309
+ bias_cast = bias.to(compute_dtype)
310
+ output = output + bias_cast
311
+ else:
312
+ bias_cast = None
313
+
314
+ # Convert output to specified output data type
315
+ return output.to(output_dtype)
316
+
317
+
318
class BaseLinear(nn.Module):
    """Bias-optional linear layer with bf16-stored weights and bf16 compute.

    NOTE(review): the ``dtype`` constructor argument is accepted but ignored —
    parameters are always allocated in bfloat16. ``num_layers_for_initialization``
    is stored but never read in this file (presumably a training-time artifact).
    With ``num_experts`` > 1, expert weights are stacked along dim 0; this base
    class ignores the expert structure (see NativeMoELinear for routing).
    """

    __constants__ = ["in_features", "out_features", "num_layers", "num_experts"]
    in_features: int
    out_features: int
    num_layers_for_initialization: int
    num_experts: int
    weight: Tensor

    def __init__(
        self, in_features, out_features, num_layers_for_initialization, num_experts, bias=True, device=None, dtype=None
    ):
        super().__init__()
        # Weights are always bf16 regardless of the requested dtype.
        factory_kwargs = {"device": device, "dtype": torch.bfloat16}
        self.in_features = in_features
        self.out_features = out_features
        self.num_layers_for_initialization = num_layers_for_initialization
        self.num_experts = num_experts
        self.use_bias = bias
        self.weight = Parameter(torch.empty((out_features * num_experts, in_features), **factory_kwargs))
        if bias:
            self.bias = Parameter(torch.empty(out_features * num_experts, **factory_kwargs))
        else:
            self.register_parameter("bias", None)

    def forward(
        self,
        input: torch.Tensor,
        output_dtype: Optional[torch.dtype] = None,
        modality_dispatcher: Optional[ModalityDispatcher] = None,
    ) -> torch.Tensor:
        """Single-expert path; ``modality_dispatcher`` is accepted for interface
        parity with NativeMoELinear and ignored here."""
        output_dtype = input.dtype if output_dtype is None else output_dtype
        return _BF16ComputeLinear.apply(input, self.weight, self.bias, output_dtype, torch.bfloat16)
350
+
351
+
352
class NativeMoELinear(BaseLinear):
    """Per-modality expert linear: each modality's tokens go through that
    modality's slice of the stacked expert weights."""

    def forward(
        self,
        input: torch.Tensor,
        output_dtype: Optional[torch.dtype] = None,
        modality_dispatcher: Optional[ModalityDispatcher] = None,
    ) -> torch.Tensor:
        """``input`` must be in modality-permuted order; ``modality_dispatcher``
        is required on this path (dispatch/undispatch route tokens to experts)."""
        output_dtype = input.dtype if output_dtype is None else output_dtype

        # Split tokens by modality; weights/bias are stacked per-expert on dim 0.
        input_list = modality_dispatcher.dispatch(input)  # type: ignore
        weight_chunked = self.weight.chunk(self.num_experts, dim=0)

        if self.bias is not None:
            bias_chunked = self.bias.chunk(self.num_experts, dim=0)

        for i in range(self.num_experts):
            input_list[i] = _BF16ComputeLinear.apply(
                input_list[i],
                weight_chunked[i],
                bias_chunked[i] if self.bias is not None else None,
                output_dtype,
                torch.bfloat16,
            )
        return modality_dispatcher.undispatch(*input_list)  # type: ignore
376
+
377
+
378
def create_linear(
    in_features, out_features, num_layers=1, num_experts=1, bias=True, device=None, dtype=None
) -> BaseLinear | NativeMoELinear:
    """Build a plain dense linear for one expert, or a per-modality MoE linear otherwise."""
    linear_cls = BaseLinear if num_experts == 1 else NativeMoELinear
    return linear_cls(in_features, out_features, num_layers, num_experts, bias, device, dtype)
385
+
386
+
387
# Optional attention backends, detected without importing them:
# magi_attention (flexible range attention kernels) and FlashAttention-3
# ("flash_attn_interface"). Both are only used on Hopper GPUs (see below).
HAS_MAGI_ATTENTION = importlib.util.find_spec("magi_attention") is not None
HAS_FA3 = importlib.util.find_spec("flash_attn_interface") is not None
389
+
390
+
391
+ @magi_register_custom_op(name="infra::flash_attn_func", is_subgraph_boundary=True)
392
+ def flash_attn_func(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
393
+ if HAS_FA3 and is_hopper_arch():
394
+ from flash_attn_interface import flash_attn_func as fa3_flash_attn_func
395
+
396
+ return fa3_flash_attn_func(query, key, value)
397
+ else:
398
+ from flash_attn.flash_attn_interface import flash_attn_func as fa2_flash_attn_func
399
+
400
+ return fa2_flash_attn_func(query, key, value)
401
+
402
+
403
+ def _split_q_range_with_no_overlap(
404
+ q_ranges: torch.Tensor, k_ranges: torch.Tensor
405
+ ) -> Tuple[List[List[int]], List[List[List[int]]]]:
406
+ range_boundary = torch.unique(q_ranges, sorted=True).tolist()
407
+ candidates = [[start, end, []] for start, end in zip(range_boundary[:-1], range_boundary[1:])]
408
+ q_ranges = q_ranges.tolist()
409
+ k_ranges = k_ranges.tolist()
410
+ for q_range, k_range in zip(q_ranges, k_ranges):
411
+ q_start, q_end = q_range
412
+ for q_range_cand in candidates:
413
+ if q_start <= q_range_cand[0] and q_range_cand[1] <= q_end:
414
+ q_range_cand[2].append(k_range)
415
+ q_ranges_out = []
416
+ k_ranges_out = []
417
+ for q_range_cand in candidates:
418
+ if len(q_range_cand[2]) > 0:
419
+ q_ranges_out.append(q_range_cand[0:2])
420
+ k_ranges_out.append(q_range_cand[2])
421
+ return q_ranges_out, k_ranges_out
422
+
423
+
424
def _flash_attn_with_correction(
    query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, q_ranges: List[List[int]], k_range_list: List[List[List[int]]]
):
    """Attend each disjoint q segment to several k ranges and merge the partial
    results with log-sum-exp (LSE) correction (online-softmax style).

    Args:
        query/key/value: (seqlen, nheads, headdim) tensors.
        q_ranges: disjoint [start, end) q segments.
        k_range_list: for each q segment, the [start, end) k ranges it attends to.

    Returns:
        (output, output_lse): merged attention output shaped like ``query`` and
        the per-(token, head) LSE in float32.
    """
    output = torch.zeros_like(query)
    output_lse = torch.zeros((query.shape[0], query.shape[1]), dtype=torch.float32, device=query.device)

    from flash_attn.flash_attn_interface import flash_attn_func

    for q_range, k_ranges in zip(q_ranges, k_range_list):
        q_start, q_end = q_range
        qo_out, qo_lse = None, None
        for k_range in k_ranges:
            k_start, k_end = k_range
            # flash_attn expects a batch dim; run one (q segment, k range) pair
            # and request the LSE (via return_attn_probs) for the merge below.
            cur_qo_out, cur_qo_lse, _ = flash_attn_func(
                query[q_start:q_end].unsqueeze(0),
                key[k_start:k_end].unsqueeze(0),
                value[k_start:k_end].unsqueeze(0),
                return_attn_probs=True,
            )
            cur_qo_out, cur_qo_lse = cur_qo_out.squeeze(0), cur_qo_lse.squeeze(0)

            if qo_out is None:
                qo_out = cur_qo_out
                qo_lse = cur_qo_lse
            else:
                # NOTE(review): +inf LSE presumably marks fully-masked rows;
                # flipping to -inf gives them zero weight in the merge — confirm
                # against the flash_attn LSE convention.
                qo_lse[qo_lse == torch.inf] = -torch.inf
                cur_qo_lse[cur_qo_lse == torch.inf] = -torch.inf
                # Numerically-stable merge of two softmax partials.
                max_lse = torch.max(qo_lse, cur_qo_lse)
                qo_se, cur_qo_se = torch.exp(qo_lse - max_lse), torch.exp(cur_qo_lse - max_lse)
                sum_se = qo_se + cur_qo_se
                qo_scale, cur_qo_scale = qo_se / sum_se, cur_qo_se / sum_se

                # LSE here is (nheads, seg_len); permute to broadcast over heads.
                qo_out = qo_out * qo_scale.permute(1, 0).unsqueeze(-1) + cur_qo_out * cur_qo_scale.permute(1, 0).unsqueeze(-1)
                qo_lse = torch.log(sum_se) + max_lse

        output[q_start:q_end] = qo_out
        output_lse[q_start:q_end, :] = qo_lse.permute(1, 0)
    return output, output_lse
462
+
463
+
464
def _custom_flex_flash_attn_func(
    query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, q_ranges: torch.Tensor, k_ranges: torch.Tensor, **kwargs
):
    """Pure-PyTorch fallback for flex attention: split q into disjoint segments,
    then attend each segment to its k ranges and merge with LSE correction."""
    disjoint_q, per_segment_k = _split_q_range_with_no_overlap(q_ranges, k_ranges)
    return _flash_attn_with_correction(query, key, value, disjoint_q, per_segment_k)
469
+
470
+
471
+ def _flex_flash_attn_func_infer_output_meta(
472
+ query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, q_ranges: torch.Tensor, k_ranges: torch.Tensor
473
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
474
+ output = torch.empty_like(query)
475
+ output_lse = torch.empty((query.shape[0], query.shape[1]), dtype=torch.float32, device=query.device)
476
+ return output, output_lse
477
+
478
+
479
@magi_register_custom_op(
    name="infra::flex_flash_attn_func",
    mutates_args=(),
    infer_output_meta_fn=_flex_flash_attn_func_infer_output_meta,
    is_subgraph_boundary=True,
)
def flex_flash_attn_func(
    query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, q_ranges: torch.Tensor, k_ranges: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Range-based ("flex") attention: q_ranges[i] attends to k_ranges[i].

    Dispatches to the magi_attention kernel on Hopper GPUs when available,
    otherwise to the pure-PyTorch fallback with LSE merging.
    Returns (output, lse).
    """
    if HAS_MAGI_ATTENTION and is_hopper_arch():
        from magi_attention.api import flex_flash_attn_func as magi_flex_flash_attn_func

        return magi_flex_flash_attn_func(query, key, value, q_ranges, k_ranges)
    else:
        return _custom_flex_flash_attn_func(query, key, value, q_ranges, k_ranges)
494
+
495
+
496
+ def _attention_with_cp_infer_output_meta(q: torch.Tensor, *args, **kwargs) -> torch.Tensor:
497
+ return torch.empty_like(q, dtype=torch.bfloat16).squeeze(0)
498
+
499
+
500
@magi_register_custom_op(
    name="infra::flash_attn_with_cp",
    mutates_args=(),
    infer_output_meta_fn=_attention_with_cp_infer_output_meta,
    is_subgraph_boundary=True,
)
def flash_attn_with_cp(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, cp_split_sizes: List[int]) -> torch.Tensor:
    """Full attention under context parallelism (Ulysses-style all-to-all).

    With cp_world_size > 1: exchange so each rank holds a head shard of the
    full sequence, attend, then exchange back to full heads on the local
    sequence shard. q/k/v carry a leading batch dim of 1; the returned tensor
    has it squeezed away.
    """
    q, k, v = q.to(torch.bfloat16), k.to(torch.bfloat16), v.to(torch.bfloat16)

    # Imported lazily to keep this module importable without the distributed stack.
    from inference.infra.distributed import get_cp_group, get_cp_world_size
    from inference.infra.parallelism.all_to_all_primitive import batch_scatter_head_gather_seqlen, scatter_seqlen_gather_head

    if get_cp_world_size() > 1:
        q, k, v = batch_scatter_head_gather_seqlen([q.squeeze(0), k.squeeze(0), v.squeeze(0)], cp_split_sizes, get_cp_group())
        q = q.unsqueeze(0)
        k = k.unsqueeze(0)
        v = v.unsqueeze(0)

    self_attn_out = torch.ops.infra.flash_attn_func(q, k, v).squeeze(0)

    if get_cp_world_size() > 1:
        # Inverse exchange, then fold the gathered cp chunks back into the head dim.
        self_attn_out = scatter_seqlen_gather_head(self_attn_out, cp_split_sizes, get_cp_group(), async_op=False)
        self_attn_out = rearrange(self_attn_out, "(cp sq) hn hd -> sq (cp hn) hd", cp=get_cp_world_size())

    return self_attn_out
525
+
526
+
527
@magi_register_custom_op(
    name="infra::flex_flash_attn_with_cp",
    mutates_args=(),
    infer_output_meta_fn=_attention_with_cp_infer_output_meta,
    is_subgraph_boundary=True,
)
def flex_flash_attn_with_cp(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    q_ranges: torch.Tensor,
    k_ranges: torch.Tensor,
    cp_split_sizes: List[int],
) -> torch.Tensor:
    """Range-restricted (flex) attention under context parallelism.

    Same head/sequence all-to-all exchange as flash_attn_with_cp, but the inner
    kernel is flex_flash_attn_func with explicit q/k ranges. Inputs carry a
    leading batch dim of 1 which is squeezed before the exchange.
    """
    q, k, v = q.to(torch.bfloat16).squeeze(0), k.to(torch.bfloat16).squeeze(0), v.to(torch.bfloat16).squeeze(0)

    # Imported lazily to keep this module importable without the distributed stack.
    from inference.infra.distributed import get_cp_group, get_cp_world_size
    from inference.infra.parallelism.all_to_all_primitive import batch_scatter_head_gather_seqlen, scatter_seqlen_gather_head

    if get_cp_world_size() > 1:
        q, k, v = batch_scatter_head_gather_seqlen([q, k, v], cp_split_sizes, get_cp_group())

    out, _ = torch.ops.infra.flex_flash_attn_func(q, k, v, q_ranges=q_ranges, k_ranges=k_ranges)

    if get_cp_world_size() > 1:
        # Inverse exchange, then fold the gathered cp chunks back into the head dim.
        out = scatter_seqlen_gather_head(out, cp_split_sizes, get_cp_group(), async_op=False)
        out = rearrange(out, "(cp sq) hn hd -> sq (cp hn) hd", cp=get_cp_world_size())

    return out
556
+
557
+
558
@dataclass
class AttentionConfig:
    """Configuration for one Attention block."""

    # Transformer width and Q/KV head layout (num_heads_kv < num_heads_q => GQA).
    hidden_size: int
    num_heads_q: int
    num_heads_kv: int
    head_dim: int
    params_dtype: torch.dtype
    # Carried through from the model config; not read in this file's forward path.
    checkpoint_qk_layernorm_rope: bool
    # Number of modality experts for norms/projections (1 or 3 in this model).
    num_modality: int
    num_layers: int
    # Use range-restricted (flex) attention instead of full attention.
    use_local_attn: bool = False
    # Multiply attention output by a per-head sigmoid gate taken from the QKV projection.
    enable_attn_gating: bool = False
570
+
571
+
572
class Attention(torch.nn.Module):
    """Multi-modality self-attention block (pre-norm, RoPE, optional head gating).

    Q/K/V come from a fused (possibly per-modality MoE) projection. Tokens
    travel in modality-permuted order for the expert linears and are restored
    to sequence order around the attention kernel.
    """

    config: AttentionConfig

    def __init__(self, config: AttentionConfig):
        super().__init__()
        self.config = config

        self.pre_norm = MultiModalityRMSNorm(config.hidden_size, eps=1e-6, num_modality=config.num_modality)
        # Extra fused output channels used for per-head sigmoid gating (0 = disabled).
        self.gating_size = config.num_heads_q if config.enable_attn_gating else 0

        self.linear_qkv = create_linear(
            config.hidden_size,
            config.num_heads_q * config.head_dim + config.num_heads_kv * config.head_dim * 2 + self.gating_size,
            num_experts=config.num_modality,
            bias=False,
            dtype=config.params_dtype,
            num_layers=config.num_layers,
        )
        self.linear_proj = create_linear(
            config.num_heads_q * config.head_dim,
            config.hidden_size,
            bias=False,
            num_experts=config.num_modality,
            dtype=config.params_dtype,
            num_layers=config.num_layers,
        )
        self.q_norm = MultiModalityRMSNorm(config.head_dim, num_modality=config.num_modality)
        self.k_norm = MultiModalityRMSNorm(config.head_dim, num_modality=config.num_modality)

        self.q_size = config.num_heads_q * config.head_dim
        self.kv_size = config.num_heads_kv * config.head_dim

    def reset_parameters(self):
        if hasattr(self.linear_proj, "reset_parameters_output_layer"):
            self.linear_proj.reset_parameters_output_layer()

    def forward(
        self,
        hidden_states: torch.Tensor,
        rope: torch.Tensor,
        permute_mapping: torch.Tensor,
        inv_permute_mapping: torch.Tensor,
        varlen_handler: VarlenHandler,
        local_attn_handler: FFAHandler,
        modality_dispatcher: ModalityDispatcher,
        cp_split_sizes: List[int],
    ) -> torch.Tensor:
        """Self-attention over modality-permuted hidden states [L, hidden_size].

        NOTE(review): ``varlen_handler`` is accepted but not used in this body.
        """
        hidden_states = self.pre_norm(hidden_states, modality_dispatcher=modality_dispatcher).to(torch.bfloat16)
        qkv: torch.Tensor = self.linear_qkv(hidden_states, modality_dispatcher=modality_dispatcher).to(torch.float32)

        q, k, v, g = torch.split(qkv, [self.q_size, self.kv_size, self.kv_size, self.gating_size], dim=1)
        q = q.view(-1, self.config.num_heads_q, self.config.head_dim)
        k = k.view(-1, self.config.num_heads_kv, self.config.head_dim)
        v = v.view(-1, self.config.num_heads_kv, self.config.head_dim)
        if self.config.enable_attn_gating:
            # Only reshape the gate when gating is enabled: with gating off the
            # slice has 0 channels, and view(..., -1) on a 0-element tensor
            # raises ("unspecified dimension size -1 can be any value").
            g = g.view(k.shape[0], self.config.num_heads_q, -1)

        q = self.q_norm(q, modality_dispatcher=modality_dispatcher)
        k = self.k_norm(k, modality_dispatcher=modality_dispatcher)

        # Restore sequence order: RoPE and attention operate on unpermuted tokens.
        q = ModalityDispatcher.inv_permute(q, inv_permute_mapping).unsqueeze(0)
        k = ModalityDispatcher.inv_permute(k, inv_permute_mapping).unsqueeze(0)
        v = ModalityDispatcher.inv_permute(v, inv_permute_mapping).unsqueeze(0)

        sin_emb, cos_emb = rope.tensor_split(2, -1)
        q = apply_rotary_emb_torch(q, cos_emb, sin_emb)
        k = apply_rotary_emb_torch(k, cos_emb, sin_emb)

        if self.config.use_local_attn:
            self_attn_out = flex_flash_attn_with_cp(
                q, k, v, local_attn_handler.q_ranges, local_attn_handler.k_ranges, cp_split_sizes
            )
        else:
            self_attn_out = flash_attn_with_cp(q, k, v, cp_split_sizes)
        # Back to modality-permuted order for the (per-modality) output projection.
        self_attn_out = ModalityDispatcher.permute(self_attn_out, permute_mapping)

        if self.config.enable_attn_gating:
            # g is in permuted order (it came from the permuted qkv), matching
            # self_attn_out after the permute above.
            self_attn_out = self_attn_out * torch.sigmoid(g)

        self_attn_out = self_attn_out.view(-1, self.config.num_heads_q * self.config.head_dim).to(torch.bfloat16)
        out = self.linear_proj(self_attn_out, modality_dispatcher=modality_dispatcher)
        return out
653
+
654
+
655
@dataclass
class MLPConfig:
    """Configuration for one MLP block."""

    # Transformer width and feed-forward expansion width.
    hidden_size: int
    intermediate_size: int
    # Activation selector consumed by create_activation_func.
    activation_type: MLPActivationType
    params_dtype: torch.dtype
    # Number of modality experts sharing this MLP (1 = plain dense MLP).
    num_modality: int = 1
    num_layers: int = 1
    # True when the activation is gated (up projection emits value + gate, 2x width).
    gated_act: bool = False
664
+
665
+
666
class MLP(torch.nn.Module):
    """Pre-norm feed-forward block with optional gated activation and
    per-modality experts."""

    config: MLPConfig

    def __init__(self, config: MLPConfig):
        super().__init__()
        self.pre_norm = MultiModalityRMSNorm(config.hidden_size, num_modality=config.num_modality)
        # Gated activations (e.g. SwiGLU) need the up projection to emit both
        # the value and the gate, hence twice the intermediate width.
        up_width = config.intermediate_size * 2 if config.gated_act else config.intermediate_size

        shared_kwargs = dict(
            bias=False,
            dtype=config.params_dtype,
            num_layers=config.num_layers,
            num_experts=config.num_modality,
        )
        self.up_gate_proj = create_linear(config.hidden_size, up_width, **shared_kwargs)
        self.down_proj = create_linear(config.intermediate_size, config.hidden_size, **shared_kwargs)
        self.activation_func = create_activation_func(config.activation_type)

    def forward(self, x: torch.Tensor, modality_dispatcher: ModalityDispatcher) -> torch.Tensor:
        """up-project (bf16) -> activate (fp32 in, bf16 out) -> down-project (fp32 out)."""
        normed = self.pre_norm(x, modality_dispatcher=modality_dispatcher).to(torch.bfloat16)
        projected = self.up_gate_proj(normed, modality_dispatcher=modality_dispatcher).to(torch.float32)
        activated = self.activation_func(projected).to(torch.bfloat16)
        return self.down_proj(activated, modality_dispatcher=modality_dispatcher).to(torch.float32)

    def extra_repr(self) -> str:
        return f"{self.up_gate_proj.weight.shape=}, {self.down_proj.weight.shape=}"
702
+
703
+
704
@dataclass
class AdapterConfig:
    """Configuration for the input Adapter."""

    # Transformer width; head count only sizes the per-head RoPE embedding.
    hidden_size: int
    num_attention_heads: int
    # Raw per-token channel counts for each modality's input features.
    text_in_channels: int
    video_in_channels: int
    audio_in_channels: int
    params_dtype: torch.dtype
713
+
714
class Adapter(torch.nn.Module):
    """Input adapter: embeds per-token features into the transformer width and
    builds the rotary embedding table from the token coordinates."""

    config: AdapterConfig

    def __init__(self, config: AdapterConfig):
        super().__init__()
        self.config = config
        # One float32 linear embedder per modality.
        self.video_embedder = nn.Linear(config.video_in_channels, config.hidden_size, bias=True, dtype=torch.float32)
        self.text_embedder = nn.Linear(config.text_in_channels, config.hidden_size, bias=True, dtype=torch.float32)
        self.audio_embedder = nn.Linear(config.audio_in_channels, config.hidden_size, bias=True, dtype=torch.float32)
        # RoPE operates per attention head, hence hidden_size // num_heads channels.
        self.rope = ElementWiseFourierEmbed(config.hidden_size // config.num_attention_heads, in_pixels=False, learnable=False)

    def forward(
        self,
        x: torch.Tensor,
        coords_mapping: torch.Tensor,
        video_mask: torch.Tensor,
        audio_mask: torch.Tensor,
        text_mask: torch.Tensor,
    ):
        """Embed each token via its modality's embedder.

        Args:
            x: [L, C] packed token features; only each modality's leading
               ``<modality>_in_channels`` columns are read for its tokens.
            coords_mapping: [L, 9] coordinates for ElementWiseFourierEmbed.
            video_mask/audio_mask/text_mask: boolean [L] masks. Assumed disjoint
                (later writes overwrite earlier ones; video wins on overlap).

        Returns:
            (embedded [L, hidden_size] tensor, rope embedding table).
        """
        rope = self.rope(coords_mapping)
        output_x = torch.zeros(x.shape[0], self.config.hidden_size, device=x.device, dtype=x.dtype)
        output_x[text_mask] = self.text_embedder(x[text_mask, : self.config.text_in_channels])
        output_x[audio_mask] = self.audio_embedder(x[audio_mask, : self.config.audio_in_channels])
        output_x[video_mask] = self.video_embedder(x[video_mask, : self.config.video_in_channels])
        return output_x, rope
739
+
740
+
741
class TransFormerLayer(torch.nn.Module):
    """One pre-norm transformer layer: attention + MLP, each with a residual add.

    Per-layer variations driven by the model config:
      * layers in ``mm_layers`` get 3 modality experts, others share 1;
      * layers in ``local_attn_layers`` use range-restricted (flex) attention;
      * layers in ``post_norm_layers`` additionally normalize each sublayer
        output before the residual add;
      * layers in ``gelu7_layers`` use a plain GELU MLP; the rest use a gated
        SwiGLU MLP with a correspondingly reduced intermediate size.
    """

    def __init__(self, config: Any, layer_idx: int):
        super().__init__()
        num_modality = 3 if layer_idx in config.mm_layers else 1
        use_local_attn = layer_idx in config.local_attn_layers
        self.post_norm = layer_idx in config.post_norm_layers
        attention_config = AttentionConfig(
            hidden_size=config.hidden_size,
            num_heads_q=config.num_heads_q,
            num_heads_kv=config.num_heads_kv,
            head_dim=config.head_dim,
            params_dtype=config.params_dtype,
            checkpoint_qk_layernorm_rope=config.checkpoint_qk_layernorm_rope,
            num_modality=num_modality,
            num_layers=config.num_layers,
            use_local_attn=use_local_attn,
            enable_attn_gating=config.enable_attn_gating,
        )
        self.attention: Attention = Attention(attention_config)

        activation_type = MLPActivationType.GELU7 if layer_idx in config.gelu7_layers else MLPActivationType.SWIGLU7
        if activation_type == MLPActivationType.SWIGLU7:
            gated_act = True
            # 2/3 of the 4x expansion (gated-MLP convention), rounded down to a multiple of 4.
            intermediate_size = int(config.hidden_size * 4 * 2 / 3) // 4 * 4
        else:
            gated_act = False
            intermediate_size = config.hidden_size * 4
        mlp_config = MLPConfig(
            hidden_size=config.hidden_size,
            intermediate_size=intermediate_size,
            activation_type=activation_type,
            params_dtype=config.params_dtype,
            num_modality=num_modality,
            num_layers=config.num_layers,
            gated_act=gated_act,
        )
        self.mlp: MLP = MLP(mlp_config)
        if self.post_norm:
            self.attn_post_norm = MultiModalityRMSNorm(config.hidden_size, num_modality=num_modality)
            self.mlp_post_norm = MultiModalityRMSNorm(config.hidden_size, num_modality=num_modality)

    def forward(
        self,
        hidden_states: torch.Tensor,
        rope: torch.Tensor,
        permute_mapping: torch.Tensor,
        inv_permute_mapping: torch.Tensor,
        varlen_handler: VarlenHandler,
        local_attn_handler: FFAHandler,
        modality_dispatcher: ModalityDispatcher,
        cp_split_sizes: List[int],
    ) -> torch.Tensor:
        """x -> x + [post_norm](attn(x)) -> x + [post_norm](mlp(x))."""
        attn_out = self.attention(
            hidden_states,
            rope,
            permute_mapping,
            inv_permute_mapping,
            varlen_handler,
            local_attn_handler,
            modality_dispatcher,
            cp_split_sizes,
        )
        if self.post_norm:
            attn_out = self.attn_post_norm(attn_out, modality_dispatcher=modality_dispatcher)
        hidden_states = hidden_states + attn_out

        mlp_out = self.mlp(hidden_states, modality_dispatcher)
        if self.post_norm:
            mlp_out = self.mlp_post_norm(mlp_out, modality_dispatcher=modality_dispatcher)
        hidden_states = hidden_states + mlp_out
        return hidden_states
812
+
813
+
814
# Module-level flag consumed by config_patch below.
# NOTE(review): assumes models are constructed base-first, SR second — the
# first config_patch call keeps the config, later calls force full offload.
is_base_model = True


def config_patch(compile_config: CompileConfig) -> CompileConfig:
    """Per-model compile-config hook passed to @magi_compile (see TransformerBlock)."""
    global is_base_model
    if is_base_model:
        # First compiled model (assumed the base DiT): keep its offload settings.
        is_base_model = False
    else:
        # Fully offload SR model for memory-constrained GPU
        compile_config.offload_config.gpu_resident_weight_ratio = 0.0
    return compile_config
825
+
826
+
827
@magi_compile(config_patch=config_patch)
class TransformerBlock(torch.nn.Module):
    """The compiled stack of transformer layers (everything between the input
    adapter and the output heads)."""

    def __init__(self, model_config: Any):
        super().__init__()
        self.layers: list[TransFormerLayer] = nn.ModuleList()
        for layer_idx in range(model_config.num_layers):
            self.layers.append(TransFormerLayer(model_config, layer_idx))

    def forward(
        self,
        x: torch.Tensor,
        rope: torch.Tensor,
        permute_mapping: torch.Tensor,
        inv_permute_mapping: torch.Tensor,
        varlen_handler: VarlenHandler,
        local_attn_handler: FFAHandler,
        modality_dispatcher: ModalityDispatcher,
        cp_split_sizes: List[int],
    ) -> torch.Tensor:
        # Apply layers sequentially; all share the same auxiliary inputs.
        for _, layer in enumerate(self.layers):
            x = layer(
                x,
                rope,
                permute_mapping,
                inv_permute_mapping,
                varlen_handler,
                local_attn_handler,
                modality_dispatcher,
                cp_split_sizes,
            )
        return x
858
+
859
+
860
@dataclass
class TransformerConfig:
    """Top-level DiT model configuration."""

    # Width and raw input channel counts for the three modalities.
    hidden_size: int
    video_in_channels: int
    audio_in_channels: int
    text_in_channels: int
    params_dtype: torch.dtype
    # Dtype for final norms/heads (DiTModel fixes this to float32).
    post_process_dtype: torch.dtype
869
+
870
class DiTModel(torch.nn.Module):
    """Multi-modal diffusion transformer: adapter -> compiled block -> per-modality heads.

    Video/audio/text tokens are packed into one sequence. The adapter embeds
    them, the transformer block processes them (with Ulysses sequence
    parallelism and modality-permuted MoE linears), and separate norm+linear
    heads project video and audio tokens back to their latent channel counts.
    Text tokens get no output head and stay zero in the output.
    """

    config: TransformerConfig

    def __init__(self, model_config: Any):
        super().__init__()
        self.config = TransformerConfig(
            hidden_size=model_config.hidden_size,
            video_in_channels=model_config.video_in_channels,
            audio_in_channels=model_config.audio_in_channels,
            text_in_channels=model_config.text_in_channels,
            params_dtype=model_config.params_dtype,
            post_process_dtype=torch.float32,
        )
        adapter_config = AdapterConfig(
            hidden_size=model_config.hidden_size,
            num_attention_heads=model_config.num_heads_q,
            text_in_channels=model_config.text_in_channels,
            video_in_channels=model_config.video_in_channels,
            audio_in_channels=model_config.audio_in_channels,
            params_dtype=torch.float32,
        )
        self.adapter: Adapter = Adapter(adapter_config)
        self.block: TransformerBlock = TransformerBlock(model_config=model_config)
        # Output heads are kept in float32.
        self.final_norm_video = MultiModalityRMSNorm(self.config.hidden_size)
        self.final_norm_audio = MultiModalityRMSNorm(self.config.hidden_size)
        self.final_linear_video = nn.Linear(
            self.config.hidden_size, self.config.video_in_channels, bias=False, dtype=torch.float32
        )
        self.final_linear_audio = nn.Linear(
            self.config.hidden_size, self.config.audio_in_channels, bias=False, dtype=torch.float32
        )

    def forward(
        self,
        x: torch.Tensor,
        coords_mapping: torch.Tensor,
        modality_mapping: torch.Tensor,
        varlen_handler: VarlenHandler,
        local_attn_handler: FFAHandler,
    ):
        """Process one packed multi-modal token sequence.

        Args:
            x: [L, C] packed token features (C covers the widest modality).
            coords_mapping: [L, 9] per-token coordinates used for RoPE.
            modality_mapping: [L] Modality id per token.
            varlen_handler / local_attn_handler: attention range metadata.

        Returns:
            [L, max(video_in_channels, audio_in_channels)] tensor: video/audio
            tokens hold their head outputs in the leading channels, text rows zero.
        """
        # Shard the sequence across context-parallel (Ulysses) ranks.
        x = ulysses_scheduler().dispatch(x)
        coords_mapping = ulysses_scheduler().dispatch(coords_mapping)
        modality_mapping = ulysses_scheduler().dispatch(modality_mapping)
        cp_split_sizes = ulysses_scheduler().cp_split_sizes

        # Group tokens by modality so the MoE linears can slice contiguously.
        modality_dispatcher = ModalityDispatcher(modality_mapping, 3)
        permute_mapping, inv_permute_mapping = modality_dispatcher.permute_mapping, modality_dispatcher.inv_permute_mapping
        video_mask = modality_mapping == Modality.VIDEO
        audio_mask = modality_mapping == Modality.AUDIO
        text_mask = modality_mapping == Modality.TEXT

        x, rope = self.adapter(x, coords_mapping, video_mask, audio_mask, text_mask)
        x = x.to(self.config.params_dtype)
        x = ModalityDispatcher.permute(x, permute_mapping)
        x = self.block(
            x,
            rope,
            permute_mapping=permute_mapping,
            inv_permute_mapping=inv_permute_mapping,
            varlen_handler=varlen_handler,
            local_attn_handler=local_attn_handler,
            modality_dispatcher=modality_dispatcher,
            cp_split_sizes=cp_split_sizes,
        )
        x = ModalityDispatcher.inv_permute(x, inv_permute_mapping)

        # Per-modality output heads (float32).
        x_video = x[video_mask].to(self.final_norm_video.weight.dtype)
        x_video = self.final_norm_video(x_video)
        x_video = self.final_linear_video(x_video)

        x_audio = x[audio_mask].to(self.final_norm_audio.weight.dtype)
        x_audio = self.final_norm_audio(x_audio)
        x_audio = self.final_linear_audio(x_audio)

        # Scatter head outputs back into one padded tensor; text rows remain zero.
        x_out = torch.zeros(
            x.shape[0], max(self.config.video_in_channels, self.config.audio_in_channels), device=x.device, dtype=x.dtype
        )
        x_out[video_mask, : self.config.video_in_channels] = x_video
        x_out[audio_mask, : self.config.audio_in_channels] = x_audio
        # Gather the full sequence back from the context-parallel shards.
        x_out = ulysses_scheduler().undispatch(x_out)
        return x_out
inference/model/sa_audio/__init__.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .sa_audio_model import SAAudioFeatureExtractor
2
+ from .sa_audio_module import (
3
+ AudioAutoencoder,
4
+ OobleckDecoder,
5
+ OobleckEncoder,
6
+ VAEBottleneck,
7
+ create_autoencoder_from_config,
8
+ create_bottleneck_from_config,
9
+ create_decoder_from_config,
10
+ create_encoder_from_config,
11
+ create_model_from_config,
12
+ )
13
+
14
+ __all__ = [
15
+ "SAAudioFeatureExtractor",
16
+ "AudioAutoencoder",
17
+ "OobleckDecoder",
18
+ "OobleckEncoder",
19
+ "VAEBottleneck",
20
+ "create_autoencoder_from_config",
21
+ "create_bottleneck_from_config",
22
+ "create_decoder_from_config",
23
+ "create_encoder_from_config",
24
+ "create_model_from_config",
25
+ ]
inference/model/sa_audio/sa_audio_model.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import os
17
+ from pathlib import Path
18
+
19
+ import torch
20
+ from safetensors.torch import load_file
21
+
22
+ # Set env vars for local T5 loading
23
+ os.environ["TRANSFORMERS_OFFLINE"] = "1"
24
+ os.environ["HF_HUB_OFFLINE"] = "1"
25
+
26
+ from .sa_audio_module import create_model_from_config
27
+
28
+ from inference.utils import print_rank_0
29
+
30
+
31
class SAAudioFeatureExtractor:
    """Stable Audio feature extractor that loads the VAE once and reuses it.

    Only the VAE ("pretransform") part of a Stable Audio checkpoint is loaded;
    the T5 text encoder and the diffusion model are skipped entirely.
    """

    def __init__(self, device, model_path):
        """Load the VAE from ``model_path`` (a local checkpoint dir) onto ``device``."""
        self.device = device
        self.vae_model, self.sample_rate = self._get_vae_only(model_path)
        self.resampler = None  # Will be initialized when needed

    def _get_vae_only(self, model_path):
        """Load only the VAE submodule; return ``(vae_model, sample_rate)``.

        Raises:
            ValueError: if ``model_path`` is not a local directory.
            RuntimeError: if loading from the local directory fails.
        """
        if not (isinstance(model_path, str) and Path(model_path).is_dir()):
            # Previously this only logged and implicitly returned None, which
            # made __init__ fail later with an opaque unpacking TypeError.
            raise ValueError(
                f"Non-local path is not supported in audio model loading: {model_path!r}"
            )
        try:
            # Read the full Stable Audio config and carve out the VAE section.
            model_config_path = os.path.join(model_path, "model_config.json")
            with open(model_config_path) as f:
                full_config = json.load(f)

            vae_config = full_config["model"]["pretransform"]["config"]
            sample_rate = full_config["sample_rate"]

            # Rebuild config structure expected by create_autoencoder_from_config
            autoencoder_config = {
                "model_type": "autoencoder",
                "sample_rate": sample_rate,  # sample_rate is required
                "model": vae_config,  # create_autoencoder_from_config expects key "model"
            }

            vae_model = create_model_from_config(autoencoder_config)

            weights_path = Path(model_path) / "model.safetensors"
            if not weights_path.exists():
                raise FileNotFoundError(f"Weight file does not exist: {weights_path}")

            # Load the full checkpoint and keep only the VAE weights
            # (prefix "pretransform.model."), stripping the prefix.
            full_state_dict = load_file(weights_path, device=str(self.device))
            prefix = "pretransform.model."
            vae_state_dict = {
                key[len(prefix):]: value for key, value in full_state_dict.items() if key.startswith(prefix)
            }

            # Report (a sample of) any key mismatches before the strict load below.
            model_keys = set(vae_model.state_dict().keys())
            vae_keys = set(vae_state_dict.keys())
            missing_keys = model_keys - vae_keys
            extra_keys = vae_keys - model_keys

            if missing_keys:
                print_rank_0(f"Missing keys ({len(missing_keys)}):")
                for key in list(missing_keys)[:5]:
                    print_rank_0(f" - {key}")

            if extra_keys:
                print_rank_0(f"Unexpected keys ({len(extra_keys)}):")
                for key in list(extra_keys)[:5]:
                    print_rank_0(f" + {key}")

            vae_model.load_state_dict(vae_state_dict)
            vae_model.to(self.device)

            return vae_model, sample_rate

        except Exception as e:
            print_rank_0(f"audio model loading failed: {e}")
            raise RuntimeError(
                "Failed to load VAE-only Stable Audio model from local path"
            ) from e

    def decode(self, latents):
        """Decode latents to a waveform (no grad)."""
        with torch.no_grad():
            return self.vae_model.decode(latents)

    def encode(self, waveform):
        """Encode a waveform to latents (no grad)."""
        with torch.no_grad():
            return self.vae_model.encode(waveform)
inference/model/sa_audio/sa_audio_module.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+
6
+ import math
7
+ from typing import Any, Dict, Literal
8
+
9
+ import torch
10
+ from torch import nn
11
+ from torch.nn import functional as F
12
+ from torch.nn.utils import weight_norm
13
+
14
+
15
+ def snake_beta(x, alpha, beta):
16
+ return x + (1.0 / (beta + 1e-9)) * torch.pow(torch.sin(x * alpha), 2)
17
+
18
+
19
class SnakeBeta(nn.Module):
    """Snake activation with per-channel trainable alpha/beta (BigVGAN-style).

    With ``alpha_logscale`` the parameters live in log space (initialised to
    zero, i.e. exp(0) == 1); otherwise they are stored directly and
    initialised to ``alpha``.
    """

    def __init__(
        self,
        in_features: int,
        alpha: float = 1.0,
        alpha_trainable: bool = True,
        alpha_logscale: bool = True,
    ):
        super().__init__()
        self.alpha_logscale = alpha_logscale
        if alpha_logscale:
            # Log-space init: zeros (the `* alpha` of the reference impl is a no-op).
            init = torch.zeros(in_features)
        else:
            init = torch.ones(in_features) * alpha
        self.alpha = nn.Parameter(init.clone())
        self.beta = nn.Parameter(init.clone())
        self.alpha.requires_grad = alpha_trainable
        self.beta.requires_grad = alpha_trainable

    def forward(self, x):
        # Broadcast (C,) -> (1, C, 1) against (B, C, T) inputs.
        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)
        beta = self.beta.unsqueeze(0).unsqueeze(-1)
        if self.alpha_logscale:
            alpha = torch.exp(alpha)
            beta = torch.exp(beta)
        return snake_beta(x, alpha, beta)
47
+
48
+
49
def vae_sample(mean, scale):
    """Reparameterised Gaussian sample plus KL(q || N(0, I)).

    `scale` is passed through softplus (floored at 1e-4) to get a strictly
    positive stdev; KL is summed over dim 1 and averaged over the batch.
    """
    stdev = F.softplus(scale) + 1e-4
    var = stdev.square()
    latents = mean + stdev * torch.randn_like(mean)
    kl_per_sample = (mean.square() + var - torch.log(var) - 1).sum(1)
    return latents, kl_per_sample.mean()
56
+
57
+
58
class VAEBottleneck(nn.Module):
    """Gaussian VAE bottleneck: splits channels into (mean, scale) and samples."""

    def __init__(self):
        super().__init__()

    def encode(self, x, return_info=False, **kwargs):
        mean, scale = x.chunk(2, dim=1)
        latents, kl = vae_sample(mean, scale)
        if return_info:
            return latents, {"kl": kl}
        return latents

    def decode(self, x):
        # Sampling is not invertible; latents pass through unchanged.
        return x
73
+
74
+
75
def WNConv1d(*args, **kwargs):
    """Conv1d wrapped with weight normalization."""
    conv = nn.Conv1d(*args, **kwargs)
    return weight_norm(conv)
77
+
78
+
79
def WNConvTranspose1d(*args, **kwargs):
    """ConvTranspose1d wrapped with weight normalization."""
    conv = nn.ConvTranspose1d(*args, **kwargs)
    return weight_norm(conv)
81
+
82
+
83
def checkpoint(function, *args, **kwargs):
    """Gradient checkpointing, defaulting to the non-reentrant implementation."""
    if "use_reentrant" not in kwargs:
        kwargs["use_reentrant"] = False
    return torch.utils.checkpoint.checkpoint(function, *args, **kwargs)
86
+
87
+
88
def get_activation(
    activation: Literal["elu", "snake", "none"], antialias: bool = False, channels=None
) -> nn.Module:
    """Build an activation module by name ('snake' requires `channels`)."""
    if antialias:
        raise NotImplementedError("antialias activation is not supported in sa_audio")

    factories = {
        "elu": nn.ELU,
        "snake": lambda: SnakeBeta(channels),
        "none": nn.Identity,
    }
    try:
        factory = factories[activation]
    except KeyError:
        raise ValueError(f"Unknown activation {activation}") from None
    return factory()
101
+
102
+
103
class ResidualUnit(nn.Module):
    """Dilated 1-D residual unit: act -> dilated conv(k=7) -> act -> conv(k=1), added to the input.

    NOTE(review): the residual add requires in_channels == out_channels;
    all callers in this file pass equal values — confirm before reuse.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        dilation: int,
        use_snake: bool = False,
        antialias_activation: bool = False,
    ):
        super().__init__()
        # "Same" padding for the dilated kernel-7 convolution.
        padding = (dilation * (7 - 1)) // 2
        self.layers = nn.Sequential(
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=out_channels,
            ),
            WNConv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=7,
                dilation=dilation,
                padding=padding,
            ),
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=out_channels,
            ),
            WNConv1d(in_channels=out_channels, out_channels=out_channels, kernel_size=1),
        )

    def forward(self, x):
        # Checkpoint activations only during training to save memory.
        if self.training:
            y = checkpoint(self.layers, x)
        else:
            y = self.layers(x)
        return y + x
141
+
142
+
143
class EncoderBlock(nn.Module):
    """Encoder stage: three dilated residual units, then a strided downsampling conv."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int,
        use_snake: bool = False,
        antialias_activation: bool = False,
    ):
        super().__init__()
        self.layers = nn.Sequential(
            # Growing dilations (1, 3, 9) widen the receptive field cheaply.
            ResidualUnit(in_channels, in_channels, 1, use_snake=use_snake),
            ResidualUnit(in_channels, in_channels, 3, use_snake=use_snake),
            ResidualUnit(in_channels, in_channels, 9, use_snake=use_snake),
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=in_channels,
            ),
            # Downsample by `stride` with a kernel of 2*stride.
            WNConv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2),
            ),
        )

    def forward(self, x):
        return self.layers(x)
173
+
174
+
175
class DecoderBlock(nn.Module):
    """Decoder stage: upsample by `stride`, then three dilated residual units."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int,
        use_snake: bool = False,
        antialias_activation: bool = False,
        use_nearest_upsample: bool = False,
    ):
        super().__init__()

        # Two upsampling flavours: nearest-neighbour + conv (fewer checkerboard
        # artifacts) or a strided transposed conv (the default).
        if use_nearest_upsample:
            upsample_layer = nn.Sequential(
                nn.Upsample(scale_factor=stride, mode="nearest"),
                WNConv1d(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    kernel_size=2 * stride,
                    stride=1,
                    bias=False,
                    padding="same",
                ),
            )
        else:
            upsample_layer = WNConvTranspose1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2),
            )

        self.layers = nn.Sequential(
            get_activation(
                "snake" if use_snake else "elu",
                antialias=antialias_activation,
                channels=in_channels,
            ),
            upsample_layer,
            # Mirror of EncoderBlock's dilation schedule (1, 3, 9).
            ResidualUnit(out_channels, out_channels, 1, use_snake=use_snake),
            ResidualUnit(out_channels, out_channels, 3, use_snake=use_snake),
            ResidualUnit(out_channels, out_channels, 9, use_snake=use_snake),
        )

    def forward(self, x):
        return self.layers(x)
222
+
223
+
224
class OobleckEncoder(nn.Module):
    """Oobleck convolutional audio encoder: waveform -> latent sequence.

    A kernel-7 stem conv, one strided EncoderBlock per entry of `strides`
    (channel width scaled by `c_mults`), then a final activation and a
    kernel-3 projection down to `latent_dim` channels.

    Fix: `c_mults` / `strides` previously defaulted to shared mutable lists
    (classic Python pitfall); they are now tuples, and the argument is
    copied before use so tuple or list inputs both work and the caller's
    sequence is never aliased.
    """

    def __init__(
        self,
        in_channels: int = 2,
        channels: int = 128,
        latent_dim: int = 32,
        c_mults=(1, 2, 4, 8),
        strides=(2, 4, 8, 8),
        use_snake: bool = False,
        antialias_activation: bool = False,
    ):
        super().__init__()

        # Prepend the stem multiplier; copy so the caller's sequence is untouched.
        c_mults = [1] + list(c_mults)
        depth = len(c_mults)

        layers = [
            WNConv1d(
                in_channels=in_channels,
                out_channels=c_mults[0] * channels,
                kernel_size=7,
                padding=3,
            )
        ]

        for i in range(depth - 1):
            layers.append(
                EncoderBlock(
                    in_channels=c_mults[i] * channels,
                    out_channels=c_mults[i + 1] * channels,
                    stride=strides[i],
                    use_snake=use_snake,
                )
            )

        layers.extend(
            [
                get_activation(
                    "snake" if use_snake else "elu",
                    antialias=antialias_activation,
                    channels=c_mults[-1] * channels,
                ),
                WNConv1d(
                    in_channels=c_mults[-1] * channels,
                    out_channels=latent_dim,
                    kernel_size=3,
                    padding=1,
                ),
            ]
        )

        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)
279
+
280
+
281
class OobleckDecoder(nn.Module):
    """Oobleck convolutional audio decoder: latent sequence -> waveform.

    Mirror of OobleckEncoder: a stem conv from `latent_dim` to the widest
    channel count, upsampling DecoderBlocks in reverse stride order, and a
    final kernel-7 conv to `out_channels`, optionally squashed by tanh.

    Fix: `c_mults` / `strides` previously defaulted to shared mutable lists
    (classic Python pitfall); they are now tuples, and the argument is
    copied before use so tuple or list inputs both work and the caller's
    sequence is never aliased.
    """

    def __init__(
        self,
        out_channels: int = 2,
        channels: int = 128,
        latent_dim: int = 32,
        c_mults=(1, 2, 4, 8),
        strides=(2, 4, 8, 8),
        use_snake: bool = False,
        antialias_activation: bool = False,
        use_nearest_upsample: bool = False,
        final_tanh: bool = True,
    ):
        super().__init__()

        # Prepend the stem multiplier; copy so the caller's sequence is untouched.
        c_mults = [1] + list(c_mults)
        depth = len(c_mults)

        layers = [
            WNConv1d(
                in_channels=latent_dim,
                out_channels=c_mults[-1] * channels,
                kernel_size=7,
                padding=3,
            )
        ]

        # Walk the channel multipliers from widest to narrowest.
        for i in range(depth - 1, 0, -1):
            layers.append(
                DecoderBlock(
                    in_channels=c_mults[i] * channels,
                    out_channels=c_mults[i - 1] * channels,
                    stride=strides[i - 1],
                    use_snake=use_snake,
                    antialias_activation=antialias_activation,
                    use_nearest_upsample=use_nearest_upsample,
                )
            )

        layers.extend(
            [
                get_activation(
                    "snake" if use_snake else "elu",
                    antialias=antialias_activation,
                    channels=c_mults[0] * channels,
                ),
                WNConv1d(
                    in_channels=c_mults[0] * channels,
                    out_channels=out_channels,
                    kernel_size=7,
                    padding=3,
                    bias=False,
                ),
                nn.Tanh() if final_tanh else nn.Identity(),
            ]
        )

        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)
342
+
343
+
344
+ class AudioAutoencoder(nn.Module):
345
+ def __init__(
346
+ self,
347
+ encoder: nn.Module,
348
+ decoder: nn.Module,
349
+ latent_dim: int,
350
+ downsampling_ratio: int,
351
+ sample_rate: int,
352
+ io_channels: int = 2,
353
+ bottleneck: nn.Module | None = None,
354
+ in_channels: int | None = None,
355
+ out_channels: int | None = None,
356
+ soft_clip: bool = False,
357
+ ):
358
+ super().__init__()
359
+ self.downsampling_ratio = downsampling_ratio
360
+ self.sample_rate = sample_rate
361
+ self.latent_dim = latent_dim
362
+ self.io_channels = io_channels
363
+ self.in_channels = in_channels if in_channels is not None else io_channels
364
+ self.out_channels = out_channels if out_channels is not None else io_channels
365
+ self.bottleneck = bottleneck
366
+ self.encoder = encoder
367
+ self.decoder = decoder
368
+ self.soft_clip = soft_clip
369
+
370
+ def encode(self, audio, skip_bottleneck: bool = False, return_info: bool = False, **kwargs):
371
+ info = {}
372
+ latents = self.encoder(audio)
373
+ info["pre_bottleneck_latents"] = latents
374
+
375
+ if self.bottleneck is not None and not skip_bottleneck:
376
+ latents, bottleneck_info = self.bottleneck.encode(latents, return_info=True, **kwargs)
377
+ info.update(bottleneck_info)
378
+
379
+ if return_info:
380
+ return latents, info
381
+ return latents
382
+
383
+ def decode(self, latents, skip_bottleneck: bool = False, **kwargs):
384
+ if self.bottleneck is not None and not skip_bottleneck:
385
+ latents = self.bottleneck.decode(latents)
386
+ decoded = self.decoder(latents, **kwargs)
387
+ if self.soft_clip:
388
+ decoded = torch.tanh(decoded)
389
+ return decoded
390
+
391
+
392
+ # AE factories
393
+
394
def create_encoder_from_config(encoder_config: Dict[str, Any]):
    """Instantiate an OobleckEncoder from its config dict; optionally freeze it."""
    encoder_type = encoder_config.get("type", None)
    assert encoder_type is not None, "Encoder type must be specified"
    if encoder_type != "oobleck":
        raise ValueError(f"Only encoder type 'oobleck' is supported, got: {encoder_type}")

    encoder = OobleckEncoder(**encoder_config["config"])
    if not encoder_config.get("requires_grad", True):
        encoder.requires_grad_(False)
    return encoder
405
+
406
+
407
def create_decoder_from_config(decoder_config: Dict[str, Any]):
    """Instantiate an OobleckDecoder from its config dict; optionally freeze it."""
    decoder_type = decoder_config.get("type", None)
    assert decoder_type is not None, "Decoder type must be specified"
    if decoder_type != "oobleck":
        raise ValueError(f"Only decoder type 'oobleck' is supported, got: {decoder_type}")

    decoder = OobleckDecoder(**decoder_config["config"])
    if not decoder_config.get("requires_grad", True):
        decoder.requires_grad_(False)
    return decoder
418
+
419
+
420
def create_bottleneck_from_config(bottleneck_config: Dict[str, Any]):
    """Instantiate the (only supported) VAE bottleneck; optionally freeze it."""
    bottleneck_type = bottleneck_config.get("type", None)
    assert bottleneck_type is not None, "type must be specified in bottleneck config"

    if bottleneck_type != "vae":
        raise NotImplementedError(
            f"Only bottleneck type 'vae' is supported, got: {bottleneck_type}"
        )

    bottleneck = VAEBottleneck()
    if not bottleneck_config.get("requires_grad", True):
        bottleneck.requires_grad_(False)
    return bottleneck
434
+
435
+
436
def create_autoencoder_from_config(config: Dict[str, Any]):
    """Build an AudioAutoencoder from a full model config.

    `config` must carry a "model" sub-dict (encoder/decoder/bottleneck
    configs plus latent_dim / downsampling_ratio / io_channels) and a
    top-level "sample_rate".
    """
    ae_config = config["model"]

    if ae_config.get("pretransform") is not None:
        raise NotImplementedError("Nested pretransform is not supported in sa_audio")

    encoder = create_encoder_from_config(ae_config["encoder"])
    decoder = create_decoder_from_config(ae_config["decoder"])

    bottleneck_cfg = ae_config.get("bottleneck")
    bottleneck = create_bottleneck_from_config(bottleneck_cfg) if bottleneck_cfg else None

    def _require(mapping, key):
        # Shared check so every missing key yields the same message shape.
        value = mapping.get(key)
        assert value is not None, f"{key} must be specified in model config"
        return value

    latent_dim = _require(ae_config, "latent_dim")
    downsampling_ratio = _require(ae_config, "downsampling_ratio")
    io_channels = _require(ae_config, "io_channels")
    sample_rate = _require(config, "sample_rate")

    return AudioAutoencoder(
        encoder=encoder,
        decoder=decoder,
        latent_dim=latent_dim,
        downsampling_ratio=downsampling_ratio,
        sample_rate=sample_rate,
        io_channels=io_channels,
        bottleneck=bottleneck,
        in_channels=ae_config.get("in_channels"),
        out_channels=ae_config.get("out_channels"),
        soft_clip=ae_config["decoder"].get("soft_clip", False),
    )
469
+
470
+
471
def create_model_from_config(model_config: Dict[str, Any]):
    """Top-level factory; only the 'autoencoder' model_type is supported."""
    model_type = model_config.get("model_type", None)
    assert model_type is not None, "model_type must be specified in model config"

    if model_type == "autoencoder":
        return create_autoencoder_from_config(model_config)
    raise NotImplementedError(f"Only 'autoencoder' is supported, got: {model_type}")
inference/model/t5_gemma/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .t5_gemma_model import get_t5_gemma_embedding
2
+
3
+ __all__ = ["get_t5_gemma_embedding"]
inference/model/t5_gemma/t5_gemma_model.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional
4
+
5
+ import torch
6
+ from transformers import AutoTokenizer
7
+ from transformers.models.t5gemma import T5GemmaEncoderModel
8
+
9
+ from inference.common import CPUOffloadWrapper, get_arch_memory
10
+ from inference.utils import env_is_true
11
+
12
+
13
class T5GemmaEncoder:
    """Thin wrapper around a T5Gemma encoder used for prompt embedding.

    Loads the tokenizer and encoder weights from `model_path`, moves the
    model to `device`, and optionally offloads it to CPU between calls.
    """

    def __init__(self, model_path: str, device: str, weight_dtype: torch.dtype):
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # Encoder-only use: disable the decoder half of the T5Gemma checkpoint.
        model = T5GemmaEncoderModel.from_pretrained(
            model_path,
            is_encoder_decoder=False,
            dtype=weight_dtype,
        ).to(device)
        # Offload when explicitly requested via env, or when the accelerator
        # is small (<= 48, presumably GiB as reported by get_arch_memory —
        # TODO confirm unit).
        self.model = CPUOffloadWrapper(model, is_cpu_offload=env_is_true("CPU_OFFLOAD") or get_arch_memory() <= 48)

    def encode(self, prompt: str) -> torch.Tensor:
        """Tokenize a single prompt and return the encoder's last hidden state.

        NOTE(review): the output is always cast to float16 regardless of
        `weight_dtype` — confirm downstream consumers expect half precision.
        """
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
        outputs = self.model(**inputs)
        return outputs["last_hidden_state"].half()
28
+
29
+
30
+ _t5_gemma_cache: Optional[T5GemmaEncoder] = None
31
+
32
+
33
def get_t5_gemma_encoder(model_path: str, device: str, weight_dtype: torch.dtype) -> T5GemmaEncoder:
    """Return a process-wide cached T5Gemma encoder.

    Fix: the previous cache ignored its arguments, so a second call with a
    different model_path/device/dtype silently returned the first encoder.
    The cached instance is now rebuilt whenever any argument changes.
    """
    global _t5_gemma_cache
    cache_key = (model_path, str(device), weight_dtype)
    if _t5_gemma_cache is None or getattr(_t5_gemma_cache, "_cache_key", None) != cache_key:
        _t5_gemma_cache = T5GemmaEncoder(model_path=model_path, device=device, weight_dtype=weight_dtype)
        # Remember which configuration this instance was built for.
        _t5_gemma_cache._cache_key = cache_key
    return _t5_gemma_cache
38
+
39
+
40
@torch.inference_mode()
def get_t5_gemma_embedding(prompt: str, model_path: str, device: str, weight_dtype: torch.dtype) -> torch.Tensor:
    """Embed `prompt` with the cached T5Gemma encoder under inference mode."""
    encoder = get_t5_gemma_encoder(model_path=model_path, device=device, weight_dtype=weight_dtype)
    return encoder.encode(prompt)
inference/model/turbo_vaed/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .turbo_vaed_module import TurboVAED
2
+ from .turbo_vaed_model import get_turbo_vaed
3
+
4
+ __all__ = ["TurboVAED", "get_turbo_vaed"]
inference/model/turbo_vaed/turbo_vaed_model.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import torch
3
+
4
+ from .turbo_vaed_module import TurboVAED
5
+
6
+
7
+ def get_turbo_vaed(config_path, ckpt_path, device="cuda", weight_dtype=torch.float32) -> TurboVAED:
8
+ with open(config_path, "r", encoding="utf-8") as f:
9
+ config = json.load(f)
10
+ student = TurboVAED.from_config(config)
11
+
12
+ ckpt = torch.load(ckpt_path, map_location="cpu")
13
+ assert "ema_state_dict" in ckpt, "ckpt must contain ema_state_dict or state_dict"
14
+
15
+ state_dict = ckpt["ema_state_dict"]
16
+ new_state_dict = {}
17
+ for key, value in state_dict.items():
18
+ if key.startswith("module."):
19
+ new_state_dict[key[7:]] = value
20
+ else:
21
+ new_state_dict[key] = value
22
+ state_dict = new_state_dict
23
+
24
+ missing, _ = student.load_state_dict(state_dict, strict=False)
25
+ if len(missing) > 0:
26
+ sample_key = next(iter(state_dict.keys()))
27
+ if not sample_key.startswith("decoder.") and not sample_key.startswith("encoder."):
28
+ student.decoder.load_state_dict(state_dict, strict=False)
29
+
30
+ student = student.to(device, dtype=weight_dtype)
31
+ student.eval()
32
+ student.requires_grad_(False)
33
+ return student
inference/model/turbo_vaed/turbo_vaed_module.py ADDED
@@ -0,0 +1,1039 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List, Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
20
+ from diffusers.models.modeling_utils import ModelMixin
21
+ from einops import rearrange
22
+
23
+
24
+ __all__ = ["TurboVAED"]
25
+
26
+ ACT2CLS = {"swish": nn.SiLU, "silu": nn.SiLU, "mish": nn.Mish, "gelu": nn.GELU, "relu": nn.ReLU}
27
+
28
+
29
+ def get_activation(act_fn: str) -> nn.Module:
30
+ act_fn = act_fn.lower()
31
+ if act_fn in ACT2CLS:
32
+ return ACT2CLS[act_fn]()
33
+ else:
34
+ raise ValueError(f"activation function {act_fn} not found in ACT2FN mapping {list(ACT2CLS.keys())}")
35
+
36
+
37
+ def unpatchify(x, patch_size):
38
+ """
39
+ Unpatchify operation: convert patched representation back to original spatial resolution.
40
+ Similar to Wan VAE's unpatchify.
41
+
42
+ Args:
43
+ x: Input tensor with shape [batch_size, (channels * patch_size * patch_size), frame, height, width]
44
+ patch_size: The patch size used during patchification
45
+
46
+ Returns:
47
+ Tensor with shape [batch_size, channels, frame, height * patch_size, width * patch_size]
48
+ """
49
+ if patch_size == 1:
50
+ return x
51
+
52
+ if x.dim() != 5:
53
+ raise ValueError(f"Invalid input shape: {x.shape}")
54
+
55
+ # x shape: [batch_size, (channels * patch_size * patch_size), frame, height, width]
56
+ batch_size, c_patches, frames, height, width = x.shape
57
+ channels = c_patches // (patch_size * patch_size)
58
+
59
+ # Reshape to [b, c, patch_size, patch_size, f, h, w]
60
+ x = x.view(batch_size, channels, patch_size, patch_size, frames, height, width)
61
+
62
+ # Rearrange to [b, c, f, h * patch_size, w * patch_size]
63
+ x = x.permute(0, 1, 4, 5, 3, 6, 2).contiguous()
64
+ x = x.view(batch_size, channels, frames, height * patch_size, width * patch_size)
65
+
66
+ return x
67
+
68
+
69
+ class RMSNorm(nn.Module):
70
+ r"""
71
+ RMS Norm as introduced in https://huggingface.co/papers/1910.07467 by Zhang et al.
72
+
73
+ Args:
74
+ dim (`int`): Number of dimensions to use for `weights`. Only effective when `elementwise_affine` is True.
75
+ eps (`float`): Small value to use when calculating the reciprocal of the square-root.
76
+ elementwise_affine (`bool`, defaults to `True`):
77
+ Boolean flag to denote if affine transformation should be applied.
78
+ bias (`bool`, defaults to False): If also training the `bias` param.
79
+ """
80
+
81
+ def __init__(self, dim, eps: float, elementwise_affine: bool = True, bias: bool = False):
82
+ super().__init__()
83
+
84
+ self.eps = eps
85
+ self.elementwise_affine = elementwise_affine
86
+
87
+ if isinstance(dim, int):
88
+ dim = (dim,)
89
+
90
+ self.dim = torch.Size(dim)
91
+
92
+ self.weight = None
93
+
94
+ if elementwise_affine:
95
+ self.weight = nn.Parameter(torch.ones(dim))
96
+
97
+ def forward(self, hidden_states):
98
+ input_dtype = hidden_states.dtype
99
+ variance = hidden_states.to(torch.float32).pow(2).mean(1, keepdim=True)
100
+ hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
101
+
102
+ if self.weight is not None:
103
+ # convert into half-precision if necessary
104
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
105
+ hidden_states = hidden_states.to(self.weight.dtype)
106
+ hidden_states = hidden_states * self.weight
107
+ else:
108
+ hidden_states = hidden_states.to(input_dtype)
109
+
110
+ return hidden_states
111
+
112
+
113
+ class TurboVAEDConv2dSplitUpsampler(nn.Module):
114
+ def __init__(
115
+ self,
116
+ in_channels: int,
117
+ kernel_size: Union[int, Tuple[int, int]] = 3,
118
+ stride: Union[int, Tuple[int, int]] = 1,
119
+ upscale_factor: int = 1,
120
+ padding_mode: str = "zeros",
121
+ ):
122
+ super().__init__()
123
+
124
+ self.in_channels = in_channels
125
+ self.stride = stride if isinstance(stride, tuple) else (stride, stride)
126
+ self.upscale_factor = upscale_factor
127
+
128
+ out_channels = in_channels
129
+
130
+ self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
131
+
132
+ height_pad = self.kernel_size[0] // 2
133
+ width_pad = self.kernel_size[1] // 2
134
+ padding = (height_pad, width_pad)
135
+
136
+ self.conv = nn.Conv2d(
137
+ in_channels=in_channels,
138
+ out_channels=out_channels,
139
+ kernel_size=self.kernel_size,
140
+ stride=1,
141
+ padding=padding,
142
+ padding_mode=padding_mode,
143
+ )
144
+
145
+ @torch.compile
146
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
147
+ hidden_states = self.conv(hidden_states)
148
+ hidden_states = torch.nn.functional.pixel_shuffle(hidden_states, self.stride[0])
149
+
150
+ return hidden_states
151
+
152
+
153
+ class TurboVAEDCausalConv3d(nn.Module):
154
+ def __init__(
155
+ self,
156
+ in_channels: int,
157
+ out_channels: int,
158
+ kernel_size: Union[int, Tuple[int, int, int]] = 3,
159
+ stride: Union[int, Tuple[int, int, int]] = 1,
160
+ dilation: Union[int, Tuple[int, int, int]] = 1,
161
+ groups: int = 1,
162
+ padding_mode: str = "zeros",
163
+ is_causal: bool = False,
164
+ ):
165
+ super().__init__()
166
+
167
+ assert is_causal == False
168
+ self.in_channels = in_channels
169
+ self.out_channels = out_channels
170
+ self.is_causal = is_causal
171
+ self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size, kernel_size)
172
+
173
+ dilation = dilation if isinstance(dilation, tuple) else (dilation, 1, 1)
174
+ stride = stride if isinstance(stride, tuple) else (stride, stride, stride)
175
+ height_pad = self.kernel_size[1] // 2
176
+ width_pad = self.kernel_size[2] // 2
177
+ padding = (0, height_pad, width_pad)
178
+
179
+ self.conv = nn.Conv3d(
180
+ in_channels,
181
+ out_channels,
182
+ self.kernel_size,
183
+ stride=stride,
184
+ dilation=dilation,
185
+ groups=groups,
186
+ padding=padding,
187
+ padding_mode=padding_mode,
188
+ )
189
+
190
+ @torch.compile
191
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
192
+ time_kernel_size = self.kernel_size[0]
193
+
194
+ if time_kernel_size > 1:
195
+ pad_left = hidden_states[:, :, :1, :, :].repeat((1, 1, (time_kernel_size - 1) // 2, 1, 1))
196
+ pad_right = hidden_states[:, :, -1:, :, :].repeat((1, 1, (time_kernel_size - 1) // 2, 1, 1))
197
+ hidden_states = torch.cat([pad_left, hidden_states, pad_right], dim=2)
198
+
199
+ hidden_states = self.conv(hidden_states)
200
+ return hidden_states
201
+
202
+
203
+ class TurboVAEDCausalDepthwiseSeperableConv3d(nn.Module):
204
+ def __init__(
205
+ self,
206
+ in_channels: int,
207
+ out_channels: int,
208
+ kernel_size: Union[int, Tuple[int, int, int]] = 3,
209
+ stride: Union[int, Tuple[int, int, int]] = 1,
210
+ dilation: Union[int, Tuple[int, int, int]] = 1,
211
+ padding_mode: str = "zeros",
212
+ is_causal: bool = True,
213
+ ):
214
+ super().__init__()
215
+
216
+ self.in_channels = in_channels
217
+ self.out_channels = out_channels
218
+ self.is_causal = is_causal
219
+ self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size, kernel_size)
220
+ self.stride = stride if isinstance(stride, tuple) else (stride, stride, stride)
221
+ self.dilation = dilation if isinstance(dilation, tuple) else (dilation, 1, 1)
222
+
223
+ # Calculate padding for height and width dimensions
224
+ height_pad = self.kernel_size[1] // 2
225
+ width_pad = self.kernel_size[2] // 2
226
+ self.padding = (0, height_pad, width_pad)
227
+
228
+ # Depthwise Convolution
229
+ self.depthwise_conv = nn.Conv3d(
230
+ in_channels,
231
+ in_channels,
232
+ self.kernel_size,
233
+ stride=self.stride,
234
+ dilation=self.dilation,
235
+ groups=in_channels, # Each input channel is convolved separately
236
+ padding=self.padding,
237
+ padding_mode=padding_mode,
238
+ )
239
+
240
+ # Pointwise Convolution
241
+ self.pointwise_conv = nn.Conv3d(in_channels, out_channels, kernel_size=1) # 1x1x1 convolution to mix channels
242
+
243
+ @torch.compile
244
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
245
+ time_kernel_size = self.kernel_size[0]
246
+ if time_kernel_size > 1:
247
+ pad_count = (time_kernel_size - 1) // 2
248
+ pad_left = hidden_states[:, :, :1, :, :].repeat((1, 1, pad_count, 1, 1))
249
+ pad_right = hidden_states[:, :, -1:, :, :].repeat((1, 1, pad_count, 1, 1))
250
+ hidden_states = torch.cat([pad_left, hidden_states, pad_right], dim=2)
251
+
252
+ # Apply depthwise convolution
253
+ hidden_states = self.depthwise_conv(hidden_states)
254
+ # Apply pointwise convolution
255
+ hidden_states = self.pointwise_conv(hidden_states)
256
+
257
+ return hidden_states
258
+
259
+
260
+ class TurboVAEDResnetBlock3d(nn.Module):
261
+ r"""
262
+ A 3D ResNet block used in the TurboVAED model.
263
+
264
+ Args:
265
+ in_channels (`int`):
266
+ Number of input channels.
267
+ out_channels (`int`, *optional*):
268
+ Number of output channels. If None, defaults to `in_channels`.
269
+ dropout (`float`, defaults to `0.0`):
270
+ Dropout rate.
271
+ eps (`float`, defaults to `1e-6`):
272
+ Epsilon value for normalization layers.
273
+ elementwise_affine (`bool`, defaults to `False`):
274
+ Whether to enable elementwise affinity in the normalization layers.
275
+ non_linearity (`str`, defaults to `"swish"`):
276
+ Activation function to use.
277
+ conv_shortcut (bool, defaults to `False`):
278
+ Whether or not to use a convolution shortcut.
279
+ """
280
+
281
+ def __init__(
282
+ self,
283
+ in_channels: int,
284
+ out_channels: Optional[int] = None,
285
+ dropout: float = 0.0,
286
+ eps: float = 1e-6,
287
+ elementwise_affine: bool = False,
288
+ non_linearity: str = "swish",
289
+ is_causal: bool = True,
290
+ is_upsampler_modified: bool = False,
291
+ is_dw_conv: bool = False,
292
+ dw_kernel_size: int = 3,
293
+ ) -> None:
294
+ super().__init__()
295
+
296
+ out_channels = out_channels or in_channels
297
+
298
+ self.nonlinearity = get_activation(non_linearity)
299
+
300
+ self.conv_operation = TurboVAEDCausalConv3d if not is_dw_conv else TurboVAEDCausalDepthwiseSeperableConv3d
301
+ self.kernel_size = 3 if not is_dw_conv else dw_kernel_size
302
+
303
+ self.is_upsampler_modified = is_upsampler_modified
304
+ self.replace_nonlinearity = get_activation("relu")
305
+
306
+ self.norm1 = RMSNorm(in_channels, eps=1e-8, elementwise_affine=elementwise_affine)
307
+ self.conv1 = self.conv_operation(
308
+ in_channels=in_channels, out_channels=out_channels, kernel_size=self.kernel_size, is_causal=is_causal
309
+ )
310
+
311
+ self.norm2 = RMSNorm(out_channels, eps=1e-8, elementwise_affine=elementwise_affine)
312
+ self.dropout = nn.Dropout(dropout)
313
+ self.conv2 = self.conv_operation(
314
+ in_channels=out_channels, out_channels=out_channels, kernel_size=self.kernel_size, is_causal=is_causal
315
+ )
316
+
317
+ self.norm3 = None
318
+ self.conv_shortcut = None
319
+ if in_channels != out_channels:
320
+ self.norm3 = RMSNorm(in_channels, eps=eps, elementwise_affine=elementwise_affine)
321
+ self.conv_shortcut = self.conv_operation(
322
+ in_channels=in_channels, out_channels=out_channels, kernel_size=1, stride=1, is_causal=is_causal
323
+ )
324
+
325
+ @torch.compile
326
+ def forward(self, inputs: torch.Tensor) -> torch.Tensor:
327
+ hidden_states = inputs
328
+
329
+ hidden_states = self.norm1(hidden_states)
330
+
331
+ if self.is_upsampler_modified:
332
+ hidden_states = self.replace_nonlinearity(hidden_states)
333
+ else:
334
+ hidden_states = self.nonlinearity(hidden_states)
335
+
336
+ hidden_states = self.conv1(hidden_states)
337
+
338
+ hidden_states = self.norm2(hidden_states)
339
+
340
+ hidden_states = self.nonlinearity(hidden_states)
341
+ hidden_states = self.dropout(hidden_states)
342
+
343
+ hidden_states = self.conv2(hidden_states)
344
+
345
+ if self.norm3 is not None:
346
+ inputs = self.norm3(inputs)
347
+
348
+ if self.conv_shortcut is not None:
349
+ inputs = self.conv_shortcut(inputs)
350
+
351
+ hidden_states = hidden_states + inputs
352
+ return hidden_states
353
+
354
+
355
class TurboVAEDUpsampler3d(nn.Module):
    r"""
    Spatio-temporal pixel-shuffle upsampler.

    A causal 3D convolution expands the channel dimension, then the extra
    channels are rearranged (depth-to-space style) into the temporal and
    spatial dimensions, upscaling by ``stride`` along (T, H, W).

    Args:
        in_channels (`int`):
            Number of input channels.
        stride (`int` or `Tuple[int, int, int]`, defaults to `1`):
            Upscaling factor per (temporal, height, width) dimension; an int
            is broadcast to all three dimensions.
        is_causal (`bool`, defaults to `True`):
            Whether the convolution is temporally causal.
        upscale_factor (`int`, defaults to `1`):
            Extra channel-reduction factor applied to the conv's output width.
        padding_mode (`str`, defaults to `"zeros"`):
            Padding mode for the convolution.
    """

    def __init__(
        self,
        in_channels: int,
        stride: Union[int, Tuple[int, int, int]] = 1,
        is_causal: bool = True,
        upscale_factor: int = 1,
        padding_mode: str = "zeros",
    ) -> None:
        super().__init__()

        # Normalize `stride` to a (t, h, w) tuple before it is indexed below.
        self.stride = stride if isinstance(stride, tuple) else (stride, stride, stride)
        self.upscale_factor = upscale_factor

        # FIX: index the normalized `self.stride` tuple. The original code
        # indexed the raw `stride` argument, which raises TypeError whenever
        # the caller passes a plain int (including the default `stride=1`).
        out_channels = (in_channels * self.stride[0] * self.stride[1] * self.stride[2]) // upscale_factor

        self.conv = TurboVAEDCausalConv3d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=1,
            is_causal=is_causal,
            padding_mode=padding_mode,
        )

    @torch.compile
    def forward(self, hidden_states: torch.Tensor, is_first_chunk: bool = True) -> torch.Tensor:
        """Upsample `hidden_states` of shape (B, C, T, H, W) by `self.stride`."""
        batch_size, num_channels, num_frames, height, width = hidden_states.shape

        hidden_states = self.conv(hidden_states)

        # Depth-to-space: fold the expanded channels into (T, H, W).
        # reshape + permute is used rather than einops
        # because the former has better performance on cuda kernels.
        s_t, s_h, s_w = self.stride
        hidden_states = hidden_states.reshape(batch_size, -1, s_t, s_h, s_w, num_frames, height, width)
        hidden_states = hidden_states.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
        hidden_states = hidden_states.reshape(batch_size, -1, num_frames * s_t, height * s_h, width * s_w)

        # slice the first chunk
        if is_first_chunk:
            hidden_states = hidden_states[:, :, self.stride[0] - 1 :]  # NOTE: extra handling for the first frame

        return hidden_states
397
+
398
+
399
class WanUpsample(nn.Upsample):
    r"""
    `nn.Upsample` variant that runs interpolation in float32 and casts the
    result back to the input's dtype.

    Args:
        x (torch.Tensor): Input tensor to be upsampled.

    Returns:
        torch.Tensor: Upsampled tensor with the same data type as the input.
    """

    def forward(self, x):
        # Interpolate in full precision, then restore the caller's dtype.
        upsampled = super().forward(x.float())
        return upsampled.type_as(x)
412
+
413
+
414
class WanResample(nn.Module):
    r"""
    A custom resampling module for 2D and 3D data.

    Args:
        dim (int): The number of input/output channels.
        mode (str): The resampling mode. Must be one of:
            - 'none': No resampling (identity operation).
            - 'upsample2d': 2D upsampling with nearest-exact interpolation and convolution.
            - 'upsample3d': 3D upsampling with nearest-exact interpolation, convolution, and causal 3D convolution.
        upsample_out_dim (int, optional): Output channels of the spatial
            upsampling convolution. Defaults to `dim // 2` when not given.
    """

    def __init__(self, dim: int, mode: str, upsample_out_dim: int = None) -> None:
        super().__init__()
        self.dim = dim
        self.mode = mode

        # default to dim //2
        if upsample_out_dim is None:
            upsample_out_dim = dim // 2

        # layers
        if mode == "upsample2d":
            self.resample = nn.Sequential(
                WanUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, upsample_out_dim, 3, padding=1)
            )
        elif mode == "upsample3d":
            # Same spatial path as 'upsample2d', plus a causal temporal conv
            # that doubles the channel count (split into 2 time steps in forward).
            self.resample = nn.Sequential(
                WanUpsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, upsample_out_dim, 3, padding=1)
            )
            self.time_conv = TurboVAEDCausalConv3d(dim, dim * 2, (3, 1, 1))
        else:
            self.resample = nn.Identity()

    def forward(self, x, is_first_chunk: bool = True):
        # x is expected to be (B, C, T, H, W).
        b, c, t, h, w = x.shape
        if self.mode == "upsample3d":
            # Temporal 2x: time_conv doubles channels, then the channel pairs
            # are interleaved into the time dimension.
            x = self.time_conv(x)
            x = rearrange(x, 'b (n_split c) t h w -> b c (t n_split) h w', n_split=2)
            assert x.shape == (b, c, t * 2, h, w), "x.shape: {}, expected: {}".format(x.shape, (b, c, t * 2, h, w))
            # The causal conv's padding yields one synthetic leading frame on
            # the first chunk; drop it.
            if is_first_chunk:
                x = x[:, :, 1:]

        # Spatial 2x: fold time into the batch dim and run the 2D resampler.
        x = rearrange(x, "b c t h w -> (b t) c h w")
        x = self.resample(x)
        x = rearrange(x, "(b t) c h w -> b c t h w", b=b)

        return x
462
+
463
+
464
class TurboVAEDMidBlock3d(nn.Module):
    r"""
    A middle block used in the TurboVAED model.

    Stacks `num_layers` residual blocks at a constant channel width.

    Args:
        in_channels (`int`):
            Number of input channels.
        num_layers (`int`, defaults to `1`):
            Number of resnet layers.
        dropout (`float`, defaults to `0.0`):
            Dropout rate.
        resnet_eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
        resnet_act_fn (`str`, defaults to `"swish"`):
            Activation function to use.
        is_causal (`bool`, defaults to `True`):
            Whether this layer behaves causally (future frames depend only on past frames) or not.
        is_dw_conv (`bool`, defaults to `False`):
            Whether the resnet blocks use depthwise-separable convolutions.
        dw_kernel_size (`int`, defaults to `3`):
            Kernel size for depthwise-separable convolutions.
    """

    _supports_gradient_checkpointing = True

    def __init__(
        self,
        in_channels: int,
        num_layers: int = 1,
        dropout: float = 0.0,
        resnet_eps: float = 1e-6,
        resnet_act_fn: str = "swish",
        is_causal: bool = True,
        is_dw_conv: bool = False,
        dw_kernel_size: int = 3,
    ) -> None:
        super().__init__()

        # Build the constant-width residual stack in one pass.
        self.resnets = nn.ModuleList(
            [
                TurboVAEDResnetBlock3d(
                    in_channels=in_channels,
                    out_channels=in_channels,
                    dropout=dropout,
                    eps=resnet_eps,
                    non_linearity=resnet_act_fn,
                    is_causal=is_causal,
                    is_dw_conv=is_dw_conv,
                    dw_kernel_size=dw_kernel_size,
                )
                for _ in range(num_layers)
            ]
        )

        self.gradient_checkpointing = False

    @torch.compile
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        r"""Run the input through every residual block in order."""
        use_checkpoint = torch.is_grad_enabled() and self.gradient_checkpointing
        for resnet in self.resnets:
            if use_checkpoint:
                # Trade compute for memory during training.
                hidden_states = self._gradient_checkpointing_func(resnet, hidden_states)
            else:
                hidden_states = resnet(hidden_states)

        return hidden_states
527
+
528
+
529
class TurboVAEDUpBlock3d(nn.Module):
    r"""
    Up block used in the TurboVAED model.

    Structure: an optional channel-changing resnet (`conv_in`), an optional 2x
    upsampler, then a stack of constant-width resnet blocks.

    Args:
        in_channels (`int`):
            Number of input channels.
        out_channels (`int`, *optional*):
            Number of output channels. If None, defaults to `in_channels`.
        num_layers (`int`, defaults to `1`):
            Number of resnet layers.
        dropout (`float`, defaults to `0.0`):
            Dropout rate.
        resnet_eps (`float`, defaults to `1e-6`):
            Epsilon value for normalization layers.
        resnet_act_fn (`str`, defaults to `"swish"`):
            Activation function to use.
        spatio_temporal_scale (`bool`, defaults to `True`):
            Whether or not to add an upsampling layer. If not used, output dimension
            would be same as input dimension.
        is_causal (`bool`, defaults to `True`):
            Whether this layer behaves causally (future frames depend only on past frames) or not.
        is_dw_conv (`bool`, defaults to `False`):
            Whether resnet blocks use depthwise-separable convolutions.
        dw_kernel_size (`int`, defaults to `3`):
            Kernel size for depthwise-separable convolutions.
        spatio_only (`bool`, defaults to `False`):
            If True the upsampler scales spatially only ('upsample2d');
            otherwise spatio-temporally ('upsample3d').
    """

    _supports_gradient_checkpointing = True

    def __init__(
        self,
        in_channels: int,
        out_channels: Optional[int] = None,
        num_layers: int = 1,
        dropout: float = 0.0,
        resnet_eps: float = 1e-6,
        resnet_act_fn: str = "swish",
        spatio_temporal_scale: bool = True,
        is_causal: bool = True,
        is_dw_conv: bool = False,
        dw_kernel_size: int = 3,
        spatio_only: bool = False,
    ):
        super().__init__()

        out_channels = out_channels or in_channels

        # Channel-projection resnet, only needed when channel counts differ.
        self.conv_in = None
        if in_channels != out_channels:
            self.conv_in = TurboVAEDResnetBlock3d(
                in_channels=in_channels,
                out_channels=out_channels,
                dropout=dropout,
                eps=resnet_eps,
                non_linearity=resnet_act_fn,
                is_causal=is_causal,
                is_dw_conv=is_dw_conv,
                dw_kernel_size=dw_kernel_size,
            )

        self.upsamplers = None
        if spatio_temporal_scale:
            self.upsamplers = nn.ModuleList(
                [
                    WanResample(
                        dim=out_channels, mode="upsample2d" if spatio_only else "upsample3d", upsample_out_dim=out_channels
                    )
                ]
            )

        resnets = []
        for _ in range(num_layers):
            resnets.append(
                TurboVAEDResnetBlock3d(
                    in_channels=out_channels,
                    out_channels=out_channels,
                    dropout=dropout,
                    eps=resnet_eps,
                    non_linearity=resnet_act_fn,
                    is_causal=is_causal,
                    is_dw_conv=is_dw_conv,
                    dw_kernel_size=dw_kernel_size,
                    is_upsampler_modified=(spatio_temporal_scale),
                )
            )
        self.resnets = nn.ModuleList(resnets)

        self.gradient_checkpointing = False

    @torch.compile
    def forward(self, hidden_states: torch.Tensor, is_first_chunk: bool) -> torch.Tensor:
        r"""Project channels (if needed), upsample (if configured), then run the resnet stack."""
        if self.conv_in is not None:
            hidden_states = self.conv_in(hidden_states)

        if self.upsamplers is not None:
            for upsampler in self.upsamplers:
                # `is_first_chunk` lets the upsampler trim the synthetic
                # leading frame produced by causal temporal padding.
                hidden_states = upsampler(hidden_states, is_first_chunk=is_first_chunk)

        for i, resnet in enumerate(self.resnets):
            if torch.is_grad_enabled() and self.gradient_checkpointing:
                hidden_states = self._gradient_checkpointing_func(resnet, hidden_states)
            else:
                hidden_states = resnet(hidden_states)

        return hidden_states
631
+
632
+
633
class TurboVAEDDecoder3d(nn.Module):
    r"""
    The `TurboVAEDDecoder3d` layer of a variational autoencoder that decodes its latent representation into an output
    sample.

    Pipeline: conv_in -> mid block -> up blocks (coarse-to-fine) -> final
    upsampling (either an explicit 2D split-upsampler or channel-unpatchify)
    -> conv_out.

    Args:
        in_channels (`int`, defaults to 128):
            Number of latent channels.
        out_channels (`int`, defaults to 3):
            Number of output channels.
        block_out_channels (`Tuple[int, ...]`, defaults to `(128, 256, 512, 512)`):
            The number of output channels for each block.
        spatio_temporal_scaling (`Tuple[bool, ...], defaults to `(True, True, True, False)`:
            Whether a block should contain spatio-temporal upscaling layers or not.
        layers_per_block (`Tuple[int, ...]`, defaults to `(4, 3, 3, 3, 4)`):
            The number of layers per block (entry 0 is the mid block, the rest are up blocks).
        patch_size (`int`, defaults to `4`):
            The size of spatial patches.
            NOTE(review): `__init__` asserts `patch_size == 2`, so the default
            of 4 cannot actually be used — confirm the intended default.
        patch_size_t (`int`, defaults to `1`):
            The size of temporal patches.
        resnet_norm_eps (`float`, defaults to `1e-6`):
            Epsilon value for ResNet normalization layers.
        is_causal (`bool`, defaults to `False`):
            Whether this layer behaves causally (future frames depend only on past frames) or not.
        decoder_is_dw_conv (`Tuple[bool, ...]`, defaults to all-False):
            Per-block depthwise-separable-conv flags (index 0 is the mid block).
        decoder_dw_kernel_size (`int`, defaults to `3`):
            Kernel size for depthwise-separable convolutions.
        spatio_only (`Tuple[bool, ...]`, defaults to all-False):
            Per-up-block flag: upsample spatially only instead of spatio-temporally.
        upsampling (`bool`, defaults to `False`):
            Stored but not read inside this class — presumably consumed by
            callers; TODO confirm.
        use_unpatchify (`bool`, defaults to `False`):
            If True, `conv_out` emits `out_channels * patch_size**2` channels
            and `unpatchify` recovers the spatial resolution; otherwise an
            explicit split-upsampler + manual RMSNorm path is used.
    """

    def __init__(
        self,
        in_channels: int = 128,
        out_channels: int = 3,
        block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
        spatio_temporal_scaling: Tuple[bool, ...] = (True, True, True, False),
        layers_per_block: Tuple[int, ...] = (4, 3, 3, 3, 4),
        patch_size: int = 4,
        patch_size_t: int = 1,
        resnet_norm_eps: float = 1e-6,
        is_causal: bool = False,
        decoder_is_dw_conv: Tuple[bool, ...] = (False, False, False, False, False),
        decoder_dw_kernel_size: int = 3,
        spatio_only: Tuple[bool, ...] = (False, False, False, False),
        upsampling: bool = False,
        use_unpatchify: bool = False,
    ) -> None:
        super().__init__()

        self.patch_size = patch_size
        self.patch_size_t = patch_size_t
        self.out_channels = out_channels

        self.upsampling = upsampling
        self.use_unpatchify = use_unpatchify

        # The config lists blocks encoder-side (fine-to-coarse); the decoder
        # consumes them coarse-to-fine, hence the reversals.
        block_out_channels = tuple(reversed(block_out_channels))
        spatio_temporal_scaling = tuple(reversed(spatio_temporal_scaling))
        layers_per_block = tuple(reversed(layers_per_block))
        decoder_is_dw_conv = tuple(reversed(decoder_is_dw_conv))
        spatio_only = tuple(reversed(spatio_only))
        output_channel = block_out_channels[0]

        self.conv_in = TurboVAEDCausalConv3d(
            in_channels=in_channels, out_channels=output_channel, kernel_size=3, stride=1, is_causal=is_causal
        )

        self.mid_block = TurboVAEDMidBlock3d(
            in_channels=output_channel,
            num_layers=layers_per_block[0],
            resnet_eps=resnet_norm_eps,
            is_causal=is_causal,
            is_dw_conv=decoder_is_dw_conv[0],
            dw_kernel_size=decoder_dw_kernel_size,
        )

        # up blocks
        num_block_out_channels = len(block_out_channels)
        self.up_blocks = nn.ModuleList([])
        for i in range(num_block_out_channels):
            input_channel = output_channel
            output_channel = block_out_channels[i]

            up_block = TurboVAEDUpBlock3d(
                in_channels=input_channel,
                out_channels=output_channel,
                num_layers=layers_per_block[i + 1],
                resnet_eps=resnet_norm_eps,
                spatio_temporal_scale=spatio_temporal_scaling[i],
                is_causal=is_causal,
                is_dw_conv=decoder_is_dw_conv[i + 1],
                dw_kernel_size=decoder_dw_kernel_size,
                spatio_only=spatio_only[i],
            )

            self.up_blocks.append(up_block)

        # out
        assert self.patch_size == 2
        if not self.use_unpatchify:
            # Explicit final 2x spatial upsampling path.
            self.norm_up_1 = RMSNorm(output_channel, eps=1e-8, elementwise_affine=False)
            self.upsampler2d_1 = TurboVAEDConv2dSplitUpsampler(in_channels=output_channel, kernel_size=3, stride=(2, 2))
            output_channel = output_channel // (2 * 2)

        self.conv_act = nn.SiLU()

        # When use_unpatchify=True, conv_out outputs more channels (out_channels * patch_size^2)
        # and unpatchify will recover the spatial resolution
        conv_out_channels = self.out_channels
        if self.use_unpatchify and self.patch_size >= 2:
            conv_out_channels = self.out_channels * self.patch_size * self.patch_size

        self.conv_out = TurboVAEDCausalConv3d(
            in_channels=output_channel, out_channels=conv_out_channels, kernel_size=3, stride=1, is_causal=is_causal
        )

        self.gradient_checkpointing = False

    @torch.compile
    def forward(self, hidden_states: torch.Tensor, is_first_chunk: bool) -> torch.Tensor:
        r"""Decode latents (B, C, T, H, W) into pixel space for one temporal chunk."""
        hidden_states = self.conv_in(hidden_states)

        if torch.is_grad_enabled() and self.gradient_checkpointing:

            def create_custom_forward(module):
                def create_forward(*inputs):
                    return module(*inputs)

                return create_forward

            hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), hidden_states)

            for up_block in self.up_blocks:
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(up_block), hidden_states, is_first_chunk
                )
        else:
            hidden_states = self.mid_block(hidden_states)

            for index, up_block in enumerate(self.up_blocks):
                hidden_states = up_block(hidden_states, is_first_chunk=is_first_chunk)

        if not self.use_unpatchify:
            hidden_states = self.norm_up_1(hidden_states)
            hidden_states = self.conv_act(hidden_states)

            # Apply the 2D upsampler frame by frame, then restack along time.
            hidden_states_array = []
            for t in range(hidden_states.shape[2]):
                h = self.upsampler2d_1(hidden_states[:, :, t, :, :])
                hidden_states_array.append(h)
            hidden_states = torch.stack(hidden_states_array, dim=2)

            # RMSNorm (inlined, no affine parameters), computed in float32.
            input_dtype = hidden_states.dtype
            variance = hidden_states.to(torch.float32).pow(2).mean(1, keepdim=True)
            hidden_states = hidden_states * torch.rsqrt(variance + 1e-8)
            hidden_states = hidden_states.to(input_dtype)

        hidden_states = self.conv_act(hidden_states)

        hidden_states = self.conv_out(hidden_states)

        if self.use_unpatchify:
            hidden_states = unpatchify(hidden_states, self.patch_size)

        return hidden_states
795
+
796
+
797
class TurboVAED(ModelMixin, ConfigMixin):
    """Decoder-only VAE wrapper: denormalizes latents with fixed per-channel
    statistics, then decodes them chunk-by-chunk with a sliding temporal
    window (see `_sliding_window_decode`).

    NOTE(review): only a decoder is built here; several constructor arguments
    exist purely for config compatibility with the training-time model.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        in_channels: int = 3,  # useless arg for compatibility, we only use latent channels
        out_channels: int = 3,
        latent_channels: int = 128,
        decoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512),
        decoder_layers_per_block: Tuple[int, ...] = (4, 3, 3, 3, 4),
        decoder_spatio_temporal_scaling: Tuple[bool, ...] = (True, True, True, False),
        patch_size: int = 4,
        patch_size_t: int = 1,
        resnet_norm_eps: float = 1e-6,
        scaling_factor: float = 1.0,
        decoder_causal: bool = False,
        decoder_is_dw_conv: Tuple[bool, ...] = (False, False, False, False, False),
        decoder_dw_kernel_size: int = 3,
        decoder_spatio_only: Tuple[bool, ...] = (False, False, False, False),
        first_chunk_size: int = 3,
        step_size: int = 5,
        spatial_compression_ratio: int = 16,
        temporal_compression_ratio: int = 4,
        use_unpatchify: bool = False,
        # below are for training, keep for compatibility
        aligned_feature_projection_mode: Optional[str] = None,
        aligned_feature_projection_dim: Optional[List[Tuple[int, int]]] = None,
        aligned_blks_indices: Optional[List[int]] = None,
    ):
        super().__init__()

        self.decoder = TurboVAEDDecoder3d(
            in_channels=latent_channels,
            out_channels=out_channels,
            block_out_channels=decoder_block_out_channels,
            spatio_temporal_scaling=decoder_spatio_temporal_scaling,
            layers_per_block=decoder_layers_per_block,
            patch_size=patch_size,
            patch_size_t=patch_size_t,
            resnet_norm_eps=resnet_norm_eps,
            is_causal=decoder_causal,
            decoder_is_dw_conv=decoder_is_dw_conv,
            decoder_dw_kernel_size=decoder_dw_kernel_size,
            spatio_only=decoder_spatio_only,
            use_unpatchify=use_unpatchify,
        )

        # Sliding-window decode parameters (in latent frames).
        self.first_chunk_size = first_chunk_size
        self.step_size = step_size

        self.spatial_compression_ratio = spatial_compression_ratio
        self.temporal_compression_ratio = temporal_compression_ratio

        # Fixed per-channel latent statistics used for denormalization.
        # NOTE(review): `device="cuda"` is hard-coded — this fails on CPU-only
        # hosts, and these are plain tensors (not registered buffers), so
        # `.to()` / `state_dict()` will not track them. Confirm intended.
        # NOTE(review): the vectors below have 48 entries while
        # `latent_channels` defaults to 128; `_sliding_window_decode` views
        # them as (1, z_dim, 1, 1, 1), so the config must set
        # latent_channels to match — verify against shipped configs.
        self.z_dim = latent_channels
        self.mean = torch.tensor(
            [
                -0.2289,
                -0.0052,
                -0.1323,
                -0.2339,
                -0.2799,
                0.0174,
                0.1838,
                0.1557,
                -0.1382,
                0.0542,
                0.2813,
                0.0891,
                0.1570,
                -0.0098,
                0.0375,
                -0.1825,
                -0.2246,
                -0.1207,
                -0.0698,
                0.5109,
                0.2665,
                -0.2108,
                -0.2158,
                0.2502,
                -0.2055,
                -0.0322,
                0.1109,
                0.1567,
                -0.0729,
                0.0899,
                -0.2799,
                -0.1230,
                -0.0313,
                -0.1649,
                0.0117,
                0.0723,
                -0.2839,
                -0.2083,
                -0.0520,
                0.3748,
                0.0152,
                0.1957,
                0.1433,
                -0.2944,
                0.3573,
                -0.0548,
                -0.1681,
                -0.0667,
            ],
            dtype=torch.float32,
            device="cuda",
        )
        self.std = torch.tensor(
            [
                0.4765,
                1.0364,
                0.4514,
                1.1677,
                0.5313,
                0.4990,
                0.4818,
                0.5013,
                0.8158,
                1.0344,
                0.5894,
                1.0901,
                0.6885,
                0.6165,
                0.8454,
                0.4978,
                0.5759,
                0.3523,
                0.7135,
                0.6804,
                0.5833,
                1.4146,
                0.8986,
                0.5659,
                0.7069,
                0.5338,
                0.4889,
                0.4917,
                0.4069,
                0.4999,
                0.6866,
                0.4093,
                0.5709,
                0.6065,
                0.6415,
                0.4944,
                0.5726,
                1.2042,
                0.5458,
                1.6887,
                0.3971,
                1.0600,
                0.3943,
                0.5537,
                0.5444,
                0.4089,
                0.7468,
                0.7744,
            ],
            dtype=torch.float32,
            device="cuda",
        )
        # scale = [mean, 1/std]; consumed by `_sliding_window_decode`.
        self.scale = [self.mean, 1.0 / self.std]

    def _sliding_window_decode(self, z, output_offload=False):
        """Decode latents `z` (B, C, T_lat, H_lat, W_lat) in overlapping
        temporal chunks; with `output_offload=True`, per-chunk outputs are
        staged on CPU to bound peak GPU memory, then moved back at the end.
        """
        z_dtype = z.dtype
        z_device = z.device
        scale = self.scale
        assert isinstance(scale[0], torch.Tensor), "scale[0] must be a tensor"
        # Denormalize: z / (1/std) + mean == z * std + mean.
        z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(1, self.z_dim, 1, 1, 1)
        z = z.to(z_dtype)

        first_chunk_size = self.first_chunk_size
        step = self.step_size

        # Context mapping: 1 latent frame context -> temporal_compression_ratio pixel frames overlap
        num_overlap_pixel_frames = 1 * self.temporal_compression_ratio

        _, _, num_frames, _, _ = z.shape

        # 1. Pad frames to satisfy chunking requirements
        # The total number of frames must follow the formula:
        # num_frames = first_chunk_size + n * step_size
        num_padding_frames = 0

        if num_frames < first_chunk_size:
            # if input is shorter than first_chunk_size
            num_padding_frames = first_chunk_size - num_frames
        elif (num_frames - first_chunk_size) % step != 0:
            num_padding_frames = step - (num_frames - first_chunk_size) % step

        if num_padding_frames > 0:
            # Pad by repeating the last latent frame.
            z = torch.cat([z, z[:, :, -1:].repeat(1, 1, num_padding_frames, 1, 1)], dim=2)
            num_frames = num_frames + num_padding_frames

        # 2. Decode with overlapping windows
        # Collect chunks on CPU to avoid GPU OOM for high resolution (e.g., 1080P) when output_offload=True
        out_chunks = []

        if num_frames == first_chunk_size:
            # if only one chunk, decode directly
            out = self.decoder(z, is_first_chunk=True)
            out_chunks.append(out.cpu() if output_offload else out)
            del out
        else:
            # first chunk: attach the right frame
            out = self.decoder(z[:, :, 0 : first_chunk_size + 1, :, :], is_first_chunk=True)
            out = out[:, :, :-num_overlap_pixel_frames]
            out_chunks.append(out.cpu() if output_offload else out)
            del out

            # middle chunk: attach the left and right frames
            # last chunk: attach the left frame
            for i in range(first_chunk_size, num_frames, step):
                is_last_chunk = i + step == num_frames
                left = i - 1
                right = i + step + 1 if not is_last_chunk else i + step

                assert left >= 0 and right <= num_frames, f"left: {left}, right: {right}, num_frames: {num_frames}"

                out_ = self.decoder(z[:, :, left:right, :, :], is_first_chunk=False)

                # Trim the pixel frames decoded from context-only latent frames.
                if is_last_chunk:
                    out_ = out_[:, :, num_overlap_pixel_frames:]
                else:
                    out_ = out_[:, :, num_overlap_pixel_frames:-num_overlap_pixel_frames]

                out_chunks.append(out_.cpu() if output_offload else out_)
                del out_

        # Concatenate chunks (on CPU if output_offload, otherwise on GPU)
        out = torch.cat(out_chunks, dim=2)
        del out_chunks

        # 3. Remove padded frames
        if num_padding_frames > 0:
            out = out[:, :, : -num_padding_frames * self.temporal_compression_ratio]

        return out.to(z_device) if output_offload else out

    def decode(self, z: torch.Tensor, output_offload: bool = False):
        """Public decode entry point; see `_sliding_window_decode`."""
        return self._sliding_window_decode(z, output_offload=output_offload)
inference/model/vae2_2/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .vae2_2_model import Wan2_2_VAE, get_vae2_2
2
+
3
+ __all__ = ["Wan2_2_VAE", "get_vae2_2"]
inference/model/vae2_2/vae2_2_model.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+
3
+ import torch
4
+
5
+ from .vae2_2_module import Wan2_2_VAE
6
+
7
+
8
+ def get_vae2_2(model_path, device="cuda", weight_dtype=torch.float32) -> Wan2_2_VAE:
9
+ vae = Wan2_2_VAE(vae_pth=model_path).to(device).to(weight_dtype)
10
+ vae.vae.requires_grad_(False)
11
+ vae.vae.eval()
12
+ gc.collect()
13
+ torch.cuda.empty_cache()
14
+ return vae
15
+
16
+
17
+ __all__ = ["Wan2_2_VAE", "get_vae2_2"]
inference/model/vae2_2/vae2_2_module.py ADDED
@@ -0,0 +1,1086 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Copyright 2024-2026 The Alibaba Wan Team Authors. All rights reserved.
16
+ import logging
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from einops import rearrange
22
+
23
+
24
+ __all__ = ["Wan2_2_VAE"]
25
+
26
+ CACHE_T = 2
27
+
28
+
29
class ScatterFwdAllGatherBackwardOverlap(torch.autograd.Function):
    """Autograd op: scatter along W in forward, all-gather gradients in backward.

    Forward: each rank keeps only its width chunk of the input, plus an
    `overlap_size` halo on each side. Backward: per-rank gradient chunks are
    all-gathered and accumulated into a full-width gradient, summing
    contributions wherever halo regions overlap.
    """

    @staticmethod
    def forward(ctx, x, group, overlap_size):
        """
        Forward pass: split input tensor along W; each rank processes its local
        chunk including overlap regions.

        Args:
            x: Input tensor, shape [B, C, T, H, W]
            group: Distributed communication group
            overlap_size: Width of overlap region
        """
        W = x.shape[4]
        world_size = torch.distributed.get_world_size(group)
        rank = torch.distributed.get_rank(group)

        # Compute base chunk size (ceil division so every column is covered)
        base_chunk_size = (W + world_size - 1) // world_size

        # Compute chunk range for current rank
        chunk_start = rank * base_chunk_size
        chunk_end = min((rank + 1) * base_chunk_size, W)

        # Extend range with overlap (clamped at tensor edges)
        overlap_start = max(0, chunk_start - overlap_size)
        overlap_end = min(W, chunk_end + overlap_size)

        # Slice local chunk
        x_chunk = x[:, :, :, :, overlap_start:overlap_end].contiguous()

        # Save metadata needed by backward
        ctx.save_for_backward(torch.tensor([overlap_start, overlap_end, W], dtype=torch.long, device=x.device))
        ctx.group = group
        ctx.overlap_size = overlap_size
        ctx.world_size = world_size
        ctx.rank = rank
        ctx.base_chunk_size = base_chunk_size
        return x_chunk

    @staticmethod
    def backward(ctx, grad_output):
        """
        Backward pass: all-gather gradients from all ranks and trim overlap.
        """
        # Restore saved forward metadata
        overlap_start, overlap_end, W = ctx.saved_tensors[0]
        overlap_start = overlap_start.item()
        overlap_end = overlap_end.item()
        W = W.item()

        group = ctx.group
        overlap_size = ctx.overlap_size
        world_size = ctx.world_size
        # FIX: removed a bare no-op `ctx.rank` expression statement here
        # (attribute access with no assignment — dead code).
        base_chunk_size = ctx.base_chunk_size

        # Collect gradients from all ranks via all_gather. Chunk widths differ
        # per rank (edge ranks have one-sided halos), so compute each shape.
        grad_output = grad_output.contiguous()
        B, C, T, H = grad_output.shape[:4]
        grad_shapes = []
        for r in range(world_size):
            r_chunk_start = r * base_chunk_size
            r_chunk_end = min((r + 1) * base_chunk_size, W)

            r_overlap_start = max(0, r_chunk_start - overlap_size)
            r_overlap_end = min(W, r_chunk_end + overlap_size)

            # Compute gradient shape for each rank
            chunk_width = r_overlap_end - r_overlap_start
            grad_shapes.append((B, C, T, H, chunk_width))
        grad_chunks = [
            torch.zeros(grad_shape, device=grad_output.device, dtype=grad_output.dtype) for grad_shape in grad_shapes
        ]
        torch.distributed.all_gather(grad_chunks, grad_output, group=group)

        # Stitch gathered chunks into full gradient tensor
        full_grad = torch.zeros(B, C, T, H, W, device=grad_output.device, dtype=grad_output.dtype)

        # Place each rank's gradient chunk at the correct position,
        # accumulating (+=) where halo regions of neighbors overlap.
        for r in range(world_size):
            r_chunk_start = r * base_chunk_size
            r_chunk_end = min((r + 1) * base_chunk_size, W)

            r_overlap_start = max(0, r_chunk_start - overlap_size)
            r_overlap_end = min(W, r_chunk_end + overlap_size)

            # Position in full gradient
            grad_start_in_full = r_overlap_start
            grad_end_in_full = r_overlap_end

            # Position inside gathered chunk
            grad_start_in_chunk = 0
            grad_end_in_chunk = r_overlap_end - r_overlap_start

            # Handle left boundary for first rank
            if r == 0:
                grad_start_in_chunk = 0
                grad_end_in_chunk = min(r_chunk_end + overlap_size, W) - r_overlap_start
            # Handle right boundary for last rank
            elif r == world_size - 1:
                grad_start_in_chunk = max(0, r_chunk_start - overlap_size) - r_overlap_start
                grad_end_in_chunk = r_overlap_end - r_overlap_start

            # Accumulate into full gradient
            full_grad[:, :, :, :, grad_start_in_full:grad_end_in_full] += grad_chunks[r][
                :, :, :, :, grad_start_in_chunk:grad_end_in_chunk
            ]

        # No gradients for the `group` and `overlap_size` inputs.
        return full_grad, None, None
+ return full_grad, None, None
138
+
139
+
140
+ def scatter_fwd_all_gather_backward_with_overlap(x, group, overlap_size=0):
141
+ return ScatterFwdAllGatherBackwardOverlap.apply(x, group, overlap_size)
142
+
143
+
144
class AllGatherFwdScatterBackwardOverlap(torch.autograd.Function):
    """All-gather in forward / scatter in backward, for width-sharded tensors
    that carry an `overlap_size`-wide halo on the last dimension.

    Forward strips each rank's halo and concatenates the valid chunks into the
    full tensor (identical on every rank); backward hands each rank only the
    gradient slice for its own valid region, zero-padded back to the halo'd
    local shape.
    """

    @staticmethod
    def forward(ctx, x, group, overlap_size):
        """
        Forward pass: each rank clips local input, then all-gathers clipped chunks.

        Args:
            x: Input tensor, shape [B, C, T, H, W], already local overlapped chunk per rank
            group: Distributed communication group
            overlap_size: Width of overlap region
        """
        world_size = torch.distributed.get_world_size(group)
        rank = torch.distributed.get_rank(group)

        # Clip local input first (remove overlap area).
        # Boundary ranks only carry a halo on their interior side.
        if rank == 0:
            valid_start = 0
            valid_end = x.shape[-1] - overlap_size
        elif rank == world_size - 1:
            valid_start = overlap_size
            valid_end = x.shape[-1]
        else:
            valid_start = overlap_size
            valid_end = x.shape[-1] - overlap_size

        x_clipped = x[..., valid_start:valid_end].contiguous()
        clipped_width = x_clipped.shape[-1]

        # First all_gather: collect clipped widths across ranks (chunks may be uneven)
        width_tensor = torch.tensor([clipped_width], dtype=torch.long, device=x.device)
        all_widths = [torch.zeros_like(width_tensor) for _ in range(world_size)]
        torch.distributed.all_gather(all_widths, width_tensor, group=group)
        clipped_widths = [w.item() for w in all_widths]

        # Second all_gather: collect clipped data across ranks
        B, C, T, H = x_clipped.shape[:4]
        x_clipped_chunks = [torch.zeros(B, C, T, H, w, device=x.device, dtype=x.dtype) for w in clipped_widths]
        torch.distributed.all_gather(x_clipped_chunks, x_clipped, group=group)
        full_x = torch.cat(x_clipped_chunks, dim=-1)

        # Save metadata needed by backward
        ctx.save_for_backward(torch.tensor([valid_start, valid_end], dtype=torch.long, device=x.device))
        ctx.clipped_widths = clipped_widths
        ctx.group = group
        ctx.overlap_size = overlap_size
        ctx.world_size = world_size
        ctx.rank = rank

        return full_x

    @staticmethod
    def backward(ctx, grad_output):
        """
        Backward pass: each rank restores gradients for its own partition only.
        """
        # Restore saved forward metadata. (valid_start/valid_end are kept for
        # symmetry with forward; the rank offset is recomputed from widths.)
        valid_start, valid_end = ctx.saved_tensors[0]
        valid_start = valid_start.item()
        valid_end = valid_end.item()

        clipped_widths = ctx.clipped_widths
        overlap_size = ctx.overlap_size
        world_size = ctx.world_size
        rank = ctx.rank

        # Compute current rank offset in full gradient
        start_pos = sum(clipped_widths[:rank])
        end_pos = start_pos + clipped_widths[rank]

        # Extract only current rank gradient slice
        grad_clipped = grad_output[:, :, :, :, start_pos:end_pos]

        # Pad zeros to recover overlap area for current rank
        if rank == 0:
            # First rank: pad right
            grad_full = F.pad(grad_clipped, (0, overlap_size))
        elif rank == world_size - 1:
            # Last rank: pad left
            grad_full = F.pad(grad_clipped, (overlap_size, 0))
        else:
            # Middle rank: pad both sides
            grad_full = F.pad(grad_clipped, (overlap_size, overlap_size))

        return grad_full, None, None
229
+
230
+
231
def all_gather_fwd_scatter_backward_with_overlap(x, group, overlap_size=0):
    """Reassemble the full tensor from halo'd width shards in forward; in
    backward, give each rank only the gradient of its own partition, zero-padded
    back to the overlapped local width. Wrapper over the autograd.Function."""
    return AllGatherFwdScatterBackwardOverlap.apply(x, group, overlap_size)
233
+
234
+
235
def one_plus_world_size(group):
    """Return True iff `group` is a real process group spanning more than one rank.

    A ``None`` group short-circuits to False, so this is safe to call before
    (or without) distributed initialization.
    """
    if group is None:
        return False
    return torch.distributed.get_world_size(group) > 1
237
+
238
+
239
class CausalConv3d(nn.Conv3d):
    """
    Causal 3D convolution.

    Temporal padding is applied only on the left (past) side so frame t never
    sees frames > t; spatial padding stays symmetric. Width-parallel execution
    is supported by exchanging halo columns through the process `group`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # F.pad order is (W_left, W_right, H_top, H_bottom, T_front, T_back);
        # all temporal padding (2 * pad_t) goes to the front for causality.
        self._padding = (self.padding[2], self.padding[2], self.padding[1], self.padding[1], 2 * self.padding[0], 0)
        # Disable the built-in symmetric padding; we pad manually in forward.
        self.padding = (0, 0, 0)

    @torch.compile
    def forward(self, x, cache_x=None, group: torch.distributed.ProcessGroup = None):
        # x: [B, C, T, H, W]. cache_x carries trailing frames from the previous
        # chunk so chunked (streaming) processing matches a full-sequence pass.
        padding = list(self._padding)
        if cache_x is not None and self._padding[4] > 0:
            cache_x = cache_x.to(x.device)
            x = torch.cat([cache_x, x], dim=2)
            # Cached frames stand in for (part of) the causal front padding.
            padding[4] -= cache_x.shape[2]
        if one_plus_world_size(group):
            # Halo width so every output column sees its full receptive field.
            overlap_size = self.kernel_size[-1] // 2 * self.stride[-1]
            x = scatter_fwd_all_gather_backward_with_overlap(x, group, overlap_size=overlap_size)
        x = F.pad(x, padding)
        x = super().forward(x)
        if one_plus_world_size(group):
            x = all_gather_fwd_scatter_backward_with_overlap(x, group, overlap_size=overlap_size)
        return x
264
+
265
+
266
class RMS_norm(nn.Module):
    """RMS normalization with a learned per-channel gain (and optional bias).

    Normalizes along the channel dim (dim 1 if `channel_first`, else the last
    dim) and rescales by sqrt(dim) so unit-RMS inputs keep unit magnitude.
    """

    def __init__(self, dim, channel_first=True, images=True, bias=False):
        super().__init__()
        # Trailing singleton dims let gamma/bias broadcast over the (T,)H,W axes.
        broadcastable_dims = (1, 1, 1) if not images else (1, 1)
        shape = (dim, *broadcastable_dims) if channel_first else (dim,)

        self.channel_first = channel_first
        self.scale = dim**0.5
        self.gamma = nn.Parameter(torch.ones(shape))
        # Plain 0.0 keeps the bias-free path allocation-free.
        self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0

    @torch.compile
    def forward(self, x):
        # F.normalize yields unit L2 norm; multiplying by sqrt(dim) turns that
        # into unit RMS before applying the learned gain/bias.
        return F.normalize(x, dim=(1 if self.channel_first else -1)) * self.scale * self.gamma + self.bias
280
+
281
+
282
class Upsample(nn.Upsample):
    @torch.compile
    def forward(self, x):
        """
        Fix bfloat16 support for nearest neighbor interpolation.
        """
        # Interpolate in fp32, then cast back to the input dtype.
        return super().forward(x.float()).type_as(x)
289
+
290
+
291
class Resample(nn.Module):
    """Spatial (and optionally temporal) up/down-sampling block.

    `mode` selects the behavior; the "*3d" modes add a CausalConv3d `time_conv`
    that doubles/halves the temporal length, while spatial resampling is done
    frame-by-frame with 2D layers. feat_cache/feat_idx support streaming
    (chunked) temporal processing; feat_idx is a one-element list used as a
    mutable cursor into feat_cache.
    """

    def __init__(self, dim, mode):
        assert mode in ("none", "upsample2d", "upsample3d", "downsample2d", "downsample3d")
        super().__init__()
        self.dim = dim
        self.mode = mode

        # layers
        if mode == "upsample2d":
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"), nn.Conv2d(dim, dim, 3, padding=1)
            )
        elif mode == "upsample3d":
            self.resample = nn.Sequential(
                Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
                nn.Conv2d(dim, dim, 3, padding=1),
                # nn.Conv2d(dim, dim//2, 3, padding=1)
            )
            # Doubles channels; forward later splits them into two time steps.
            self.time_conv = CausalConv3d(dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
        elif mode == "downsample2d":
            self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
        elif mode == "downsample3d":
            self.resample = nn.Sequential(nn.ZeroPad2d((0, 1, 0, 1)), nn.Conv2d(dim, dim, 3, stride=(2, 2)))
            self.time_conv = CausalConv3d(dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
        else:
            self.resample = nn.Identity()

    @torch.compile
    def forward(self, x, feat_cache=None, feat_idx=[0], group: torch.distributed.ProcessGroup = None):
        if one_plus_world_size(group):
            # Halo sizes sized for the 3x3 convs inside `resample`.
            if self.mode in ["upsample3d", "upsample2d"]:
                overlap_size = 1
            elif self.mode in ["downsample3d", "downsample2d"]:
                overlap_size = 2
            else:
                overlap_size = 0
            x = scatter_fwd_all_gather_backward_with_overlap(x, group, overlap_size=overlap_size)

        b, c, t, h, w = x.size()
        if self.mode == "upsample3d":
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    # First chunk: mark the slot with the "Rep" sentinel.
                    feat_cache[idx] = "Rep"
                    feat_idx[0] += 1
                else:
                    cache_x = x[:, :, -CACHE_T:, :, :].clone()
                    if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] != "Rep":
                        # cache last frame of last two chunk
                        cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
                    if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx] == "Rep":
                        cache_x = torch.cat([torch.zeros_like(cache_x).to(cache_x.device), cache_x], dim=2)
                    if feat_cache[idx] == "Rep":
                        x = self.time_conv(x)
                    else:
                        x = self.time_conv(x, feat_cache[idx])
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1
                    # Split the doubled channels into two interleaved time steps.
                    x = x.reshape(b, 2, c, t, h, w)
                    x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]), 3)
                    x = x.reshape(b, c, t * 2, h, w)
        t = x.shape[2]
        # Spatial resampling is per-frame: fold time into the batch dim.
        x = rearrange(x, "b c t h w -> (b t) c h w")
        x = self.resample(x)
        x = rearrange(x, "(b t) c h w -> b c t h w", t=t)

        if self.mode == "downsample3d":
            if feat_cache is not None:
                idx = feat_idx[0]
                if feat_cache[idx] is None:
                    feat_cache[idx] = x.clone()
                    feat_idx[0] += 1
                else:
                    cache_x = x[:, :, -1:, :, :].clone()
                    x = self.time_conv(torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
                    feat_cache[idx] = cache_x
                    feat_idx[0] += 1

        if one_plus_world_size(group):
            # Spatial resampling changed W, so rescale the halo width to match.
            if self.mode in ["upsample3d", "upsample2d"]:
                overlap_size = overlap_size * 2
            elif self.mode in ["downsample3d", "downsample2d"]:
                overlap_size = overlap_size // 2
            else:
                overlap_size = overlap_size
            x = all_gather_fwd_scatter_backward_with_overlap(x, group, overlap_size=overlap_size)
        return x

    def init_weight(self, conv):
        # Initialize a temporal conv as (near-)identity at the center tap.
        conv_weight = conv.weight.detach().clone()
        nn.init.zeros_(conv_weight)
        c1, c2, t, h, w = conv_weight.size()
        one_matrix = torch.eye(c1, c2)
        init_matrix = one_matrix
        nn.init.zeros_(conv_weight)
        conv_weight.data[:, :, 1, 0, 0] = init_matrix  # * 0.5
        conv.weight = nn.Parameter(conv_weight)
        nn.init.zeros_(conv.bias.data)

    def init_weight2(self, conv):
        # Identity init for a conv whose output duplicates the input channels twice.
        conv_weight = conv.weight.data.detach().clone()
        nn.init.zeros_(conv_weight)
        c1, c2, t, h, w = conv_weight.size()
        init_matrix = torch.eye(c1 // 2, c2)
        conv_weight[: c1 // 2, :, -1, 0, 0] = init_matrix
        conv_weight[c1 // 2 :, :, -1, 0, 0] = init_matrix
        conv.weight = nn.Parameter(conv_weight)
        nn.init.zeros_(conv.bias.data)
399
+
400
+
401
class ResidualBlock(nn.Module):
    """Pre-norm residual block: RMS_norm -> SiLU -> conv -> RMS_norm -> SiLU ->
    dropout -> conv, added to a (possibly 1x1x1-projected) shortcut."""

    def __init__(self, in_dim, out_dim, dropout=0.0):
        super().__init__()
        self.in_dim = in_dim
        self.out_dim = out_dim

        # layers
        self.residual = nn.Sequential(
            RMS_norm(in_dim, images=False),
            nn.SiLU(),
            CausalConv3d(in_dim, out_dim, 3, padding=1),
            RMS_norm(out_dim, images=False),
            nn.SiLU(),
            nn.Dropout(dropout),
            CausalConv3d(out_dim, out_dim, 3, padding=1),
        )
        # 1x1x1 projection only when the channel count changes.
        self.shortcut = CausalConv3d(in_dim, out_dim, 1) if in_dim != out_dim else nn.Identity()

    @torch.compile
    def forward(self, x, feat_cache=None, feat_idx=[0], group: torch.distributed.ProcessGroup = None):
        if one_plus_world_size(group):
            # Halo of 2 covers the two stacked 3x3x3 convs in `residual`.
            overlap_size = 2
            x = scatter_fwd_all_gather_backward_with_overlap(x, group, overlap_size=overlap_size)
        h = self.shortcut(x)
        for layer in self.residual:
            if isinstance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    # cache last frame of last two chunk
                    cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        x = x + h
        if one_plus_world_size(group):
            x = all_gather_fwd_scatter_backward_with_overlap(x, group, overlap_size=overlap_size)
        return x
442
+
443
+
444
class AttentionBlock(nn.Module):
    """
    Causal self-attention with a single head.

    Attention runs per frame over the H*W spatial positions; the output
    projection is zero-initialized so the block starts as an identity residual.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

        # layers
        self.norm = RMS_norm(dim)
        self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
        self.proj = nn.Conv2d(dim, dim, 1)

        # zero out the last layer params
        nn.init.zeros_(self.proj.weight)

    @torch.compile
    def forward(self, x):
        identity = x
        b, c, t, h, w = x.size()
        # Fold time into the batch: attention is within each frame only.
        x = rearrange(x, "b c t h w -> (b t) c h w")
        x = self.norm(x)
        # compute query, key, value — each shaped (b*t, 1, h*w, c)
        q, k, v = self.to_qkv(x).reshape(b * t, 1, c * 3, -1).permute(0, 1, 3, 2).contiguous().chunk(3, dim=-1)

        # apply attention
        x = F.scaled_dot_product_attention(q, k, v)
        x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)

        # output
        x = self.proj(x)
        x = rearrange(x, "(b t) c h w-> b c t h w", t=t)
        x = x + identity
        return x
479
+
480
+
481
def patchify(x, patch_size):
    """Fold `patch_size` x `patch_size` spatial patches into the channel dim.

    Accepts 4D (b c h w) or 5D (b c f h w) tensors; `patch_size == 1` is a
    no-op. Raises ValueError for any other rank.
    """
    if patch_size == 1:
        return x

    ndim = x.dim()
    if ndim == 4:
        return rearrange(x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size, r=patch_size)
    if ndim == 5:
        return rearrange(x, "b c f (h q) (w r) -> b (c r q) f h w", q=patch_size, r=patch_size)
    raise ValueError(f"Invalid input shape: {x.shape}")
492
+
493
+
494
def unpatchify(x, patch_size):
    """Inverse of `patchify`: unfold channel groups back into `patch_size` x
    `patch_size` spatial patches.

    Accepts 4D (b c h w) or 5D (b c f h w) tensors; `patch_size == 1` is a
    no-op. Raises ValueError for any other rank (previously the input was
    silently returned unchanged, inconsistent with `patchify`).
    """
    if patch_size == 1:
        return x

    if x.dim() == 4:
        x = rearrange(x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size)
    elif x.dim() == 5:
        x = rearrange(x, "b (c r q) f h w -> b c f (h q) (w r)", q=patch_size, r=patch_size)
    else:
        # Mirror patchify: fail loudly instead of silently passing data through.
        raise ValueError(f"Invalid input shape: {x.shape}")
    return x
503
+
504
+
505
class AvgDown3D(nn.Module):
    """Parameter-free downsample: pixel-unshuffle factor_t x factor_s x factor_s
    blocks into channels, then mean over fixed-size channel groups to reach
    `out_channels`."""

    def __init__(self, in_channels, out_channels, factor_t, factor_s=1):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.factor_t = factor_t
        self.factor_s = factor_s
        self.factor = self.factor_t * self.factor_s * self.factor_s

        assert in_channels * self.factor % out_channels == 0
        # How many unshuffled channels are averaged into each output channel.
        self.group_size = in_channels * self.factor // out_channels

    @torch.compile
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Front-pad T so it divides factor_t (padding the past keeps causality).
        pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t
        pad = (0, 0, 0, 0, pad_t, 0)
        x = F.pad(x, pad)
        B, C, T, H, W = x.shape
        x = x.view(
            B, C, T // self.factor_t, self.factor_t, H // self.factor_s, self.factor_s, W // self.factor_s, self.factor_s
        )
        # Move the block factors next to the channel dim, then flatten them in.
        x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
        x = x.view(B, C * self.factor, T // self.factor_t, H // self.factor_s, W // self.factor_s)
        x = x.view(B, self.out_channels, self.group_size, T // self.factor_t, H // self.factor_s, W // self.factor_s)
        x = x.mean(dim=2)
        return x
531
+
532
+
533
class DupUp3D(nn.Module):
    """Parameter-free upsample: duplicate channels then pixel-shuffle them into
    a factor_t x factor_s x factor_s larger output (inverse layout of AvgDown3D)."""

    def __init__(self, in_channels: int, out_channels: int, factor_t, factor_s=1):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels

        self.factor_t = factor_t
        self.factor_s = factor_s
        self.factor = self.factor_t * self.factor_s * self.factor_s

        assert out_channels * self.factor % in_channels == 0
        # How many copies of each input channel are needed to fill the output.
        self.repeats = out_channels * self.factor // in_channels

    @torch.compile
    def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
        x = x.repeat_interleave(self.repeats, dim=1)
        x = x.view(x.size(0), self.out_channels, self.factor_t, self.factor_s, self.factor_s, x.size(2), x.size(3), x.size(4))
        # Interleave the factors with T/H/W, then flatten to the upsampled grid.
        x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
        x = x.view(
            x.size(0), self.out_channels, x.size(2) * self.factor_t, x.size(4) * self.factor_s, x.size(6) * self.factor_s
        )
        if first_chunk:
            # Drop the duplicated leading frames that precede the true first frame.
            x = x[:, :, self.factor_t - 1 :, :, :]
        return x
557
+
558
+
559
class Down_ResidualBlock(nn.Module):
    """Stack of residual blocks plus optional downsample, with a parameter-free
    AvgDown3D shortcut so the skip path matches the output resolution."""

    def __init__(self, in_dim, out_dim, dropout, mult, temperal_downsample=False, down_flag=False):
        super().__init__()

        # Shortcut path with downsample
        self.avg_shortcut = AvgDown3D(
            in_dim, out_dim, factor_t=2 if temperal_downsample else 1, factor_s=2 if down_flag else 1
        )

        # Main path with residual blocks and downsample
        downsamples = []
        for _ in range(mult):
            downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
            in_dim = out_dim

        # Add the final downsample block
        if down_flag:
            mode = "downsample3d" if temperal_downsample else "downsample2d"
            downsamples.append(Resample(out_dim, mode=mode))

        self.downsamples = nn.Sequential(*downsamples)

    @torch.compile
    def forward(self, x, feat_cache=None, feat_idx=[0]):
        # Keep the untouched input for the average-pooled shortcut.
        x_copy = x.clone()
        for module in self.downsamples:
            x = module(x, feat_cache, feat_idx)

        return x + self.avg_shortcut(x_copy)
588
+
589
+
590
class Up_ResidualBlock(nn.Module):
    """Stack of residual blocks plus optional upsample, with a parameter-free
    DupUp3D shortcut when upsampling (no skip path otherwise)."""

    def __init__(self, in_dim, out_dim, dropout, mult, temperal_upsample=False, up_flag=False):
        super().__init__()
        # Shortcut path with upsample
        if up_flag:
            self.avg_shortcut = DupUp3D(in_dim, out_dim, factor_t=2 if temperal_upsample else 1, factor_s=2 if up_flag else 1)
        else:
            self.avg_shortcut = None

        # Main path with residual blocks and upsample
        upsamples = []
        for _ in range(mult):
            upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
            in_dim = out_dim

        # Add the final upsample block
        if up_flag:
            mode = "upsample3d" if temperal_upsample else "upsample2d"
            upsamples.append(Resample(out_dim, mode=mode))

        self.upsamples = nn.Sequential(*upsamples)

    @torch.compile
    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False, group: torch.distributed.ProcessGroup = None):
        x_main = x.clone()
        for module in self.upsamples:
            x_main = module(x_main, feat_cache, feat_idx, group=group)
        if self.avg_shortcut is not None:
            # first_chunk lets DupUp3D trim the duplicated leading frames.
            x_shortcut = self.avg_shortcut(x, first_chunk)
            return x_main + x_shortcut
        else:
            return x_main
622
+
623
+
624
class Encoder3d(nn.Module):
    """Video encoder: init conv -> down-residual stages -> middle
    (res + attention + res) -> head producing a z_dim-channel latent.
    Supports chunked (streaming) encoding via feat_cache/feat_idx."""

    def __init__(
        self,
        dim=128,
        z_dim=4,
        dim_mult=[1, 2, 4, 4],
        num_res_blocks=2,
        attn_scales=[],
        temperal_downsample=[True, True, False],
        dropout=0.0,
    ):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample

        # dimensions
        dims = [dim * u for u in [1] + dim_mult]
        scale = 1.0

        # init block — 12 input channels (presumably 3 x 2x2 spatial patchify
        # applied by the caller; see WanVAE_.encode — TODO confirm)
        self.conv1 = CausalConv3d(12, dims[0], 3, padding=1)

        # downsample blocks
        downsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            t_down_flag = temperal_downsample[i] if i < len(temperal_downsample) else False
            downsamples.append(
                Down_ResidualBlock(
                    in_dim=in_dim,
                    out_dim=out_dim,
                    dropout=dropout,
                    mult=num_res_blocks,
                    temperal_downsample=t_down_flag,
                    down_flag=i != len(dim_mult) - 1,
                )
            )
            scale /= 2.0
        self.downsamples = nn.Sequential(*downsamples)

        # middle blocks
        self.middle = nn.Sequential(
            ResidualBlock(out_dim, out_dim, dropout), AttentionBlock(out_dim), ResidualBlock(out_dim, out_dim, dropout)
        )

        # # output blocks
        self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(), CausalConv3d(out_dim, z_dim, 3, padding=1))

    @torch.compile
    def forward(self, x, feat_cache=None, feat_idx=[0]):
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # Prepend the last cached frame so the temporal window spans chunks.
                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
            x = self.conv1(x, feat_cache[idx])
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x)

        # downsamples
        for layer in self.downsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        # middle
        for layer in self.middle:
            if isinstance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx)
            else:
                x = layer(x)

        # head
        for layer in self.head:
            if isinstance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)

        return x
716
+
717
+
718
class Decoder3d(nn.Module):
    """Video decoder: init conv -> middle (res + attention + res) ->
    up-residual stages -> head back to 12 channels (spatially unpatchified by
    the caller). Supports chunked decoding (feat_cache/feat_idx) and
    width-parallel execution via `group`."""

    def __init__(
        self,
        dim=128,
        z_dim=4,
        dim_mult=[1, 2, 4, 4],
        num_res_blocks=2,
        attn_scales=[],
        temperal_upsample=[False, True, True],
        dropout=0.0,
    ):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_upsample = temperal_upsample

        # dimensions (mirror of the encoder, widest first)
        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
        # scale = 1.0 / 2 ** (len(dim_mult) - 2)
        # init block
        self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)

        # middle blocks
        self.middle = nn.Sequential(
            ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]), ResidualBlock(dims[0], dims[0], dropout)
        )

        # upsample blocks
        upsamples = []
        for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
            t_up_flag = temperal_upsample[i] if i < len(temperal_upsample) else False
            upsamples.append(
                Up_ResidualBlock(
                    in_dim=in_dim,
                    out_dim=out_dim,
                    dropout=dropout,
                    mult=num_res_blocks + 1,
                    temperal_upsample=t_up_flag,
                    up_flag=i != len(dim_mult) - 1,
                )
            )
        self.upsamples = nn.Sequential(*upsamples)

        # output blocks
        self.head = nn.Sequential(RMS_norm(out_dim, images=False), nn.SiLU(), CausalConv3d(out_dim, 12, 3, padding=1))

    def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False, group: torch.distributed.ProcessGroup = None):
        if feat_cache is not None:
            idx = feat_idx[0]
            cache_x = x[:, :, -CACHE_T:, :, :].clone()
            if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                # Prepend the last cached frame so the temporal window spans chunks.
                cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
            x = self.conv1(x, feat_cache[idx], group=group)
            feat_cache[idx] = cache_x
            feat_idx[0] += 1
        else:
            x = self.conv1(x, group=group)

        for layer in self.middle:
            if isinstance(layer, ResidualBlock) and feat_cache is not None:
                x = layer(x, feat_cache, feat_idx, group=group)
            else:
                x = layer(x)

        # upsamples
        for layer in self.upsamples:
            if feat_cache is not None:
                x = layer(x, feat_cache, feat_idx, first_chunk, group=group)
            else:
                x = layer(x, group=group)

        # head — wrapped in scatter/gather so the final 3x3 conv sees its halo
        if one_plus_world_size(group):
            overlap_size = self.head[2].kernel_size[-1] // 2 * self.head[2].stride[-1]
            x = scatter_fwd_all_gather_backward_with_overlap(x, group, overlap_size=overlap_size)
        for layer in self.head:
            if isinstance(layer, CausalConv3d) and feat_cache is not None:
                idx = feat_idx[0]
                cache_x = x[:, :, -CACHE_T:, :, :].clone()
                if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
                    cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
                x = layer(x, feat_cache[idx])
                feat_cache[idx] = cache_x
                feat_idx[0] += 1
            else:
                x = layer(x)
        if one_plus_world_size(group):
            x = all_gather_fwd_scatter_backward_with_overlap(x, group, overlap_size=overlap_size)
        return x
810
+
811
+
812
def count_conv3d(model):
    """Return the number of CausalConv3d modules in `model` (this is the size
    of the per-conv streaming feature cache)."""
    return sum(1 for m in model.modules() if isinstance(m, CausalConv3d))
818
+
819
+
820
class WanVAE_(nn.Module):
    """Wan video VAE core: encoder/decoder plus 1x1x1 latent convs. Encoding
    and decoding stream over time in chunks, using per-conv feature caches."""

    def __init__(
        self,
        dim=160,
        dec_dim=256,
        z_dim=16,
        dim_mult=[1, 2, 4, 4],
        num_res_blocks=2,
        attn_scales=[],
        temperal_downsample=[True, True, False],
        dropout=0.0,
    ):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
        self.dim_mult = dim_mult
        self.num_res_blocks = num_res_blocks
        self.attn_scales = attn_scales
        self.temperal_downsample = temperal_downsample
        self.temperal_upsample = temperal_downsample[::-1]

        # modules — encoder emits 2*z_dim channels (mean + log-variance)
        self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks, attn_scales, self.temperal_downsample, dropout)
        self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
        self.conv2 = CausalConv3d(z_dim, z_dim, 1)
        self.decoder = Decoder3d(dec_dim, z_dim, dim_mult, num_res_blocks, attn_scales, self.temperal_upsample, dropout)

    def forward(self, x, scale=[0, 1]):
        # Deterministic round trip: encode to the latent mean, then decode.
        mu = self.encode(x, scale)
        x_recon = self.decode(mu, scale)
        return x_recon, mu

    def encode(self, x, scale):
        """Encode video `x` to a normalized latent mean. `scale` is
        (mean, 1/std) per latent channel (tensors or scalars)."""
        self.clear_cache()
        x = patchify(x, patch_size=2)
        t = x.shape[2]
        # First chunk is 1 frame, then 4 frames per chunk.
        iter_ = 1 + (t - 1) // 4
        for i in range(iter_):
            self._enc_conv_idx = [0]
            if i == 0:
                out = self.encoder(x[:, :, :1, :, :], feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
            else:
                out_ = self.encoder(
                    x[:, :, 1 + 4 * (i - 1) : 1 + 4 * i, :, :], feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx
                )
                out = torch.cat([out, out_], 2)
        mu, log_var = self.conv1(out).chunk(2, dim=1)
        if isinstance(scale[0], torch.Tensor):
            mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(1, self.z_dim, 1, 1, 1)
        else:
            mu = (mu - scale[0]) * scale[1]
        self.clear_cache()
        return mu

    def decode(self, z, scale, group: torch.distributed.ProcessGroup = None):
        """Decode latent `z` (undoing `scale`) one latent frame at a time;
        `group` enables width-parallel decoding."""
        self.clear_cache()
        if isinstance(scale[0], torch.Tensor):
            z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(1, self.z_dim, 1, 1, 1)
        else:
            z = z / scale[1] + scale[0]
        iter_ = z.shape[2]
        x = self.conv2(z, group=group)
        for i in range(iter_):
            self._conv_idx = [0]
            if i == 0:
                out = self.decoder(
                    x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx, first_chunk=True, group=group
                )
            else:
                out_ = self.decoder(x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx, group=group)
                out = torch.cat([out, out_], 2)
        out = unpatchify(out, patch_size=2)
        self.clear_cache()
        return out

    def reparameterize(self, mu, log_var):
        # Standard VAE reparameterization trick.
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return eps * std + mu

    def sample(self, imgs, deterministic=False):
        # NOTE(review): `encode` requires a `scale` argument and returns only
        # `mu`, so this unpacking will fail at runtime — looks like stale code.
        mu, log_var = self.encode(imgs)
        if deterministic:
            return mu
        std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
        return mu + std * torch.randn_like(std)

    def clear_cache(self):
        """Reset the per-CausalConv3d streaming caches for encoder and decoder."""
        self._conv_num = count_conv3d(self.decoder)
        self._conv_idx = [0]
        self._feat_map = [None] * self._conv_num
        # cache encode
        self._enc_conv_num = count_conv3d(self.encoder)
        self._enc_conv_idx = [0]
        self._enc_feat_map = [None] * self._enc_conv_num
916
+
917
def _video_vae(pretrained_path=None, z_dim=16, dim=160, device="cpu", **kwargs):
    """Build a WanVAE_ and load weights from `pretrained_path`.

    kwargs override the default architecture config. The model is created on
    the meta device and materialized via `load_state_dict(..., assign=True)`.
    """
    # params
    cfg = dict(
        dim=dim,
        z_dim=z_dim,
        dim_mult=[1, 2, 4, 4],
        num_res_blocks=2,
        attn_scales=[],
        temperal_downsample=[True, True, True],
        dropout=0.0,
    )
    cfg.update(**kwargs)

    # init model
    with torch.device("meta"):
        model = WanVAE_(**cfg)

    # load checkpoint
    logging.info(f"loading {pretrained_path}")
    # NOTE(review): torch.load without weights_only=True unpickles arbitrary
    # objects — only load trusted checkpoints.
    model.load_state_dict(torch.load(pretrained_path, map_location=device), assign=True)

    return model
939
+
940
+
941
class Wan2_2_VAE:
    """User-facing wrapper: builds the frozen WanVAE_ and applies per-channel
    latent normalization (mean, 1/std) around encode/decode."""

    def __init__(
        self,
        z_dim=48,
        c_dim=160,
        vae_pth=None,
        dim_mult=[1, 2, 4, 4],
        temperal_downsample=[False, True, True],
        dtype=torch.float,
        device="cuda",
    ):
        self.dtype = dtype
        self.device = device

        # Per-channel latent statistics (48 channels), presumably measured
        # offline over a training corpus — TODO confirm provenance.
        self.mean = torch.tensor(
            [
                -0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557,
                -0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825,
                -0.2246, -0.1207, -0.0698, 0.5109, 0.2665, -0.2108, -0.2158, 0.2502,
                -0.2055, -0.0322, 0.1109, 0.1567, -0.0729, 0.0899, -0.2799, -0.1230,
                -0.0313, -0.1649, 0.0117, 0.0723, -0.2839, -0.2083, -0.0520, 0.3748,
                0.0152, 0.1957, 0.1433, -0.2944, 0.3573, -0.0548, -0.1681, -0.0667,
            ],
            dtype=dtype,
            device=device,
        )
        self.std = torch.tensor(
            [
                0.4765, 1.0364, 0.4514, 1.1677, 0.5313, 0.4990, 0.4818, 0.5013,
                0.8158, 1.0344, 0.5894, 1.0901, 0.6885, 0.6165, 0.8454, 0.4978,
                0.5759, 0.3523, 0.7135, 0.6804, 0.5833, 1.4146, 0.8986, 0.5659,
                0.7069, 0.5338, 0.4889, 0.4917, 0.4069, 0.4999, 0.6866, 0.4093,
                0.5709, 0.6065, 0.6415, 0.4944, 0.5726, 1.2042, 0.5458, 1.6887,
                0.3971, 1.0600, 0.3943, 0.5537, 0.5444, 0.4089, 0.7468, 0.7744,
            ],
            dtype=dtype,
            device=device,
        )
        # scale = (mean, 1/std): the form WanVAE_.encode/decode expect.
        self.scale = [self.mean, 1.0 / self.std]

        # init model — frozen and in eval mode
        self.vae = (
            _video_vae(
                pretrained_path=vae_pth, z_dim=z_dim, dim=c_dim, dim_mult=dim_mult, temperal_downsample=temperal_downsample
            )
            .eval()
            .requires_grad_(False)
            .to(device)
        )

    def encode(self, video):
        """Encode a video tensor to normalized latents (float32)."""
        return self.vae.encode(video, self.scale).float()

    def to(self, *args, **kwargs):
        """Move statistics and model to a device/dtype; returns self for chaining."""
        self.mean = self.mean.to(*args, **kwargs)
        self.std = self.std.to(*args, **kwargs)
        self.scale = [self.mean, 1.0 / self.std]
        self.vae = self.vae.to(*args, **kwargs)
        return self

    def decode(self, z, group: torch.distributed.ProcessGroup = None):
        """Decode latents to video in [-1, 1] (float32); `group` enables
        width-parallel decoding."""
        return self.vae.decode(z, self.scale, group=group).float().clamp_(-1, 1)
inference/pipeline/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .pipeline import MagiPipeline
16
+
17
+ __all__ = [
18
+ # pipeline
19
+ "MagiPipeline",
20
+ ]
inference/pipeline/data_proxy.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from dataclasses import dataclass
17
+ from enum import IntEnum
18
+ from typing import Any, Literal, Optional, TYPE_CHECKING
19
+
20
+ import torch
21
+ from einops import rearrange
22
+ from inference.common import DataProxyConfig, Modality, VarlenHandler
23
+ from inference.model.dit.dit_module import FFAHandler
24
+ from torch.nn import functional as F
25
+ from unfoldNd import UnfoldNd
26
+
27
+ if TYPE_CHECKING:
28
+ from inference.pipeline.video_generate import EvalInput
29
+
30
+
31
def calc_local_qk_range(num_video_tokens, num_audio_and_txt_tokens, num_frames, frame_receptive_field):
    """Build (q_ranges, k_ranges) describing frame-local attention windows.

    Layout of the returned int32 CUDA tensors, one row per attention span:
      * rows 0..num_frames-1: queries of frame ``i`` attend to keys of frames
        ``i - frame_receptive_field .. i + frame_receptive_field`` (clamped to
        the video-token range);
      * one row letting all video tokens attend to the audio+text tokens;
      * one row letting the audio+text tokens attend to the full sequence.
    """
    tokens_per_frame = num_video_tokens // num_frames
    total_tokens = num_video_tokens + num_audio_and_txt_tokens

    # Vectorized per-frame windows instead of a Python loop + stack.
    frame_ids = torch.arange(num_frames)
    local_q_range = torch.stack(
        [frame_ids * tokens_per_frame, (frame_ids + 1) * tokens_per_frame], dim=1
    )
    local_k_range = torch.stack(
        [
            (frame_ids - frame_receptive_field) * tokens_per_frame,
            (frame_ids + frame_receptive_field + 1) * tokens_per_frame,
        ],
        dim=1,
    )
    # Clamp key windows into the valid video-token range.
    local_k_range = local_k_range.clamp_(min=0, max=num_video_tokens)

    video_q_range = torch.tensor([[0, num_video_tokens]])
    video_k_range = torch.tensor([[num_video_tokens, total_tokens]])

    at_q_ranges = torch.tensor([[num_video_tokens, total_tokens]])
    at_k_ranges = torch.tensor([[0, total_tokens]])

    q_ranges = torch.cat([local_q_range, video_q_range, at_q_ranges], dim=0)
    k_ranges = torch.cat([local_k_range, video_k_range, at_k_ranges], dim=0)
    q_ranges = q_ranges.to(torch.int32).to("cuda", non_blocking=True)
    k_ranges = k_ranges.to(torch.int32).to("cuda", non_blocking=True)

    return (q_ranges, k_ranges)
62
+
63
+
64
def calc_local_attn_ffa_handler(num_video_tokens, num_audio_and_txt_tokens, num_frames, frame_receptive_field):
    """Wrap the frame-local q/k ranges in an FFAHandler for the DiT attention.

    ``max_seqlen_q``/``max_seqlen_k`` are the full packed sequence length;
    ``attn_type_map`` is all zeros (one entry per range row).
    """
    total_tokens = num_video_tokens + num_audio_and_txt_tokens
    q_ranges, k_ranges = calc_local_qk_range(
        num_video_tokens, num_audio_and_txt_tokens, num_frames, frame_receptive_field
    )
    return FFAHandler(
        q_ranges=q_ranges,
        k_ranges=k_ranges,
        max_seqlen_q=total_tokens,
        max_seqlen_k=total_tokens,
        attn_type_map=torch.zeros([q_ranges.shape[0]], device="cuda", dtype=torch.int32),
        softmax_scale=None,
    )
80
+
81
+
82
def get_coords(
    shape: list[int],
    ref_feat_shape: list[int],
    offset_thw: tuple[int, int, int] = (0, 0, 0),
    device: torch.device = torch.device("cpu"),
    dtype: torch.dtype = torch.float32,
):
    """
    Generate feature-grid coordinates plus original/reference size metadata.

    Args:
        shape: [T, H, W] original feature-map shape.
        ref_feat_shape: [T_ref, H_ref, W_ref] reference feature-map shape.
        offset_thw: per-axis (t, h, w) offsets added to the coordinates.
            Default is an immutable tuple (the previous mutable-list default
            would be shared across calls).
        device: device for the coordinate tensors.
        dtype: dtype for the coordinate tensors.

    Returns:
        coords: tensor of shape (T*H*W, 9); each row is
            (t, h, w, T, H, W, ref_T, ref_H, ref_W).
    """
    ori_t, ori_h, ori_w = shape
    ref_t, ref_h, ref_w = ref_feat_shape
    offset_t, offset_h, offset_w = offset_thw

    # Per-axis index ranges, shifted by the requested offsets.
    time_rng = torch.arange(ori_t, device=device, dtype=dtype) + offset_t
    height_rng = torch.arange(ori_h, device=device, dtype=dtype) + offset_h
    width_rng = torch.arange(ori_w, device=device, dtype=dtype) + offset_w

    # 3D grid over (T, H, W); "ij" keeps time as the slowest-varying axis.
    time_grid, height_grid, width_grid = torch.meshgrid(time_rng, height_rng, width_rng, indexing="ij")

    # Stack into (T, H, W, 3) and flatten to one row per grid position.
    coords_grid = torch.stack([time_grid, height_grid, width_grid], dim=-1)
    coords_flat = coords_grid.reshape(-1, 3)

    # Broadcast the (original, reference) shape metadata onto every row.
    meta = torch.tensor([ori_t, ori_h, ori_w, ref_t, ref_h, ref_w], device=device, dtype=dtype)
    meta_expanded = meta.expand(coords_flat.size(0), -1)

    return torch.cat([coords_flat, meta_expanded], dim=-1)
120
+
121
+
122
@dataclass
class SingleData:
    """One sample's video/audio/text tokens plus the metadata needed to build
    its packed token sequence, modality map, and RoPE coordinate map.
    """

    video_x_t: torch.Tensor  # (num_video_tokens, video_channel) patchified video tokens
    audio_x_t: torch.Tensor  # audio tokens; trimmed to audio_feat_len in __post_init__
    audio_feat_len: int  # valid (un-padded) audio token count
    txt_feat: torch.Tensor  # text embedding; trimmed to txt_feat_len in __post_init__
    txt_feat_len: int  # valid (un-padded) text token count
    t: int  # latent frame count (pre-patchify)
    h: int  # latent height (pre-patchify)
    w: int  # latent width (pre-patchify)
    patch_size: int  # spatial patch size
    t_patch_size: int  # temporal patch size
    spatial_rope_interpolation: Literal["inter", "extra"]  # spatial RoPE mode
    ref_audio_offset: int  # NOTE(review): stored by callers but unused inside this class -- confirm
    text_offset: int  # temporal coordinate offset for text tokens (v1 style)
    coords_style: Literal["v1", "v2"] = "v1"  # coordinate-layout version

    def __post_init__(self):
        """Trim audio/text to their valid lengths and cache per-modality sizes."""
        self.video_token_num = self.video_x_t.shape[0]

        # Drop padding beyond the valid lengths.
        self.audio_x_t = self.audio_x_t[: self.audio_feat_len]
        self.txt_feat = self.txt_feat[: self.txt_feat_len]

        self.video_channel = self.video_x_t.shape[-1]
        self.audio_channel = self.audio_x_t.shape[-1]
        self.txt_channel = self.txt_feat.shape[-1]

    @property
    def device(self):
        """Device of the video tokens (all modalities are assumed colocated)."""
        return self.video_x_t.device

    @property
    def default_dtype(self):
        """Dtype of the video tokens; used for coordinate tensors."""
        return self.video_x_t.dtype

    @property
    def total_token_num(self):
        """Packed sequence length: video + valid audio + valid text tokens."""
        return self.video_token_num + self.audio_feat_len + self.txt_feat_len

    @property
    def token_sequence(self):
        """Concatenate video, audio, and text tokens along the sequence dim.

        Each modality is right-padded with zeros in the channel dim so all
        share the widest channel width before concatenation.
        """
        tensors_to_concat = [self.video_x_t, self.audio_x_t, self.txt_feat]
        max_channel = max(tensor.shape[-1] for tensor in tensors_to_concat)

        padded_tensors = [F.pad(t, (0, max_channel - t.shape[-1])) for t in tensors_to_concat]
        ret_val = torch.cat(padded_tensors, dim=0)
        return ret_val

    @property
    def modality_mapping(self):
        """Per-token modality labels (Modality enum values) matching token_sequence."""
        v_map = torch.full((self.video_token_num,), Modality.VIDEO, dtype=torch.int64, device=self.device)
        a_map = torch.full((self.audio_feat_len,), Modality.AUDIO, dtype=torch.int64, device=self.device)
        t_map = torch.full((self.txt_feat_len,), Modality.TEXT, dtype=torch.int64, device=self.device)

        modality_mapping = torch.cat([v_map, a_map, t_map], dim=0)
        return modality_mapping

    def default_coords(self, shape, ref_feat_shape, offset_thw=[0, 0, 0]):
        """Shortcut to get_coords on this sample's device/dtype."""
        return get_coords(
            shape=shape, ref_feat_shape=ref_feat_shape, offset_thw=offset_thw, device=self.device, dtype=self.default_dtype
        )

    @property
    def coords_mapping(self):
        """Per-token (coords + shape metadata) rows matching token_sequence.

        "inter" mode pins the spatial reference grid to 32x32 (interpolation);
        "extra" uses the actual token grid. Audio/text coordinate layouts
        differ between coords_style v1 and v2 (see branches below).
        """
        if self.spatial_rope_interpolation == "inter":
            video_ref_feat_shape = (self.t // self.t_patch_size, 32, 32)
        else:
            video_ref_feat_shape = (self.t // self.t_patch_size, self.h // self.patch_size, self.w // self.patch_size)

        video_coords = self.default_coords(
            shape=(self.t // self.t_patch_size, self.h // self.patch_size, self.w // self.patch_size),
            ref_feat_shape=video_ref_feat_shape,
        )

        if self.coords_style == "v1":
            # v1: audio referenced against the video's temporal grid; text
            # placed at a fixed offset with a 2-step reference axis.
            audio_coords = self.default_coords(
                shape=(self.audio_feat_len, 1, 1), ref_feat_shape=(self.t // self.t_patch_size, 1, 1)
            )

            text_coords = self.default_coords(
                shape=(self.txt_feat_len, 1, 1), ref_feat_shape=(2, 1, 1), offset_thw=[self.text_offset, 0, 0]
            )

        elif self.coords_style == "v2":
            # v2: audio reference length derived from a 4x temporal grouping
            # (ceil(audio_feat_len / 4)); text placed at negative offsets so
            # it precedes t=0. NOTE(review): the 4 here looks tied to an
            # audio-codec frame ratio -- confirm against the audio VAE.
            magic_audio_ref_t = (self.audio_feat_len - 1) // 4 + 1
            audio_coords = self.default_coords(
                shape=(self.audio_feat_len, 1, 1), ref_feat_shape=(magic_audio_ref_t // self.t_patch_size, 1, 1)
            )

            text_coords = self.default_coords(
                shape=(self.txt_feat_len, 1, 1), ref_feat_shape=(1, 1, 1), offset_thw=[-self.txt_feat_len, 0, 0]
            )

        coords_mapping = torch.cat([video_coords, audio_coords, text_coords], dim=0)
        return coords_mapping

    def depack_token_sequence(self, token_sequence):
        """Split a packed sequence back into (video, audio) tensors.

        The video part is un-patchified (inverse of the UnfoldNd packing) to
        shape (C, T, H, W); channel padding added by token_sequence is dropped
        via the per-modality channel slices. Text tokens are not returned.
        """
        video_x_t = token_sequence[: self.video_token_num, : self.video_channel]
        video_x_t = rearrange(
            video_x_t,
            "(T H W) (pT pH pW C) -> C (T pT) (H pH) (W pW)",
            H=self.h // self.patch_size,
            W=self.w // self.patch_size,
            pT=self.t_patch_size,
            pH=self.patch_size,
            pW=self.patch_size,
        ).contiguous()

        audio_x_t = token_sequence[self.video_token_num : self.video_token_num + self.audio_feat_len, : self.audio_channel]
        return video_x_t, audio_x_t
232
+
233
+
234
@dataclass
class SimplePackedData:
    """A batch of SingleData items packed back-to-back into one sequence."""

    items: list[SingleData]

    @property
    def token_sequence(self):
        """All items' padded token sequences concatenated along dim 0."""
        return torch.cat([entry.token_sequence for entry in self.items], dim=0)

    @property
    def modality_mapping(self):
        """Concatenated per-token modality labels for the whole batch."""
        return torch.cat([entry.modality_mapping for entry in self.items], dim=0)

    @property
    def coords_mapping(self):
        """Concatenated per-token coordinate rows for the whole batch."""
        return torch.cat([entry.coords_mapping for entry in self.items], dim=0)

    @property
    def total_token_num(self):
        """Total packed length across all items."""
        return sum([entry.total_token_num for entry in self.items])

    def __getitem__(self, index):
        return self.items[index]

    @property
    def cu_seqlen(self):
        """Cumulative sequence lengths with a leading 0 (varlen-attention form)."""
        lengths = torch.tensor([entry.total_token_num for entry in self.items])
        return torch.nn.functional.pad(torch.cumsum(lengths, dim=0), (1, 0))

    @property
    def max_seqlen(self):
        """Longest per-item packed length, as a 0-dim tensor."""
        return torch.tensor(max([entry.total_token_num for entry in self.items]))

    def depack_token_sequence(self, token_sequence):
        """Split the packed batch and un-patchify each item.

        Returns stacked (video, audio) tensors with a leading batch dim.
        """
        per_item_lengths = [entry.total_token_num for entry in self.items]
        video_parts, audio_parts = [], []
        for entry, chunk in zip(self.items, torch.split(token_sequence, per_item_lengths, dim=0)):
            video_part, audio_part = entry.depack_token_sequence(chunk)
            video_parts.append(video_part)
            audio_parts.append(audio_part)
        return torch.stack(video_parts, dim=0), torch.stack(audio_parts, dim=0)
277
+
278
+
279
class MagiDataProxy:
    """Adapter between batched (video, audio, text) tensors and the flat
    packed token layout (plus attention metadata) consumed by the DiT model.
    """

    def __init__(self, config: DataProxyConfig):
        self.patch_size = config.patch_size
        self.t_patch_size = config.t_patch_size
        self.frame_receptive_field = config.frame_receptive_field
        # NOTE(review): hardcoded to 'extra' rather than read from `config`
        # -- confirm the config is not meant to control this.
        self.spatial_rope_interpolation = 'extra'
        self.ref_audio_offset = config.ref_audio_offset
        self.text_offset = config.text_offset
        # Non-overlapping 3D patchify: kernel size equals stride.
        self.unfold = UnfoldNd(
            kernel_size=(self.t_patch_size, self.patch_size, self.patch_size),
            stride=(self.t_patch_size, self.patch_size, self.patch_size),
        )
        self.coords_style = config.coords_style

        # Scratch storage shared between process_input and process_output.
        self._saved_data: dict[str, Any] = {}

    def saved_for_output(self, **kwargs):
        """
        Store intermediate data used by process_output.
        Supports keyword-argument style calls: saved_for_output(a=1, b=2)
        Can be called multiple times to accumulate data

        Args:
            **kwargs: key-value pairs to store
        """
        # Directly update dict; supports accumulation across calls
        self._saved_data.update(kwargs)

    def get_saved_data(self, key: str):
        """
        Get stored data; raises KeyError if `key` was never saved.
        """
        return self._saved_data[key]

    def img2tokens(self, x_t: torch.Tensor):
        """Patchify a video tensor into tokens of shape (N, num_tokens, col_dim)."""
        x_t_unfolded = self.unfold(x_t)
        # Transpose dimensions from (N, col_dim, num_tokens) -> (N, num_tokens, col_dim)
        x_t = rearrange(x_t_unfolded, "N col_dim num_tokens -> N num_tokens col_dim").contiguous()
        return x_t

    def process_input(self, transported_data: "EvalInput"):
        """Pack an EvalInput batch into one flat token sequence plus metadata.

        Returns:
            Tuple of (token sequence, coords_mapping, modality_mapping,
            varlen_handler, local_attn_handler). `local_attn_handler` is None
            when frame_receptive_field == -1 (no frame-local attention).
        """
        # init img2col module

        batch_size, _, t, h, w = transported_data.x_t.shape
        # 1. Process video features while keeping the batch dimension
        x_t = self.img2tokens(transported_data.x_t)

        # 2. Process audio features while keeping the batch dimension
        # Assume transported_data.audio_x_t shape is already (N, num_tokens, col_dim)
        audio_x_t = transported_data.audio_x_t.contiguous()

        # Here we assume text_in shape is (N, num_tokens, col_dim)
        text_in = transported_data.txt_feat.contiguous()

        # Build one SingleData per batch element; SingleData trims audio/text
        # to their valid lengths in its __post_init__.
        simple_packed_data = SimplePackedData(items=[])
        for i in range(batch_size):
            single_data = SingleData(
                video_x_t=x_t[i],
                audio_x_t=audio_x_t[i],
                audio_feat_len=transported_data.audio_feat_len[i],
                txt_feat=text_in[i],
                txt_feat_len=transported_data.txt_feat_len[i],
                t=t,
                h=h,
                w=w,
                patch_size=self.patch_size,
                t_patch_size=self.t_patch_size,
                spatial_rope_interpolation=self.spatial_rope_interpolation,
                ref_audio_offset=self.ref_audio_offset,
                text_offset=self.text_offset,
                coords_style=self.coords_style,
            )
            simple_packed_data.items.append(single_data)

        if self.frame_receptive_field != -1:
            assert batch_size == 1, "local attention only supports batch size 1"

            local_attn_handler = calc_local_attn_ffa_handler(
                num_video_tokens=simple_packed_data[0].video_token_num,
                num_audio_and_txt_tokens=simple_packed_data[0].audio_feat_len + simple_packed_data[0].txt_feat_len,
                num_frames=t,
                frame_receptive_field=self.frame_receptive_field,
            )
            # FFA expects plain ints for max seqlens; unwrap 0-dim tensors.
            if isinstance(local_attn_handler.max_seqlen_k, torch.Tensor):
                local_attn_handler.max_seqlen_k = local_attn_handler.max_seqlen_k.item()
            if isinstance(local_attn_handler.max_seqlen_q, torch.Tensor):
                local_attn_handler.max_seqlen_q = local_attn_handler.max_seqlen_q.item()
        else:
            local_attn_handler = None

        # Varlen metadata (cu_seqlens / max_seqlen) for packed attention.
        varlen_handler = VarlenHandler(
            cu_seqlens_q=simple_packed_data.cu_seqlen.to(torch.int32).cuda(),
            cu_seqlens_k=simple_packed_data.cu_seqlen.to(torch.int32).cuda(),
            max_seqlen_q=simple_packed_data.max_seqlen.to(torch.int32).cuda(),
            max_seqlen_k=simple_packed_data.max_seqlen.to(torch.int32).cuda(),
        )

        # Keep the packing layout around so process_output can invert it.
        self.saved_for_output(simple_packed_data=simple_packed_data)

        x = simple_packed_data.token_sequence
        coords_mapping = simple_packed_data.coords_mapping
        modality_mapping = simple_packed_data.modality_mapping

        return (x, coords_mapping, modality_mapping, varlen_handler, local_attn_handler)

    def process_output(self, x: torch.Tensor):
        """Invert process_input: split the model output back into
        batched (video, audio) tensors using the saved packing layout.
        """
        # Inserting operations in between may corrupt parallel-runtime data and cause latent errors

        simple_packed_data: SimplePackedData = self.get_saved_data("simple_packed_data")
        x_video, x_audio = simple_packed_data.depack_token_sequence(x)

        return (x_video, x_audio)
inference/pipeline/entry.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import sys
17
+
18
+ from inference.common import parse_config
19
+ from inference.infra import initialize_infra
20
+ from inference.model.dit import get_dit
21
+ from inference.utils import print_rank_0
22
+
23
+ try:
24
+ from .pipeline import MagiPipeline
25
+ except ImportError:
26
+ # Keep compatibility when entry.py is executed as a script path.
27
+ from inference.pipeline import MagiPipeline
28
+
29
+
30
def parse_arguments(argv=None):
    """Parse CLI arguments for the unified offline entry point.

    Args:
        argv: optional list of argument strings. Defaults to None, in which
            case ``sys.argv[1:]`` is used (the original behavior); passing an
            explicit list enables programmatic/test use.

    Returns:
        argparse.Namespace with the known arguments. Unknown arguments are
        ignored (``parse_known_args``) so flags consumed elsewhere (e.g. by
        the config parser) can share the command line.
    """
    parser = argparse.ArgumentParser(description="Run DiT pipeline with unified offline entry.")
    parser.add_argument("--prompt", type=str)
    parser.add_argument("--save_path_prefix", type=str, help="Path prefix for saving outputs.")
    parser.add_argument("--output_path", type=str, help="Alias of --save_path_prefix for MAGI-style CLI.")

    parser.add_argument("--image_path", type=str, help="Path to image for i2v mode.")
    parser.add_argument(
        "--audio_path", type=str, default=None, help="Path to optional audio for lipsync mode; omit to use i2v or t2v"
    )

    # Optional runtime controls; forwarded to pipeline methods when provided.
    parser.add_argument("--seed", type=int)
    parser.add_argument("--seconds", type=int)
    parser.add_argument("--br_width", type=int)
    parser.add_argument("--br_height", type=int)
    parser.add_argument("--sr_width", type=int)
    parser.add_argument("--sr_height", type=int)
    parser.add_argument("--output_width", type=int)
    parser.add_argument("--output_height", type=int)
    parser.add_argument("--upsample_mode", type=str)
    args, _ = parser.parse_known_args(argv)
    return args
53
+
54
+
55
def main():
    """Offline CLI entry: build the DiT model, wrap it in a MagiPipeline,
    and run a single prompt + image (+ optional audio) generation.

    Exits with status 1 (after printing an error) when required arguments
    are missing.
    """
    args = parse_arguments()
    config = parse_config()
    model = get_dit(config.arch_config, config.engine_config)
    pipeline = MagiPipeline(model, config.evaluation_config)
    # --output_path is accepted as an alias of --save_path_prefix.
    save_path_prefix = args.save_path_prefix or args.output_path
    if not save_path_prefix:
        print_rank_0("Error: --save_path_prefix (or --output_path) is required.")
        sys.exit(1)

    # Runtime controls are only forwarded when explicitly supplied, so
    # run_offline's own defaults apply otherwise.
    optional_kwargs = {
        "seed": args.seed,
        "seconds": args.seconds,
        "br_width": args.br_width,
        "br_height": args.br_height,
        "sr_width": args.sr_width,
        "sr_height": args.sr_height,
        "output_width": args.output_width,
        "output_height": args.output_height,
        "upsample_mode": args.upsample_mode,
    }
    # NOTE(review): all values above are int/str-or-None, so the
    # `is not False` clause never filters anything today.
    optional_kwargs = {k: v for k, v in optional_kwargs.items() if v is not None and v is not False}

    prompt = args.prompt
    image_path = args.image_path
    audio_path = args.audio_path

    if not prompt:
        print_rank_0("Error: --prompt is required.")
        sys.exit(1)
    if not image_path:
        print_rank_0("Error: --image_path is required.")
        sys.exit(1)

    pipeline.run_offline(
        prompt=prompt, image=image_path, audio=audio_path, save_path_prefix=save_path_prefix, **optional_kwargs
    )
92
+
93
+
94
+ if __name__ == "__main__":
95
+ initialize_infra()
96
+ main()
inference/pipeline/pipeline.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import random
17
+ from typing import Optional, Union
18
+
19
+ import imageio
20
+ import soundfile as sf
21
+ import torch
22
+ from PIL import Image
23
+
24
+ from inference.common import EvaluationConfig, parse_config
25
+ from inference.model.dit import get_dit
26
+ from inference.model.dit import DiTModel
27
+ from .video_generate import MagiEvaluator
28
+ from .video_process import merge_video_and_audio, upsample_video
29
+
30
+
31
class MagiPipeline:
    """Pipeline facade for inference.

    Wraps a base DiT model (and an optional super-resolution DiT when
    `evaluation_config.use_sr_model` is set) behind a MagiEvaluator and
    exposes a single offline entry point, `run_offline`.
    """

    def __init__(self, model: DiTModel, evaluation_config: EvaluationConfig, device: str = "cuda"):
        self.model = model
        self.evaluation_config = evaluation_config
        # NOTE(review): parse_config() is re-parsed here (and its
        # engine_config.load mutated for the SR model) -- confirm this does
        # not conflict with the caller's config object.
        config = parse_config()
        if evaluation_config.use_sr_model:
            config.engine_config.load = evaluation_config.sr_model_path
            sr_model = get_dit(config.sr_arch_config, config.engine_config)
        else:
            sr_model = None
        self.evaluator = MagiEvaluator(model, sr_model, evaluation_config, device)

    def _validate_offline_request(
        self,
        prompt: str,
        save_path_prefix: str,
    ):
        """Reject empty/blank prompt or save-path prefix with a ValueError."""
        if not prompt or not prompt.strip():
            raise ValueError("`prompt` must be a non-empty string.")
        if not save_path_prefix or not save_path_prefix.strip():
            raise ValueError("`save_path_prefix` must be a non-empty string.")

    def run_offline(
        self,
        prompt: str,
        image: Union[str, Image.Image, None],
        audio: Optional[str],
        save_path_prefix: str,
        seed: int = 42,
        seconds: int = 4,
        br_width: int = 480,
        br_height: int = 272,
        sr_width: Optional[int] = None,
        sr_height: Optional[int] = None,
        output_width: Optional[int] = None,
        output_height: Optional[int] = None,
        upsample_mode: Optional[str] = None,
    ):
        """Generate a video (and audio track) for one prompt and save it.

        The RNG is forked and seeded locally so generation is reproducible
        without disturbing the caller's random state. Only the last rank
        writes files; all ranks synchronize on a barrier before returning.

        Returns:
            str: the final .mp4 path (returned on every rank, even though
            only the last rank actually writes it).
        """
        self._validate_offline_request(prompt=prompt, save_path_prefix=save_path_prefix)

        # Output name encodes duration and resolutions for traceability.
        if self.evaluator.sr_model is not None:
            save_path = f"{save_path_prefix}_{seconds}s_{br_width}x{br_height}_{sr_width}x{sr_height}.mp4"
        else:
            save_path = f"{save_path_prefix}_{seconds}s_{br_width}x{br_height}.mp4"

        with torch.random.fork_rng(devices=[torch.cuda.current_device()]):
            torch.random.manual_seed(seed)
            video_np, audio_np = self.evaluator.evaluate(
                prompt,
                image,
                audio,
                seconds=seconds,
                br_width=br_width,
                br_height=br_height,
                sr_width=sr_width,
                sr_height=sr_height,
                br_num_inference_steps=self.evaluation_config.num_inference_steps,
                sr_num_inference_steps=self.evaluation_config.sr_num_inference_steps,
            )

        if output_width is not None and output_height is not None:
            video_np = upsample_video(video_np, output_width, output_height, upsample_mode)

        # Only the last rank materializes files to avoid concurrent writes.
        if torch.distributed.get_rank() == torch.distributed.get_world_size() - 1:
            # NOTE(review): intermediate .wav/.mp4 files are written to the
            # CWD with random-suffix names and never deleted; random suffixes
            # can also collide in principle -- consider tempfile + cleanup.
            saving_name = f"{prompt.replace(' ', '_')[:10]}"
            audio_path = saving_name + str(random.randint(0, 1000000)) + ".wav"
            video_path = saving_name + str(random.randint(0, 1000000)) + ".mp4"
            sf.write(audio_path, audio_np, self.evaluator.audio_vae.sample_rate)
            imageio.mimwrite(video_path, video_np, fps=self.evaluation_config.fps, quality=8, output_params=["-loglevel", "error"])
            # NOTE(review): assert is stripped under `python -O`; an explicit
            # raise would be safer for this runtime check.
            assert os.path.exists(video_path)
            merge_video_and_audio(video_path, audio_path, save_path)

        # Keep all ranks in lockstep before returning the path.
        if torch.distributed.is_initialized():
            torch.distributed.barrier()
        return save_path
108
+
inference/pipeline/prompt_process.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import Tuple
16
+
17
+ import torch
18
+ from torch.nn import functional as F
19
+
20
+ from inference.model.t5_gemma import get_t5_gemma_embedding
21
+
22
+
23
def pad_or_trim(tensor: torch.Tensor, target_size: int, dim: int, pad_value: float = 0.0) -> Tuple[torch.Tensor, int]:
    """
    Bring `tensor` to exactly `target_size` along dimension `dim`.

    If the tensor is shorter it is right-padded with `pad_value`; if it is
    longer (or equal) it is sliced down to `target_size`.

    Args:
        tensor (torch.Tensor): input tensor.
        target_size (int): desired size along `dim`.
        dim (int): dimension to pad or trim.
        pad_value (float, optional): fill value for padding. Defaults to 0.0.

    Returns:
        Tuple[torch.Tensor, int]: the resized tensor and the valid (original)
        length along `dim` (equals `target_size` when trimming occurred).
    """
    size_along_dim = tensor.size(dim)

    # Trim (or no-op when sizes already match): slice down to target_size.
    if size_along_dim >= target_size:
        index = [slice(None)] * tensor.dim()
        index[dim] = slice(0, target_size)
        return tensor[tuple(index)], target_size

    # Pad: F.pad takes (left, right) pairs starting from the LAST dim, so the
    # right-pad slot for `dim` sits at 2 * (ndim - 1 - dim) + 1.
    pad_spec = [0] * (2 * tensor.dim())
    pad_spec[2 * (tensor.dim() - 1 - dim) + 1] = target_size - size_along_dim
    return F.pad(tensor, tuple(pad_spec), "constant", pad_value), size_along_dim
47
+
48
+
49
def get_padded_t5_gemma_embedding(
    prompt: str,
    model_path: str,
    device: str,
    weight_dtype: torch.dtype,
    target_length: int,
) -> Tuple[torch.Tensor, int]:
    """Embed `prompt` with T5-Gemma, then pad/trim dim 1 to `target_length`.

    Returns:
        Tuple of (embedding cast to float32, valid sequence length). Note
        the returned length equals `target_length` when the raw embedding
        was trimmed (pad_or_trim reports target_size in that case).
    """
    txt_feat = get_t5_gemma_embedding(prompt, model_path, device, weight_dtype)
    txt_feat, original_len = pad_or_trim(txt_feat, target_size=target_length, dim=1)
    return txt_feat.to(torch.float32), original_len
59
+
60
+
inference/pipeline/scheduler_unipc.py ADDED
@@ -0,0 +1,832 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2026 SandAI. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Copied from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_unipc_multistep.py
16
+ # Convert unipc for flow matching
17
+ # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
18
+ import math
19
+ from typing import Any, List, Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import torch
23
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
24
+ from diffusers.schedulers.scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
25
+ from diffusers.utils import deprecate
26
+ from diffusers.utils.torch_utils import randn_tensor
27
+
28
+
29
class FlowUniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
    """
    `UniPCMultistepScheduler` is a training-free framework designed for the fast sampling of diffusion models.

    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
    methods the library implements for all schedulers such as loading and saving.

    Args:
        num_train_timesteps (`int`, defaults to 1000):
            The number of diffusion steps to train the model.
        solver_order (`int`, default `2`):
            The UniPC order which can be any positive integer. The effective order of accuracy is `solver_order + 1`
            due to the UniC. It is recommended to use `solver_order=2` for guided sampling, and `solver_order=3` for
            unconditional sampling.
        prediction_type (`str`, defaults to "flow_prediction"):
            Prediction type of the scheduler function; must be `flow_prediction` for this scheduler, which predicts
            the flow of the diffusion process.
        thresholding (`bool`, defaults to `False`):
            Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
            as Stable Diffusion.
        dynamic_thresholding_ratio (`float`, defaults to 0.995):
            The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
        sample_max_value (`float`, defaults to 1.0):
            The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `predict_x0=True`.
        predict_x0 (`bool`, defaults to `True`):
            Whether to use the updating algorithm on the predicted x0.
        solver_type (`str`, default `bh2`):
            Solver type for UniPC. It is recommended to use `bh1` for unconditional sampling when steps < 10, and `bh2`
            otherwise.
        lower_order_final (`bool`, default `True`):
            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
            stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
        disable_corrector (`list`, default `[]`):
            Decides which step to disable the corrector to mitigate the misalignment between `epsilon_theta(x_t, c)`
            and `epsilon_theta(x_t^c, c)` which can influence convergence for a large guidance scale. Corrector is
            usually disabled during the first few steps.
        solver_p (`SchedulerMixin`, default `None`):
            Any other scheduler that if specified, the algorithm becomes `solver_p + UniC`.
        timestep_spacing (`str`, defaults to `"linspace"`):
            The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
            Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
        steps_offset (`int`, defaults to 0):
            An offset added to the inference steps, as required by some model families.
        final_sigmas_type (`str`, defaults to `"zero"`):
            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
    """

    # Names of schedulers this one is config-compatible with (diffusers convention).
    _compatibles = [e.name for e in KarrasDiffusionSchedulers]
    # `order` is the diffusers pipeline convention for how many model calls one
    # scheduler step consumes; UniPC uses one model evaluation per step.
    order = 1
    @register_to_config
    def __init__(
        self,
        num_train_timesteps: int = 1000,
        solver_order: int = 2,
        prediction_type: str = "flow_prediction",
        shift: float = 1.0,
        use_dynamic_shifting=False,
        thresholding: bool = False,
        dynamic_thresholding_ratio: float = 0.995,
        sample_max_value: float = 1.0,
        predict_x0: bool = True,
        solver_type: str = "bh2",
        # NOTE(review): mutable default argument. It is only read (membership
        # tests in `step`) and never mutated here, so the shared-list hazard is
        # latent, but changing the default interacts with `@register_to_config`
        # (the default is recorded into `self.config`) — confirm before altering.
        disable_corrector: List[int] = [],
        solver_p: SchedulerMixin = None,
        timestep_spacing: str = "linspace",
        steps_offset: int = 0,
        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
    ):
        """Build the training-time sigma schedule and reset the multistep state.

        All constructor arguments are also captured into `self.config` by the
        `@register_to_config` decorator.
        """
        # Accept a few legacy solver-type names by silently mapping them to "bh2".
        if solver_type not in ["bh1", "bh2"]:
            if solver_type in ["midpoint", "heun", "logrho"]:
                self.register_to_config(solver_type="bh2")
            else:
                raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")

        self.predict_x0 = predict_x0
        # setable values
        self.num_inference_steps = None
        # alphas ascend from 1/N to 1; sigmas = 1 - alphas therefore descend
        # from 1 - 1/N down to 0 (flow-matching noise levels).
        alphas = np.linspace(1, 1 / num_train_timesteps, num_train_timesteps)[::-1].copy()
        sigmas = 1.0 - alphas
        sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32)

        if not use_dynamic_shifting:
            # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution
            sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)  # pyright: ignore

        self.sigmas = sigmas
        self.timesteps = sigmas * num_train_timesteps

        # Rolling buffers of the last `solver_order` converted model outputs and
        # their timesteps; filled in by `step` as sampling progresses.
        self.model_outputs = [None] * solver_order
        self.timestep_list = [None] * solver_order
        self.lower_order_nums = 0
        self.disable_corrector = disable_corrector
        self.solver_p = solver_p
        self.last_sample = None
        self._step_index: Optional[int] = None
        self._begin_index: Optional[int] = None

        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
        self.sigma_min = self.sigmas[-1].item()
        self.sigma_max = self.sigmas[0].item()
+ @property
139
+ def step_index(self):
140
+ """
141
+ The index counter for current timestep. It will increase 1 after each scheduler step.
142
+ """
143
+ return self._step_index
144
+
145
+ @property
146
+ def begin_index(self):
147
+ """
148
+ The index for the first timestep. It should be set by `inference.pipeline` with `set_begin_index`.
149
+ """
150
+ return self._begin_index
151
+
152
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
153
+ def set_begin_index(self, begin_index: int = 0):
154
+ """
155
+ Sets the begin index for the scheduler. This function should be run by `inference.pipeline` before inference.
156
+
157
+ Args:
158
+ begin_index (`int`):
159
+ The begin index for the scheduler.
160
+ """
161
+ self._begin_index = begin_index
162
+
163
+ # Modified from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler.set_timesteps
164
+ def set_timesteps(
165
+ self,
166
+ num_inference_steps: Union[int, None] = None,
167
+ device: Union[str, torch.device] = None,
168
+ sigmas: Optional[List[float]] = None,
169
+ mu: Optional[Union[float, None]] = None,
170
+ shift: Optional[Union[float, None]] = None,
171
+ ):
172
+ """
173
+ Sets the discrete timesteps used for the diffusion chain (to be run before inference).
174
+ Args:
175
+ num_inference_steps (`int`):
176
+ Total number of the spacing of the time steps.
177
+ device (`str` or `torch.device`, *optional*):
178
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
179
+ """
180
+
181
+ if self.config.use_dynamic_shifting and mu is None:
182
+ raise ValueError(" you have to pass a value for `mu` when `use_dynamic_shifting` is set to be `True`")
183
+
184
+ if sigmas is None:
185
+ sigmas = np.linspace(self.sigma_max, self.sigma_min, num_inference_steps + 1).copy()[:-1] # type: ignore
186
+
187
+ if self.config.use_dynamic_shifting:
188
+ sigmas = self.time_shift(mu, 1.0, sigmas) # type: ignore
189
+ else:
190
+ if shift is None:
191
+ shift = self.config.shift
192
+ sigmas = shift * sigmas / (1 + (shift - 1) * sigmas) # type: ignore
193
+
194
+ if self.config.final_sigmas_type == "sigma_min":
195
+ sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5
196
+ elif self.config.final_sigmas_type == "zero":
197
+ sigma_last = 0
198
+ else:
199
+ raise ValueError(
200
+ f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
201
+ )
202
+
203
+ timesteps = sigmas * self.config.num_train_timesteps
204
+ sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32) # pyright: ignore
205
+
206
+ self.sigmas = torch.from_numpy(sigmas)
207
+ self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64)
208
+
209
+ self.num_inference_steps = len(timesteps) # type: ignore
210
+
211
+ self.model_outputs = [None] * self.config.solver_order
212
+ self.lower_order_nums = 0
213
+ self.last_sample = None
214
+ if self.solver_p:
215
+ self.solver_p.set_timesteps(self.num_inference_steps, device=device)
216
+
217
+ # add an index counter for schedulers that allow duplicated timesteps
218
+ self._step_index = None
219
+ self._begin_index = None
220
+ self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
221
+
222
    # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
    def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
        """
        "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
        prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
        s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
        pixels from saturation at each step. We find that dynamic thresholding results in significantly better
        photorealism as well as better image-text alignment, especially when using very large guidance weights."

        https://arxiv.org/abs/2205.11487

        Args:
            sample (`torch.Tensor`): predicted x0, shape (batch, channels, *spatial).

        Returns:
            `torch.Tensor`: thresholded sample with the same shape and dtype as the input.
        """
        dtype = sample.dtype
        batch_size, channels, *remaining_dims = sample.shape

        if dtype not in (torch.float32, torch.float64):
            sample = sample.float()  # upcast for quantile calculation, and clamp not implemented for cpu half

        # Flatten sample for doing quantile calculation along each image
        sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))

        abs_sample = sample.abs()  # "a certain percentile absolute pixel value"

        s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
        s = torch.clamp(
            s, min=1, max=self.config.sample_max_value
        )  # When clamped to min=1, equivalent to standard clipping to [-1, 1]
        s = s.unsqueeze(1)  # (batch_size, 1) because clamp will broadcast along dim=0
        sample = torch.clamp(sample, -s, s) / s  # "we threshold xt0 to the range [-s, s] and then divide by s"

        # Restore the original layout and dtype.
        sample = sample.reshape(batch_size, channels, *remaining_dims)
        sample = sample.to(dtype)

        return sample
+ # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler._sigma_to_t
257
+ def _sigma_to_t(self, sigma):
258
+ return sigma * self.config.num_train_timesteps
259
+
260
+ def _sigma_to_alpha_sigma_t(self, sigma):
261
+ return 1 - sigma, sigma
262
+
263
+ # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.set_timesteps
264
+ def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
265
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
266
+
267
    def convert_model_output(self, model_output: torch.Tensor, *args, sample: torch.Tensor = None, **kwargs) -> torch.Tensor:
        r"""
        Convert the model output to the corresponding type the UniPC algorithm needs:
        the predicted clean sample ``x0`` when ``self.predict_x0`` is True, otherwise
        the implied noise ``epsilon``.

        Args:
            model_output (`torch.Tensor`):
                The direct output from the learned diffusion model (a flow/velocity prediction).
            sample (`torch.Tensor`):
                A current instance of a sample created by the diffusion process.

        Returns:
            `torch.Tensor`:
                The converted model output.

        Raises:
            ValueError: if `sample` is missing or `prediction_type` is not `"flow_prediction"`.
        """
        # Back-compat shim: `timestep` used to be positional; it is now ignored in
        # favor of the internal `self.step_index` counter.
        timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
        if sample is None:
            if len(args) > 1:
                sample = args[1]
            else:
                raise ValueError("missing `sample` as a required keyward argument")
        if timestep is not None:
            deprecate(
                "timesteps",
                "1.0.0",
                "Passing `timesteps` is deprecated "
                "and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
            )

        sigma = self.sigmas[self.step_index]
        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)

        if self.predict_x0:
            if self.config.prediction_type == "flow_prediction":
                sigma_t = self.sigmas[self.step_index]
                # Rectified flow: x_t = (1 - sigma) * x0 + sigma * noise with the model
                # predicting velocity, hence x0 = x_t - sigma * velocity.
                x0_pred = sample - sigma_t * model_output
            else:
                raise ValueError(
                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
                    " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
                )

            if self.config.thresholding:
                x0_pred = self._threshold_sample(x0_pred)

            return x0_pred
        else:
            if self.config.prediction_type == "flow_prediction":
                sigma_t = self.sigmas[self.step_index]
                # Implied noise: epsilon = x_t - (1 - sigma) * velocity.
                epsilon = sample - (1 - sigma_t) * model_output
            else:
                raise ValueError(
                    f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
                    " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
                )

            if self.config.thresholding:
                # Threshold the x0 estimate, then rebuild the noise consistent with it.
                sigma_t = self.sigmas[self.step_index]
                x0_pred = sample - sigma_t * model_output
                x0_pred = self._threshold_sample(x0_pred)
                epsilon = model_output + x0_pred

            return epsilon
    def multistep_uni_p_bh_update(
        self,
        model_output: torch.Tensor,
        *args,
        sample: Optional[torch.Tensor] = None,
        order: Optional[int] = None,  # pyright: ignore
        **kwargs,
    ) -> torch.Tensor:
        """
        One step for the UniP (B(h) version) predictor. Alternatively, `self.solver_p` is used if it is specified.

        Args:
            model_output (`torch.Tensor`):
                The direct output from the learned diffusion model at the current timestep
                (only used when delegating to `self.solver_p`; the multistep update itself
                reads the converted outputs cached in `self.model_outputs`).
            sample (`torch.Tensor`):
                A current instance of a sample created by the diffusion process.
            order (`int`):
                The order of UniP at this timestep (corresponds to the *p* in UniPC-p).

        Returns:
            `torch.Tensor`:
                The sample tensor at the previous timestep.
        """
        # Back-compat shim for the removed positional `prev_timestep` argument.
        prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None)
        if sample is None:
            if len(args) > 1:
                sample = args[1]
            else:
                raise ValueError(" missing `sample` as a required keyward argument")
        if order is None:
            if len(args) > 2:
                order = args[2]
            else:
                raise ValueError(" missing `order` as a required keyward argument")
        if prev_timestep is not None:
            deprecate(
                "prev_timestep",
                "1.0.0",
                "Passing `prev_timestep` is deprecated "
                "and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
            )
        model_output_list = self.model_outputs

        s0 = self.timestep_list[-1]
        m0 = model_output_list[-1]
        x = sample

        # Optional delegation: if a companion predictor scheduler was supplied,
        # let it take the step (the algorithm then becomes solver_p + UniC).
        if self.solver_p:
            x_t = self.solver_p.step(model_output, s0, x).prev_sample
            return x_t

        # Noise levels of the target (t) and the most recent source (s0) steps.
        sigma_t, sigma_s0 = (self.sigmas[self.step_index + 1], self.sigmas[self.step_index])  # pyright: ignore
        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)

        # Half log-SNR values; h is the integration interval in lambda-space.
        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)

        h = lambda_t - lambda_s0
        device = sample.device

        # rks: relative lambda offsets of the older cached outputs; D1s: scaled
        # divided differences of the cached model outputs.
        rks = []
        D1s: Optional[List[Any]] = []
        for i in range(1, order):
            si = self.step_index - i  # pyright: ignore
            mi = model_output_list[-(i + 1)]
            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
            rk = (lambda_si - lambda_s0) / h
            rks.append(rk)
            D1s.append((mi - m0) / rk)  # type: ignore

        rks.append(1.0)
        rks = torch.tensor(rks, device=device)

        # Build the linear system R @ rhos = b whose solution gives the UniP weights.
        R = []
        b = []

        hh = -h if self.predict_x0 else h
        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
        h_phi_k = h_phi_1 / hh - 1

        factorial_i = 1

        if self.config.solver_type == "bh1":
            B_h = hh
        elif self.config.solver_type == "bh2":
            B_h = torch.expm1(hh)
        else:
            raise NotImplementedError()

        for i in range(1, order + 1):
            R.append(torch.pow(rks, i - 1))
            b.append(h_phi_k * factorial_i / B_h)
            factorial_i *= i + 1
            h_phi_k = h_phi_k / hh - 1 / factorial_i

        R = torch.stack(R)
        b = torch.tensor(b, device=device)

        if len(D1s) > 0:  # type: ignore
            D1s = torch.stack(D1s, dim=1)  # (B, K)
            # for order 2, we use a simplified version
            if order == 2:
                rhos_p = torch.tensor([0.5], dtype=x.dtype, device=device)
            else:
                rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1]).to(device).to(x.dtype)  # type: ignore
        else:
            D1s = None

        # Combine the base Euler-like term with the higher-order correction.
        if self.predict_x0:
            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
            if D1s is not None:
                pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s)  # pyright: ignore
            else:
                pred_res = 0
            x_t = x_t_ - alpha_t * B_h * pred_res
        else:
            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
            if D1s is not None:
                pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s)  # pyright: ignore
            else:
                pred_res = 0
            x_t = x_t_ - sigma_t * B_h * pred_res

        x_t = x_t.to(x.dtype)
        return x_t
    def multistep_uni_c_bh_update(
        self,
        this_model_output: torch.Tensor,
        *args,
        last_sample: torch.Tensor = None,
        this_sample: torch.Tensor = None,
        order: Optional[int] = None,  # pyright: ignore
        **kwargs,
    ) -> torch.Tensor:
        """
        One step for the UniC (B(h) version) corrector: refine the predictor result
        `this_sample` using the fresh model output evaluated at it.

        Args:
            this_model_output (`torch.Tensor`):
                The model outputs at `x_t`.
            last_sample (`torch.Tensor`):
                The generated sample before the last predictor `x_{t-1}`.
            this_sample (`torch.Tensor`):
                The generated sample after the last predictor `x_{t}`.
            order (`int`):
                The `p` of UniC-p at this step. The effective order of accuracy should be `order + 1`.

        Returns:
            `torch.Tensor`:
                The corrected sample tensor at the current timestep.
        """
        # Back-compat shim for the removed positional `this_timestep` argument.
        this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None)
        if last_sample is None:
            if len(args) > 1:
                last_sample = args[1]
            else:
                raise ValueError(" missing`last_sample` as a required keyward argument")
        if this_sample is None:
            if len(args) > 2:
                this_sample = args[2]
            else:
                raise ValueError(" missing`this_sample` as a required keyward argument")
        if order is None:
            if len(args) > 3:
                order = args[3]
            else:
                raise ValueError(" missing`order` as a required keyward argument")
        if this_timestep is not None:
            deprecate(
                "this_timestep",
                "1.0.0",
                "Passing `this_timestep` is deprecated "
                "and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
            )

        model_output_list = self.model_outputs

        m0 = model_output_list[-1]
        x = last_sample
        x_t = this_sample
        model_t = this_model_output

        # The corrector integrates from the previous index (s0) to the current one (t).
        sigma_t, sigma_s0 = (self.sigmas[self.step_index], self.sigmas[self.step_index - 1])  # pyright: ignore
        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)

        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)

        h = lambda_t - lambda_s0
        device = this_sample.device

        # rks / D1s: same construction as in the predictor, but indexed one step
        # further back because the newest cached output already lives at s0.
        rks = []
        D1s: Optional[List[Any]] = []
        for i in range(1, order):
            si = self.step_index - (i + 1)  # pyright: ignore
            mi = model_output_list[-(i + 1)]
            alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
            lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
            rk = (lambda_si - lambda_s0) / h
            rks.append(rk)
            D1s.append((mi - m0) / rk)  # type: ignore

        rks.append(1.0)
        rks = torch.tensor(rks, device=device)

        # Linear system R @ rhos = b yielding the UniC combination weights.
        R = []
        b = []

        hh = -h if self.predict_x0 else h
        h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
        h_phi_k = h_phi_1 / hh - 1

        factorial_i = 1

        if self.config.solver_type == "bh1":
            B_h = hh
        elif self.config.solver_type == "bh2":
            B_h = torch.expm1(hh)
        else:
            raise NotImplementedError()

        for i in range(1, order + 1):
            R.append(torch.pow(rks, i - 1))
            b.append(h_phi_k * factorial_i / B_h)
            factorial_i *= i + 1
            h_phi_k = h_phi_k / hh - 1 / factorial_i

        R = torch.stack(R)
        b = torch.tensor(b, device=device)

        if len(D1s) > 0:  # type: ignore
            D1s = torch.stack(D1s, dim=1)
        else:
            D1s = None

        # for order 1, we use a simplified version
        if order == 1:
            rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device)
        else:
            rhos_c = torch.linalg.solve(R, b).to(device).to(x.dtype)

        # Apply the correction: base term + weighted history residual + weighted
        # residual of the fresh model output (D1_t).
        if self.predict_x0:
            x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
            if D1s is not None:
                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
            else:
                corr_res = 0
            D1_t = model_t - m0
            x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t)
        else:
            x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
            if D1s is not None:
                corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
            else:
                corr_res = 0
            D1_t = model_t - m0
            x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t)
        x_t = x_t.to(x.dtype)
        return x_t
+ def index_for_timestep(self, timestep, schedule_timesteps=None):
601
+ if schedule_timesteps is None:
602
+ schedule_timesteps = self.timesteps
603
+
604
+ indices = (schedule_timesteps == timestep).nonzero()
605
+
606
+ # The sigma index that is taken for the **very** first `step`
607
+ # is always the second index (or the last index if there is only 1)
608
+ # This way we can ensure we don't accidentally skip a sigma in
609
+ # case we start in the middle of the denoising schedule (e.g. for image-to-image)
610
+ pos = 1 if len(indices) > 1 else 0
611
+
612
+ return indices[pos].item()
613
+
614
+ # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
615
+ def _init_step_index(self, timestep):
616
+ """
617
+ Initialize the step_index counter for the scheduler.
618
+ """
619
+
620
+ if self.begin_index is None:
621
+ if isinstance(timestep, torch.Tensor):
622
+ timestep = timestep.to(self.timesteps.device)
623
+ self._step_index = self.index_for_timestep(timestep)
624
+ else:
625
+ self._step_index = self._begin_index
626
+
627
    def step(
        self,
        model_output: torch.Tensor,
        timestep: Union[int, torch.Tensor],
        sample: torch.Tensor,
        return_dict: bool = True,
        generator=None,
    ) -> Union[SchedulerOutput, Tuple]:
        """
        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
        the multistep UniPC: first (optionally) correct the previous predictor result with the fresh model output
        (UniC), then take the next predictor step (UniP).

        Args:
            model_output (`torch.Tensor`):
                The direct output from learned diffusion model.
            timestep (`int`):
                The current discrete timestep in the diffusion chain.
            sample (`torch.Tensor`):
                A current instance of a sample created by the diffusion process.
            return_dict (`bool`):
                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.

        Returns:
            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
                tuple is returned where the first element is the sample tensor.

        Raises:
            ValueError: if `set_timesteps` has not been called before stepping.
        """
        if self.num_inference_steps is None:
            raise ValueError(
                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
            )

        # Lazily initialize the internal step counter on the first call.
        if self.step_index is None:  # type: ignore
            self._init_step_index(timestep)

        # The corrector needs a previous predictor result and must not be
        # explicitly disabled for the previous step index.
        use_corrector = (
            self.step_index > 0
            and self.step_index - 1 not in self.disable_corrector
            and self.last_sample is not None  # pyright: ignore
        )

        model_output_convert = self.convert_model_output(model_output, sample=sample)
        if use_corrector:
            # UniC: refine the current sample using the fresh model output.
            sample = self.multistep_uni_c_bh_update(
                this_model_output=model_output_convert, last_sample=self.last_sample, this_sample=sample, order=self.this_order
            )

        # Shift the rolling history buffers left and append the newest entry.
        for i in range(self.config.solver_order - 1):
            self.model_outputs[i] = self.model_outputs[i + 1]
            self.timestep_list[i] = self.timestep_list[i + 1]

        self.model_outputs[-1] = model_output_convert
        self.timestep_list[-1] = timestep  # pyright: ignore

        # Near the end of the schedule, cap the order by the steps remaining.
        if self.config.lower_order_final:
            this_order = min(self.config.solver_order, len(self.timesteps) - self.step_index)  # pyright: ignore
        else:
            this_order = self.config.solver_order

        self.this_order = min(this_order, self.lower_order_nums + 1)  # warmup for multistep
        assert self.this_order > 0

        self.last_sample = sample
        # UniP: predictor step to the next noise level.
        prev_sample = self.multistep_uni_p_bh_update(
            model_output=model_output,  # pass the original non-converted model output, in case solver-p is used
            sample=sample,
            order=self.this_order,
        )

        if self.lower_order_nums < self.config.solver_order:
            self.lower_order_nums += 1

        # upon completion increase step index by one
        self._step_index += 1  # pyright: ignore

        if not return_dict:
            return (prev_sample,)

        return SchedulerOutput(prev_sample=prev_sample)
    def step_ddim(
        # https://github.com/yifan123/flow_grpo/blob/main/flow_grpo/diffusers_patch/sd3_sde_with_logprob.py
        self,
        velocity: torch.FloatTensor,
        t: int,
        curr_state: torch.FloatTensor,
        prev_state: Optional[torch.FloatTensor] = None,
        generator: Optional[torch.Generator] = None,
    ):
        """
        Single re-noising step: estimate the clean sample from the current state and
        the predicted velocity, then re-noise that estimate to the next noise level
        `self.sigmas[t + 1]` with freshly drawn Gaussian noise.

        Args:
            velocity (`torch.FloatTensor`):
                The flow/velocity prediction from the model at sigma index `t`.
            t (`int`):
                Index into `self.sigmas` for the current step; `t + 1` must be valid.
            curr_state (`torch.FloatTensor`):
                The current noisy sample.
            prev_state (`torch.FloatTensor`, *optional*):
                NOTE(review): accepted but ignored — it is unconditionally overwritten
                below. Confirm whether it should act as a pass-through like in `step_sde`.
            generator (`torch.Generator`, *optional*):
                A random number generator for the fresh noise draw.

        Returns:
            `torch.FloatTensor`: the sample at noise level `self.sigmas[t + 1]`.
        """
        device = curr_state.device
        curr_t = self.sigmas[t]
        prev_t = self.sigmas[t + 1]
        variance_noise = randn_tensor(curr_state.shape, generator=generator, device=device, dtype=curr_state.dtype)
        # Clean-sample estimate under rectified flow: x0 = x_t - sigma_t * velocity.
        cur_clean_ = curr_state - curr_t * velocity
        # Re-noise the estimate to the next level: x_{t'} = sigma_{t'} * eps + (1 - sigma_{t'}) * x0.
        prev_state = prev_t * variance_noise + (1 - prev_t) * cur_clean_

        return prev_state
    def step_sde(
        # https://github.com/yifan123/flow_grpo/blob/main/flow_grpo/diffusers_patch/sd3_sde_with_logprob.py
        self,
        velocity: torch.FloatTensor,
        t: int,
        curr_state: torch.FloatTensor,
        noise_theta: float = 1.0,
        prev_state: Optional[torch.FloatTensor] = None,
        generator: Optional[torch.Generator] = None,
    ):
        """
        Single stochastic (SDE-style) flow step: compute the mean of the next state
        from the current state and predicted velocity, then either add scaled fresh
        noise or re-center a supplied `prev_state` around the new mean.

        Args:
            velocity (`torch.FloatTensor`): (B, C, T, H, W)
                The direct output from learned flow model.
            t (`int`):
                Index into `self.sigmas` for the current step; `t + 1` must be valid.
            curr_state (`torch.FloatTensor`): (B, C, T, H, W)
                A current instance of a sample created by the diffusion process.
            noise_theta (`float`, defaults to 1.0):
                Interpolates between deterministic flow matching (0) and the fully
                stochastic step (1) via cos/sin of `noise_theta * pi / 2`.
            prev_state (`torch.FloatTensor`, *optional*):
                If given, no fresh noise is drawn; the result is `prev_state` shifted so
                its mean matches `prev_sample_mean` (the mean's gradient is kept while
                the offset is detached — appears to be a straight-through-style
                re-centering for RL fine-tuning; confirm against the linked reference).
            generator (`torch.Generator`, *optional*):
                A random number generator.

        Returns:
            `torch.FloatTensor`: the sample at noise level `self.sigmas[t + 1]`.
        """
        device = curr_state.device
        curr_t = self.sigmas[t]
        prev_t = self.sigmas[t + 1]
        cos = torch.cos(torch.tensor(noise_theta) * torch.pi / 2).to(device)  # if noise_theta is 0, it degenerates to standard flow matching
        sin = torch.sin(torch.tensor(noise_theta) * torch.pi / 2).to(device)
        # Mean of the next state; with cos=1, sin=0 this reduces to the deterministic
        # update x - curr_t * v + prev_t * v.
        prev_sample_mean = (1 - prev_t + prev_t * cos) * (curr_state - curr_t * velocity) + prev_t * cos * velocity
        std_dev_t = prev_t * sin
        std_dev_t = torch.ones((1, 1)).to(curr_state) * std_dev_t
        if prev_state is None:
            variance_noise = randn_tensor(curr_state.shape, generator=generator, device=device, dtype=curr_state.dtype)
            prev_state = prev_sample_mean + std_dev_t * variance_noise
        else:
            prev_state = prev_sample_mean + (prev_state - prev_sample_mean.detach())

        return prev_state
+ def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
787
+ """
788
+ Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
789
+ current timestep.
790
+
791
+ Args:
792
+ sample (`torch.Tensor`):
793
+ The input sample.
794
+
795
+ Returns:
796
+ `torch.Tensor`:
797
+ A scaled input sample.
798
+ """
799
+ return sample
800
+
801
    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise
    def add_noise(self, original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
        """Noise clean samples to the levels given by `timesteps`.

        Returns `alpha_t * original_samples + sigma_t * noise` with per-sample
        coefficients looked up from the schedule.
        """
        # Make sure sigmas and timesteps have the same device and dtype as original_samples
        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
            # mps does not support float64
            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
        else:
            schedule_timesteps = self.timesteps.to(original_samples.device)
            timesteps = timesteps.to(original_samples.device)

        # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
        if self.begin_index is None:
            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
        elif self.step_index is not None:
            # add_noise is called after first denoising step (for inpainting)
            step_indices = [self.step_index] * timesteps.shape[0]
        else:
            # add noise is called before first denoising step to create initial latent(img2img)
            step_indices = [self.begin_index] * timesteps.shape[0]

        # Broadcast the per-sample sigma over the sample's trailing dimensions.
        sigma = sigmas[step_indices].flatten()
        while len(sigma.shape) < len(original_samples.shape):
            sigma = sigma.unsqueeze(-1)

        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
        noisy_samples = alpha_t * original_samples + sigma_t * noise
        return noisy_samples
+ def __len__(self):
832
+ return self.config.num_train_timesteps