Spaces:
Running on Zero
Running on Zero
File size: 6,406 Bytes
08c5e28 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 | from dataclasses import dataclass, replace
from typing import Protocol
import torch
from torch._prims_common import DeviceLikeType
from ltx_core.components.patchifiers import (
AudioLatentShape,
AudioPatchifier,
VideoLatentPatchifier,
VideoLatentShape,
get_pixel_coords,
)
from ltx_core.components.protocols import Patchifier
from ltx_core.types import LatentState, SpatioTemporalScaleFactors
# Default spatio-temporal scale factors, evaluated once at import time.
# Used as the default value of VideoLatentTools.scale_factors.
DEFAULT_SCALE_FACTORS = SpatioTemporalScaleFactors.default()
class LatentTools(Protocol):
    """
    Shared helpers for assembling and reshaping latent states.

    Implementers provide ``patchifier`` and ``target_shape`` and define
    ``create_initial_state``; the other methods are written purely in terms of
    those two attributes.
    """

    patchifier: Patchifier
    target_shape: VideoLatentShape | AudioLatentShape

    def create_initial_state(
        self,
        device: DeviceLikeType,
        dtype: torch.dtype,
        initial_latent: torch.Tensor | None = None,
    ) -> LatentState:
        """
        Build the starting latent state. When ``initial_latent`` is given, the state is built from it.
        """
        ...

    def patchify(self, latent_state: LatentState) -> LatentState:
        """
        Patchify ``latent``, ``clean_latent`` and ``denoise_mask`` of a clone of the given state.

        Raises:
            ValueError: if the latent's shape differs from ``target_shape.to_torch_shape()``.
        """
        expected_shape = self.target_shape.to_torch_shape()
        actual_shape = latent_state.latent.shape
        if actual_shape != expected_shape:
            raise ValueError(f"Latent state has shape {actual_shape}, expected shape is {expected_shape}")

        state = latent_state.clone()
        # Same patchifier call for each field, in a fixed order.
        patchified = {
            field: self.patchifier.patchify(getattr(state, field))
            for field in ("latent", "clean_latent", "denoise_mask")
        }
        return replace(state, **patchified)

    def unpatchify(self, latent_state: LatentState) -> LatentState:
        """
        Unpatchify ``latent``, ``clean_latent`` and ``denoise_mask`` of a clone of the given state.
        """
        state = latent_state.clone()
        full_shape = self.target_shape
        restored = {
            "latent": self.patchifier.unpatchify(state.latent, output_shape=full_shape),
            "clean_latent": self.patchifier.unpatchify(state.clean_latent, output_shape=full_shape),
            # The mask is restored to the mask variant of the target shape.
            "denoise_mask": self.patchifier.unpatchify(state.denoise_mask, output_shape=full_shape.mask_shape()),
        }
        return replace(state, **restored)

    def clear_conditioning(self, latent_state: LatentState) -> LatentState:
        """
        Strip conditioning from the latent state by keeping only the leading target tokens.

        Everything past the target token count is dropped, so conditioning items must append their
        extra tokens ONLY at the end of the latent. The returned state has an all-ones denoise mask
        and no attention mask.
        """
        state = latent_state.clone()
        keep = self.patchifier.get_token_count(self.target_shape)
        all_ones = torch.ones_like(state.denoise_mask)
        return LatentState(
            latent=state.latent[:, :keep],
            denoise_mask=all_ones[:, :keep],
            positions=state.positions[:, :, :keep],
            clean_latent=state.clean_latent[:, :keep],
            attention_mask=None,
        )
@dataclass(frozen=True)
class VideoLatentTools(LatentTools):
    """
    Latent-state helpers specialised for video latents.
    """

    patchifier: VideoLatentPatchifier
    target_shape: VideoLatentShape
    fps: float
    scale_factors: SpatioTemporalScaleFactors = DEFAULT_SCALE_FACTORS
    causal_fix: bool = True

    def create_initial_state(
        self,
        device: DeviceLikeType,
        dtype: torch.dtype,
        initial_latent: torch.Tensor | None = None,
    ) -> LatentState:
        """
        Build the patchified starting state for a video latent.

        Without ``initial_latent`` a zero latent of the target shape is allocated on ``device`` with
        ``dtype``. A supplied ``initial_latent`` is used as-is and must already have the target shape.
        The denoise mask is all ones (float32); positions are pixel coordinates cast to ``dtype``.
        """
        latent_shape = self.target_shape.to_torch_shape()
        if initial_latent is None:
            initial_latent = torch.zeros(*latent_shape, device=device, dtype=dtype)
        else:
            # NOTE: asserts are stripped under ``python -O``; patchify() below re-checks the shape
            # and raises ValueError in that case.
            assert initial_latent.shape == latent_shape, (
                f"Latent shape {initial_latent.shape} does not match target shape {latent_shape}"
            )

        mask_shape = self.target_shape.mask_shape().to_torch_shape()
        all_ones_mask = torch.ones(*mask_shape, device=device, dtype=torch.float32)

        grid_bounds = self.patchifier.get_patch_grid_bounds(
            output_shape=self.target_shape,
            device=device,
        )
        pixel_coords = get_pixel_coords(
            latent_coords=grid_bounds,
            scale_factors=self.scale_factors,
            causal_fix=self.causal_fix,
        ).float()
        # Index 0 along dim 1 is divided by fps; presumably this is the temporal axis,
        # converting frame units to seconds. TODO confirm against get_pixel_coords.
        first_axis = pixel_coords[:, 0, ...]
        pixel_coords[:, 0, ...] = first_axis / self.fps

        unpatchified = LatentState(
            latent=initial_latent,
            denoise_mask=all_ones_mask,
            positions=pixel_coords.to(dtype),
            clean_latent=initial_latent.clone(),
        )
        return self.patchify(unpatchified)
@dataclass(frozen=True)
class AudioLatentTools(LatentTools):
    """
    Latent-state helpers specialised for audio latents.
    """

    patchifier: AudioPatchifier
    target_shape: AudioLatentShape

    def create_initial_state(
        self,
        device: DeviceLikeType,
        dtype: torch.dtype,
        initial_latent: torch.Tensor | None = None,
    ) -> LatentState:
        """
        Build the patchified starting state for an audio latent.

        Without ``initial_latent`` a zero latent of the target shape is allocated on ``device`` with
        ``dtype``. A supplied ``initial_latent`` is used as-is and must already have the target shape.
        The denoise mask is all ones (float32); positions are the patchifier's patch grid bounds.
        """
        latent_shape = self.target_shape.to_torch_shape()
        if initial_latent is None:
            initial_latent = torch.zeros(*latent_shape, device=device, dtype=dtype)
        else:
            # NOTE: asserts are stripped under ``python -O``; patchify() below re-checks the shape
            # and raises ValueError in that case.
            assert initial_latent.shape == latent_shape, (
                f"Latent shape {initial_latent.shape} does not match target shape {latent_shape}"
            )

        mask_shape = self.target_shape.mask_shape().to_torch_shape()
        all_ones_mask = torch.ones(*mask_shape, device=device, dtype=torch.float32)

        grid_bounds = self.patchifier.get_patch_grid_bounds(
            output_shape=self.target_shape,
            device=device,
        )

        unpatchified = LatentState(
            latent=initial_latent,
            denoise_mask=all_ones_mask,
            positions=grid_bounds,
            clean_latent=initial_latent.clone(),
        )
        return self.patchify(unpatchified)
|