Spaces:
Running on Zero
Running on Zero
feat(modes): A2V + Lipsync + Keyframe + Style parameterize_fn
Browse files- modes.py +149 -0
- tests/test_modes.py +53 -0
modes.py
CHANGED
|
@@ -79,6 +79,46 @@ I2V_NODE_FPS = 5445
|
|
| 79 |
I2V_NODE_CLIP_LENGTH = 196
|
| 80 |
I2V_NODE_IMAGE = 149 # LoadImage "Load Image1" — wv[0] = filename
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
def _frames_to_seconds(frames: int, fps: int) -> int:
|
| 84 |
"""Convert (frames, fps) to integer seconds for the mxSlider clip-length widget.
|
|
@@ -115,6 +155,50 @@ def _i2v_parameterize(inp: dict[str, Any]) -> list[Patch]:
|
|
| 115 |
]
|
| 116 |
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
_T2V_STAGES = [
|
| 119 |
Stage("Encode prompt", 5),
|
| 120 |
Stage("Diffusion (Stage 1)", 60),
|
|
@@ -132,6 +216,43 @@ _I2V_STAGES = [
|
|
| 132 |
Stage("Decode video", 10),
|
| 133 |
]
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
MODE_REGISTRY["t2v"] = Mode(
|
| 136 |
name="t2v",
|
| 137 |
label="Text → Video",
|
|
@@ -146,3 +267,31 @@ MODE_REGISTRY["i2v"] = Mode(
|
|
| 146 |
parameterize_fn=_i2v_parameterize,
|
| 147 |
stage_map=_I2V_STAGES,
|
| 148 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
I2V_NODE_CLIP_LENGTH = 196
|
| 80 |
I2V_NODE_IMAGE = 149 # LoadImage "Load Image1" — wv[0] = filename
|
| 81 |
|
| 82 |
+
# Node ids for the mode-specific media inputs, read out of
# workflows/{a2v,lipsync,keyframe,style}.json on 2026-04-30. The Loaders group
# is shared across modes, so all four templates contain the same node ids for
# these inputs; each mode's pipeline wires up only the subset it needs.
#
# Caveat: VHS_LoadAudioUpload and VHS_LoadVideo carry dict-style widgets_values
# keyed by "audio"/"video", while the current set_input helper is list-indexed.
# Passing widget_index=0 against such a dict adds a numeric "0" key and leaves
# the canonical "audio"/"video" entry untouched, so the runtime file-path swap
# is not wired yet. Task 12 only validates the patch tuple set; real path
# injection lands when backend.py grows file-staging in Task 17.

# --- Audio → Video ---------------------------------------------------------
A2V_NODE_PROMPT = 5536
A2V_NODE_NEG_PROMPT = 5537
A2V_NODE_WIDTH = 5383
A2V_NODE_HEIGHT = 5382
A2V_NODE_FPS = 5445
A2V_NODE_CLIP_LENGTH = 196
A2V_NODE_AUDIO = 5400  # VHS_LoadAudioUpload; widgets_values is a dict keyed by "audio"

# --- Lipsync ---------------------------------------------------------------
LIPSYNC_NODE_PROMPT = 5536
LIPSYNC_NODE_NEG_PROMPT = 5537
LIPSYNC_NODE_FPS = 5445
LIPSYNC_NODE_CLIP_LENGTH = 196
LIPSYNC_NODE_IMAGE = 149  # LoadImage "Load Image1"; filename is wv[0]
LIPSYNC_NODE_AUDIO = 5400  # VHS_LoadAudioUpload; widgets_values is a dict keyed by "audio"

# --- Keyframe --------------------------------------------------------------
KEYFRAME_NODE_PROMPT = 5536
KEYFRAME_NODE_NEG_PROMPT = 5537
KEYFRAME_NODE_FPS = 5445
KEYFRAME_NODE_CLIP_LENGTH = 196
KEYFRAME_NODE_FIRST_FRAME = 149  # LoadImage "Load Image1"; filename is wv[0]
KEYFRAME_NODE_LAST_FRAME = 5437  # LoadImage "Load Image2"; filename is wv[0]

# --- Style transfer --------------------------------------------------------
STYLE_NODE_PROMPT = 5536
STYLE_NODE_NEG_PROMPT = 5537
STYLE_NODE_FPS = 5445
STYLE_NODE_CLIP_LENGTH = 196
STYLE_NODE_INPUT_VIDEO = 5444  # VHS_LoadVideo; widgets_values is a dict keyed by "video"
|
| 121 |
+
|
| 122 |
|
| 123 |
def _frames_to_seconds(frames: int, fps: int) -> int:
|
| 124 |
"""Convert (frames, fps) to integer seconds for the mxSlider clip-length widget.
|
|
|
|
| 155 |
]
|
| 156 |
|
| 157 |
|
| 158 |
+
def _a2v_parameterize(inp: dict[str, Any]) -> list[Patch]:
    """Translate Audio → Video inputs into workflow patches.

    Each patch is a ``(node_id, widget_index, value)`` tuple; widget index 0
    is used for every node.

    Args:
        inp: Raw mode inputs. ``prompt``, ``audio``, ``width``, ``height``,
            ``fps`` and ``frames`` are required; ``negative_prompt`` falls
            back to ``""`` when absent.

    Returns:
        Patches ordered as prompt, negative prompt, audio, width, height,
        fps, clip length.

    Raises:
        KeyError: If a required key is missing from ``inp``.
    """
    prompt = inp["prompt"]
    negative_prompt = inp.get("negative_prompt", "")
    audio = inp["audio"]
    width = int(inp["width"])
    height = int(inp["height"])
    fps = int(inp["fps"])
    # The clip-length widget takes seconds, so the frame count is converted.
    clip_length = _frames_to_seconds(int(inp["frames"]), fps)

    return [
        (A2V_NODE_PROMPT, 0, prompt),
        (A2V_NODE_NEG_PROMPT, 0, negative_prompt),
        # VHS_LoadAudioUpload keeps dict-style widgets_values, so this index-0
        # patch is validated but does not yet swap in the real file path.
        (A2V_NODE_AUDIO, 0, audio),
        (A2V_NODE_WIDTH, 0, width),
        (A2V_NODE_HEIGHT, 0, height),
        (A2V_NODE_FPS, 0, fps),
        (A2V_NODE_CLIP_LENGTH, 0, clip_length),
    ]
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _lipsync_parameterize(inp: dict[str, Any]) -> list[Patch]:
    """Translate Lipsync inputs into workflow patches.

    Each patch is a ``(node_id, widget_index, value)`` tuple; widget index 0
    is used for every node.

    Args:
        inp: Raw mode inputs. ``prompt``, ``image``, ``audio``, ``fps`` and
            ``frames`` are required; ``negative_prompt`` falls back to ``""``
            when absent.

    Returns:
        Patches ordered as prompt, negative prompt, image, audio, fps,
        clip length.

    Raises:
        KeyError: If a required key is missing from ``inp``.
    """
    prompt = inp["prompt"]
    negative_prompt = inp.get("negative_prompt", "")
    image = inp["image"]
    audio = inp["audio"]
    fps = int(inp["fps"])
    # The clip-length widget takes seconds, so the frame count is converted.
    clip_length = _frames_to_seconds(int(inp["frames"]), fps)

    return [
        (LIPSYNC_NODE_PROMPT, 0, prompt),
        (LIPSYNC_NODE_NEG_PROMPT, 0, negative_prompt),
        (LIPSYNC_NODE_IMAGE, 0, image),
        # VHS_LoadAudioUpload keeps dict-style widgets_values, so this index-0
        # patch is validated but does not yet swap in the real file path.
        (LIPSYNC_NODE_AUDIO, 0, audio),
        (LIPSYNC_NODE_FPS, 0, fps),
        (LIPSYNC_NODE_CLIP_LENGTH, 0, clip_length),
    ]
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def _keyframe_parameterize(inp: dict[str, Any]) -> list[Patch]:
    """Translate Keyframe → Video inputs into workflow patches.

    Each patch is a ``(node_id, widget_index, value)`` tuple; widget index 0
    is used for every node.

    Args:
        inp: Raw mode inputs. ``prompt``, ``first_frame``, ``last_frame``,
            ``fps`` and ``frames`` are required; ``negative_prompt`` falls
            back to ``""`` when absent.

    Returns:
        Patches ordered as prompt, negative prompt, first frame, last frame,
        fps, clip length.

    Raises:
        KeyError: If a required key is missing from ``inp``.
    """
    prompt = inp["prompt"]
    negative_prompt = inp.get("negative_prompt", "")
    first_frame = inp["first_frame"]
    last_frame = inp["last_frame"]
    fps = int(inp["fps"])
    # The clip-length widget takes seconds, so the frame count is converted.
    clip_length = _frames_to_seconds(int(inp["frames"]), fps)

    return [
        (KEYFRAME_NODE_PROMPT, 0, prompt),
        (KEYFRAME_NODE_NEG_PROMPT, 0, negative_prompt),
        (KEYFRAME_NODE_FIRST_FRAME, 0, first_frame),
        (KEYFRAME_NODE_LAST_FRAME, 0, last_frame),
        (KEYFRAME_NODE_FPS, 0, fps),
        (KEYFRAME_NODE_CLIP_LENGTH, 0, clip_length),
    ]
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def _style_parameterize(inp: dict[str, Any]) -> list[Patch]:
    """Translate Style Transfer inputs into workflow patches.

    Each patch is a ``(node_id, widget_index, value)`` tuple; widget index 0
    is used for every node.

    Args:
        inp: Raw mode inputs. ``prompt``, ``input_video``, ``fps`` and
            ``frames`` are required; ``negative_prompt`` falls back to ``""``
            when absent.

    Returns:
        Patches ordered as prompt, negative prompt, input video, fps,
        clip length.

    Raises:
        KeyError: If a required key is missing from ``inp``.
    """
    prompt = inp["prompt"]
    negative_prompt = inp.get("negative_prompt", "")
    input_video = inp["input_video"]
    fps = int(inp["fps"])
    # The clip-length widget takes seconds, so the frame count is converted.
    clip_length = _frames_to_seconds(int(inp["frames"]), fps)

    return [
        (STYLE_NODE_PROMPT, 0, prompt),
        (STYLE_NODE_NEG_PROMPT, 0, negative_prompt),
        # VHS_LoadVideo keeps dict-style widgets_values, so this index-0 patch
        # is validated but does not yet swap in the real file path.
        (STYLE_NODE_INPUT_VIDEO, 0, input_video),
        (STYLE_NODE_FPS, 0, fps),
        (STYLE_NODE_CLIP_LENGTH, 0, clip_length),
    ]
|
| 200 |
+
|
| 201 |
+
|
| 202 |
_T2V_STAGES = [
|
| 203 |
Stage("Encode prompt", 5),
|
| 204 |
Stage("Diffusion (Stage 1)", 60),
|
|
|
|
| 216 |
Stage("Decode video", 10),
|
| 217 |
]
|
| 218 |
|
| 219 |
+
# Stage tables for the four newer modes. The numbers in each table add up to
# 100; presumably they are percentage weights used for progress reporting —
# confirm against the Stage definition.

_A2V_STAGES = [
    Stage(name, weight)
    for name, weight in (
        ("Encode prompt", 5),
        ("Encode audio", 5),
        ("Diffusion (Stage 1)", 55),
        ("Spatial upscale", 7),
        ("Diffusion (Stage 2)", 18),
        ("Decode video", 10),
    )
]

_LIPSYNC_STAGES = [
    Stage(name, weight)
    for name, weight in (
        ("Encode prompt", 5),
        ("Encode image", 3),
        ("Encode audio", 5),
        ("Diffusion (Stage 1)", 52),
        ("Spatial upscale", 7),
        ("Diffusion (Stage 2)", 18),
        ("Decode video", 10),
    )
]

_KEYFRAME_STAGES = [
    Stage(name, weight)
    for name, weight in (
        ("Encode prompt", 5),
        ("Encode keyframes", 5),
        ("Diffusion (Stage 1)", 55),
        ("Spatial upscale", 7),
        ("Diffusion (Stage 2)", 18),
        ("Decode video", 10),
    )
]

_STYLE_STAGES = [
    Stage(name, weight)
    for name, weight in (
        ("Encode prompt", 5),
        ("Decode source video", 5),
        ("Diffusion (Stage 1)", 55),
        ("Spatial upscale", 7),
        ("Diffusion (Stage 2)", 18),
        ("Decode video", 10),
    )
]
|
| 255 |
+
|
| 256 |
MODE_REGISTRY["t2v"] = Mode(
|
| 257 |
name="t2v",
|
| 258 |
label="Text → Video",
|
|
|
|
| 267 |
parameterize_fn=_i2v_parameterize,
|
| 268 |
stage_map=_I2V_STAGES,
|
| 269 |
)
|
| 270 |
+
# Register the four remaining modes. Each entry pairs the mode's patch builder
# with its stage table; the registry key always equals the Mode's ``name``.

# Audio-driven generation.
MODE_REGISTRY["a2v"] = Mode(
    name="a2v",
    label="Audio → Video",
    icon="🎵",
    parameterize_fn=_a2v_parameterize,
    stage_map=_A2V_STAGES,
)

# Image plus audio.
MODE_REGISTRY["lipsync"] = Mode(
    name="lipsync",
    label="Lipsync",
    icon="👄",
    parameterize_fn=_lipsync_parameterize,
    stage_map=_LIPSYNC_STAGES,
)

# First and last frame.
MODE_REGISTRY["keyframe"] = Mode(
    name="keyframe",
    label="Keyframe → Video",
    icon="🎞",
    parameterize_fn=_keyframe_parameterize,
    stage_map=_KEYFRAME_STAGES,
)

# Source video restyled by the prompt.
MODE_REGISTRY["style"] = Mode(
    name="style",
    label="Style Transfer",
    icon="🎨",
    parameterize_fn=_style_parameterize,
    stage_map=_STYLE_STAGES,
)
|
tests/test_modes.py
CHANGED
|
@@ -46,3 +46,56 @@ def test_t2v_and_i2v_in_registry():
|
|
| 46 |
"""T2V and I2V exist in MODE_REGISTRY (full completeness in Task 12)."""
|
| 47 |
assert "t2v" in modes.MODE_REGISTRY
|
| 48 |
assert "i2v" in modes.MODE_REGISTRY
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
"""T2V and I2V exist in MODE_REGISTRY (full completeness in Task 12)."""
|
| 47 |
assert "t2v" in modes.MODE_REGISTRY
|
| 48 |
assert "i2v" in modes.MODE_REGISTRY
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
@pytest.mark.parametrize("mode_name", ["a2v", "lipsync", "keyframe", "style"])
def test_remaining_modes_parameterize_validates(mode_name, canonical_inputs):
    """Patches built for each newer mode apply to its template and validate."""
    mode_inputs = canonical_inputs[mode_name]
    build_patches = modes.MODE_REGISTRY[mode_name].parameterize_fn
    patches = build_patches(mode_inputs)
    assert len(patches) > 0

    # Apply every patch to a freshly loaded template, then validate the result.
    template = workflow.load_template(mode_name)
    for node_id, widget_index, value in patches:
        workflow.set_input(template, node_id, widget_index, value)
    workflow.validate(template)
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def test_a2v_parameterize_passes_audio_path(canonical_inputs):
    """The A2V audio input is patched onto the audio loader node."""
    inputs = canonical_inputs["a2v"]
    patches = modes.MODE_REGISTRY["a2v"].parameterize_fn(inputs)
    # Compare (node id, value) pairs rather than bare values: a plain
    # membership check on the values would also pass if the audio path were
    # routed to the wrong node.
    routed = [(patch[0], patch[2]) for patch in patches]
    assert (modes.A2V_NODE_AUDIO, inputs["audio"]) in routed
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def test_lipsync_parameterize_passes_image_and_audio(canonical_inputs):
    """Lipsync image and audio inputs each land on their own loader node."""
    inputs = canonical_inputs["lipsync"]
    patches = modes.MODE_REGISTRY["lipsync"].parameterize_fn(inputs)
    # Compare (node id, value) pairs rather than bare values so that a swapped
    # or misrouted image/audio patch fails the test.
    routed = [(patch[0], patch[2]) for patch in patches]
    assert (modes.LIPSYNC_NODE_IMAGE, inputs["image"]) in routed
    assert (modes.LIPSYNC_NODE_AUDIO, inputs["audio"]) in routed
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def test_keyframe_parameterize_passes_two_frames(canonical_inputs):
    """First and last keyframes are patched onto their respective nodes."""
    inputs = canonical_inputs["keyframe"]
    patches = modes.MODE_REGISTRY["keyframe"].parameterize_fn(inputs)
    # Compare (node id, value) pairs rather than bare values so that swapping
    # the first and last frame between the two LoadImage nodes fails the test.
    routed = [(patch[0], patch[2]) for patch in patches]
    assert (modes.KEYFRAME_NODE_FIRST_FRAME, inputs["first_frame"]) in routed
    assert (modes.KEYFRAME_NODE_LAST_FRAME, inputs["last_frame"]) in routed
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def test_style_parameterize_passes_input_video(canonical_inputs):
    """The style-transfer source video is patched onto the video loader node."""
    inputs = canonical_inputs["style"]
    patches = modes.MODE_REGISTRY["style"].parameterize_fn(inputs)
    # Compare (node id, value) pairs rather than bare values: a plain
    # membership check on the values would also pass if the video path were
    # routed to the wrong node.
    routed = [(patch[0], patch[2]) for patch in patches]
    assert (modes.STYLE_NODE_INPUT_VIDEO, inputs["input_video"]) in routed
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def test_mode_registry_has_all_six_keys():
    """The registry holds exactly the six expected mode names."""
    expected = {"t2v", "a2v", "i2v", "lipsync", "keyframe", "style"}
    registered = set(modes.MODE_REGISTRY.keys())
    assert registered == expected
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def test_each_mode_has_required_attributes():
    """Every registered mode has a matching name and non-empty metadata."""
    for key, entry in modes.MODE_REGISTRY.items():
        # The registry key must match the mode's own name.
        assert entry.name == key
        # Label and icon must be non-empty.
        assert entry.label
        assert entry.icon
        assert callable(entry.parameterize_fn)
        stages = entry.stage_map
        assert isinstance(stages, list) and len(stages) > 0
|