ffy2000 committed on
Commit
afd380b
·
1 Parent(s): a14b598

Vendor RIFE into repo

Browse files
app.py CHANGED
The diff for this file is too large to render. See raw diff
 
app_save.py CHANGED
The diff for this file is too large to render. See raw diff
 
app_wrong.py DELETED
@@ -1,2247 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import argparse
4
- import base64
5
- import concurrent.futures
6
- import gc
7
- import json
8
- import os
9
- import random
10
- import subprocess
11
- import threading
12
- import time
13
- import traceback
14
- from collections import deque
15
- from copy import deepcopy
16
- from datetime import datetime
17
- from pathlib import Path
18
- from typing import Optional
19
-
20
- import gradio as gr
21
- import torch
22
- from huggingface_hub import snapshot_download
23
- from safetensors.torch import load_file
24
- from transformers import set_seed
25
- from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
26
-
27
- from common.utils.logging import get_logger
28
- from common.utils.misc import AutoEncoderParams, tuple_mul
29
- from config.config_factory import DataArguments, InferenceArguments, ModelArguments
30
- from data.data_utils import add_special_tokens
31
- from data.dataset_base import DataConfig, simple_custom_collate
32
- from data.datasets_custom import ValidationDataset
33
- from inference_lance import (
34
- PROMPT_JSON_FILENAME,
35
- apply_inference_defaults,
36
- clean_memory,
37
- init_from_model_path_if_needed,
38
- save_prompt_results,
39
- validate_on_fixed_batch,
40
- )
41
- from modeling.lance import Lance, LanceConfig, Qwen2ForCausalLM
42
- from modeling.qwen2 import Qwen2Tokenizer
43
- from modeling.qwen2.modeling_qwen2 import Qwen2Config
44
- from modeling.vae.wan.model import WanVideoVAE
45
- from modeling.vit.qwen2_5_vl_vit import Qwen2_5_VisionTransformerPretrainedModel
46
-
47
-
48
# Directory containing this file; used to locate bundled assets and example files.
REPO_ROOT = Path(__file__).resolve().parent
# Scratch root for this app; overridable through LANCE_GRADIO_TMP_ROOT.
GRADIO_TMP_ROOT = Path(os.getenv("LANCE_GRADIO_TMP_ROOT", "/tmp/lance_gradio")).expanduser()
# Request JSON files written by create_request_json() go here.
TMP_INPUT_DIR = GRADIO_TMP_ROOT / "inputs"
# Parent of the per-run output directories created by build_save_dir().
RESULTS_ROOT = GRADIO_TMP_ROOT / "results"
# Append-only JSON-lines log shared by all runs (see save_generation_record()).
GLOBAL_RECORDS_FILE = GRADIO_TMP_ROOT / "generation_records.jsonl"
# File name of the per-run record written inside each run directory.
RUN_RECORD_FILENAME = "generation_record.json"

# Model base directories: a relative "downloads" folder, or a fixed path used
# by get_model_base_dir() when running on a Space and "downloads" is absent.
LOCAL_MODEL_BASE_DIR = Path("downloads")
SPACE_MODEL_BASE_DIR = Path("/data/lance_models")
DEFAULT_MODEL_REPO_ID = "bytedance-research/Lance"
DEFAULT_MODEL_VARIANT = "video"
MODEL_VARIANT_VIDEO = "video"
MODEL_VARIANT_IMAGE = "image"
# Sub-directory (under the model base dir) holding each variant's weights.
MODEL_VARIANT_TO_DIR = {
    MODEL_VARIANT_VIDEO: "Lance_3B_Video",
    MODEL_VARIANT_IMAGE: "Lance_3B",
}
DEFAULT_MODEL_PATH = LOCAL_MODEL_BASE_DIR / MODEL_VARIANT_TO_DIR[MODEL_VARIANT_VIDEO]
DEFAULT_VIT_TYPE = "qwen_2_5_vl_original"
DEFAULT_TASK = "t2v"
DEFAULT_TIMESTEPS = 30
DEFAULT_TIMESTEP_SHIFT = 3.5
DEFAULT_CFG_TEXT_SCALE = 4.0
DEFAULT_RESOLUTION = "video_848x480"
DEFAULT_IMAGE_RESOLUTION = "image_768x768"
DEFAULT_BASIC_SEED = 42
DEFAULT_HEIGHT = 480
DEFAULT_WIDTH = 848
DEFAULT_IMAGE_SIZE = 768
DEFAULT_VIDEO_DURATION_SECONDS = 5
# Same "12 * seconds + 1" formula as video_seconds_to_num_frames().
DEFAULT_NUM_FRAMES = 12 * DEFAULT_VIDEO_DURATION_SECONDS + 1
DEFAULT_VIDEO_ASPECT_RATIO = "16:9"
DEFAULT_IMAGE_ASPECT_RATIO = "1:1"
FRAME_INTERPOLATION_YES = "Yes"
FRAME_INTERPOLATION_NO = "No"
DEFAULT_FRAME_INTERPOLATION = FRAME_INTERPOLATION_YES
ASPECT_RATIO_CHOICES = ["21:9", "16:9", "3:2", "4:3", "1:1", "3:4", "2:3", "9:16", "9:21"]

# Aspect ratio -> (width, height) for video tasks; callers unpack the tuple
# as ``width, height`` (see update_size_from_aspect_ratio()).
VIDEO_ASPECT_RATIO_TO_SIZE = {
    "21:9": (976, 416),
    "16:9": (848, 480),
    "3:2": (784, 528),
    "4:3": (736, 560),
    "1:1": (640, 640),
    "3:4": (560, 736),
    "2:3": (528, 784),
    "9:16": (480, 848),
    "9:21": (416, 976),
}

# Aspect ratio -> (width, height) for image tasks.
IMAGE_ASPECT_RATIO_TO_SIZE = {
    "21:9": (1168, 496),
    "16:9": (1024, 576),
    "3:2": (944, 624),
    "4:3": (880, 672),
    "1:1": (768, 768),
    "3:4": (672, 880),
    "2:3": (624, 944),
    "9:16": (576, 1024),
    "9:21": (496, 1168),
}
DEFAULT_GPUS = "0"
DEFAULT_QUEUE_SIZE = 32
USE_KVCACHE = True
TEXT_TEMPLATE = True
# Serializes appends to GLOBAL_RECORDS_FILE across threads.
RECORD_WRITE_LOCK = threading.Lock()

# Project links and logo asset. NOTE(review): where these are rendered is not
# visible in this part of the file.
LANCE_HOMEPAGE_URL = "https://lance-project.github.io/"
LANCE_PAPER_URL = "http://arxiv.org/abs/2605.18678"
LANCE_HUGGING_FACE_URL = "https://huggingface.co/bytedance-research/Lance"
LANCE_GITHUB_URL = "https://github.com/bytedance/Lance"
LANCE_LOGO_PATH = REPO_ROOT / "assets" / "logo" / "lance-logo.webp"
120
-
121
- APP_CSS = """
122
- .gradio-container {
123
- max-width: 1680px !important;
124
- margin-left: auto !important;
125
- margin-right: auto !important;
126
- }
127
-
128
- .contain {
129
- max-width: 1680px !important;
130
- margin-left: auto !important;
131
- margin-right: auto !important;
132
- }
133
-
134
- .lance-hero {
135
- text-align: center;
136
- padding: 8px 12px 6px;
137
- }
138
-
139
- .lance-logo {
140
- width: min(160px, 36vw);
141
- height: auto;
142
- display: block;
143
- margin: 0 auto 4px;
144
- }
145
-
146
- .lance-title {
147
- margin: 0 auto 5px;
148
- font-size: clamp(20px, 2.4vw, 30px);
149
- line-height: 1.08;
150
- font-weight: 800;
151
- letter-spacing: 0;
152
- }
153
-
154
- .lance-authors {
155
- margin: 0 auto 6px;
156
- max-width: 1280px;
157
- font-size: 20px;
158
- line-height: 1.24;
159
- color: var(--body-text-color-subdued);
160
- }
161
-
162
- .lance-authors a {
163
- color: inherit;
164
- text-decoration: none;
165
- }
166
-
167
- .lance-authors a:hover {
168
- text-decoration: underline;
169
- }
170
-
171
- .lance-badges {
172
- display: flex;
173
- flex-wrap: wrap;
174
- justify-content: center;
175
- gap: 5px;
176
- margin: 4px auto 0;
177
- }
178
-
179
- .lance-badges a {
180
- line-height: 0;
181
- }
182
-
183
- .lance-badges img {
184
- height: 20px;
185
- width: auto;
186
- display: block;
187
- }
188
-
189
- .lance-status {
190
- max-width: 1180px;
191
- margin: 0 auto 18px;
192
- }
193
-
194
- .task-selector {
195
- overflow-x: auto;
196
- }
197
-
198
- .lance-main-column > label span,
199
- .lance-main-column > .block-title,
200
- .lance-main-column > .label-wrap span,
201
- .lance-main-column > .form > label span,
202
- .lance-main-column > .form > .block-title,
203
- .lance-main-column > .form > .label-wrap span {
204
- font-size: 20px !important;
205
- font-weight: 700 !important;
206
- }
207
-
208
- .task-selector .wrap {
209
- display: grid;
210
- grid-template-columns: repeat(3, minmax(220px, 1fr));
211
- gap: 8px;
212
- min-width: 680px;
213
- }
214
-
215
- .task-selector label {
216
- justify-content: center;
217
- min-height: 38px;
218
- white-space: nowrap;
219
- border-radius: 10px !important;
220
- }
221
-
222
- .task-selector span {
223
- font-size: 20px !important;
224
- }
225
-
226
- .recommended-title {
227
- text-align: center !important;
228
- margin: 14px auto 10px !important;
229
- }
230
-
231
- .recommended-title h3,
232
- .recommended-title p {
233
- text-align: center !important;
234
- font-size: 22px !important;
235
- font-weight: 800 !important;
236
- color: var(--body-text-color) !important;
237
- }
238
-
239
- .example-panel {
240
- margin-top: 14px !important;
241
- padding: 10px 12px !important;
242
- border-radius: 8px !important;
243
- background: rgba(248, 250, 252, 0.72) !important;
244
- border: 1px solid var(--border-color-primary) !important;
245
- }
246
-
247
- .prompt-examples table,
248
- .prompt-examples th,
249
- .prompt-examples td {
250
- border: 1px solid var(--border-color-primary) !important;
251
- }
252
-
253
- .prompt-examples table {
254
- border-collapse: collapse !important;
255
- width: 100% !important;
256
- }
257
-
258
- .prompt-examples td {
259
- border-bottom: 1px solid var(--border-color-primary) !important;
260
- padding: 12px !important;
261
- vertical-align: top !important;
262
- }
263
-
264
- .example-panel th,
265
- .example-panel .block-label,
266
- .example-panel label span,
267
- .example-panel .label-wrap span {
268
- font-size: 18px !important;
269
- font-weight: 700 !important;
270
- }
271
-
272
- .prompt-dataset {
273
- max-height: 420px !important;
274
- overflow-y: auto !important;
275
- overscroll-behavior: contain !important;
276
- scrollbar-gutter: stable !important;
277
- }
278
-
279
- .prompt-dataset button {
280
- height: auto !important;
281
- min-height: 48px !important;
282
- white-space: normal !important;
283
- text-align: left !important;
284
- align-items: flex-start !important;
285
- }
286
-
287
- .prompt-dataset .paginate {
288
- display: none !important;
289
- }
290
-
291
- .prompt-example-proxy {
292
- display: none !important;
293
- }
294
-
295
- .lance-main-row {
296
- display: grid !important;
297
- grid-template-columns: minmax(0, 1fr) minmax(0, 1fr) !important;
298
- gap: 16px !important;
299
- align-items: start !important;
300
- }
301
-
302
- .lance-main-column {
303
- min-width: 0 !important;
304
- width: 100% !important;
305
- }
306
-
307
- .lance-display-frame,
308
- .lance-display-frame > div,
309
- .lance-display-frame textarea {
310
- width: 100% !important;
311
- }
312
-
313
- .lance-display-frame textarea {
314
- min-height: 360px !important;
315
- }
316
-
317
- .lance-run-button {
318
- font-size: 18px !important;
319
- font-weight: 800 !important;
320
- }
321
-
322
- .generation-controls-row {
323
- width: 100% !important;
324
- max-width: 100% !important;
325
- overflow-x: hidden !important;
326
- }
327
-
328
- .generation-controls-row > .form {
329
- display: grid !important;
330
- grid-template-columns:
331
- minmax(0, 1.25fr)
332
- minmax(0, 1.3fr)
333
- minmax(0, 1fr)
334
- minmax(0, 1.25fr) !important;
335
- gap: 12px !important;
336
- align-items: start !important;
337
- width: 100% !important;
338
- max-width: 100% !important;
339
- overflow: visible !important;
340
- }
341
-
342
- .generation-control,
343
- .generation-control > div,
344
- .generation-controls-row > .form > div {
345
- min-width: 0 !important;
346
- max-width: 100% !important;
347
- }
348
-
349
- .generation-controls-row .generation-control label,
350
- .generation-controls-row .generation-control label span,
351
- .generation-controls-row .generation-control .block-label,
352
- .generation-controls-row .generation-control .block-title,
353
- .generation-controls-row .generation-control > label,
354
- .generation-controls-row .generation-control .label-wrap,
355
- .generation-controls-row .generation-control .label-wrap span {
356
- font-size: 22px !important;
357
- font-weight: 700 !important;
358
- line-height: 1.15 !important;
359
- letter-spacing: 0 !important;
360
- white-space: normal !important;
361
- }
362
-
363
- .generation-controls-row .generation-value-control input,
364
- .generation-controls-row .generation-value-control textarea,
365
- .generation-controls-row .generation-value-control [data-testid="textbox"],
366
- .generation-controls-row .generation-dropdown-control input[role="listbox"],
367
- .generation-controls-row .generation-dropdown-control input.border-none[role="listbox"],
368
- .generation-controls-row .generation-dropdown-control .secondary-wrap input {
369
- font-size: 22px !important;
370
- font-weight: 700 !important;
371
- line-height: 1.2 !important;
372
- letter-spacing: 0 !important;
373
- text-align: left !important;
374
- }
375
-
376
- .generation-controls-row .generation-value-control input,
377
- .generation-controls-row .generation-value-control textarea,
378
- .generation-controls-row .generation-dropdown-control input[role="listbox"],
379
- .generation-controls-row .generation-dropdown-control input.border-none[role="listbox"],
380
- .generation-controls-row .generation-dropdown-control .secondary-wrap input {
381
- min-height: 64px !important;
382
- width: 100% !important;
383
- box-sizing: border-box !important;
384
- }
385
-
386
- @media (max-width: 1100px) {
387
- .generation-controls-row > .form {
388
- grid-template-columns: repeat(2, minmax(0, 1fr)) !important;
389
- }
390
- }
391
-
392
- @media (max-width: 900px) {
393
- .lance-main-row {
394
- grid-template-columns: minmax(0, 1fr) !important;
395
- }
396
- }
397
- """
398
-
399
# Internal task identifiers used throughout this module.
TASK_T2V = "t2v"
TASK_T2I = "t2i"
TASK_V2T = "v2t"
TASK_X2T = "x2t"
TASK_X2T_VIDEO = "x2t_video"
TASK_X2T_IMAGE = "x2t_image"
TASK_IMAGE_EDIT = "image_edit"
TASK_VIDEO_EDIT = "video_edit"
# Human-readable task labels.
TASK_LABEL_VIDEO_GENERATION = "Video Generation"
TASK_LABEL_VIDEO_EDIT = "Video Edit"
TASK_LABEL_VIDEO_UNDERSTANDING = "Video Understanding"
TASK_LABEL_IMAGE_GENERATION = "Image Generation"
TASK_LABEL_IMAGE_EDIT = "Image Edit"
TASK_LABEL_IMAGE_UNDERSTANDING = "Image Understanding"
TASK_CHOICES = [
    TASK_LABEL_VIDEO_GENERATION,
    TASK_LABEL_VIDEO_EDIT,
    TASK_LABEL_VIDEO_UNDERSTANDING,
    TASK_LABEL_IMAGE_GENERATION,
    TASK_LABEL_IMAGE_EDIT,
    TASK_LABEL_IMAGE_UNDERSTANDING,
]
# Lookup used by normalize_task(): maps both the labels and the internal ids
# (including the "v2t"/"x2t" aliases) onto canonical internal task ids.
TASK_LABEL_TO_INTERNAL = {
    TASK_LABEL_VIDEO_GENERATION: TASK_T2V,
    TASK_LABEL_VIDEO_EDIT: TASK_VIDEO_EDIT,
    TASK_LABEL_VIDEO_UNDERSTANDING: TASK_X2T_VIDEO,
    TASK_LABEL_IMAGE_GENERATION: TASK_T2I,
    TASK_LABEL_IMAGE_EDIT: TASK_IMAGE_EDIT,
    TASK_LABEL_IMAGE_UNDERSTANDING: TASK_X2T_IMAGE,
    TASK_T2V: TASK_T2V,
    TASK_VIDEO_EDIT: TASK_VIDEO_EDIT,
    TASK_V2T: TASK_X2T_VIDEO,
    TASK_X2T: TASK_X2T_VIDEO,
    TASK_X2T_VIDEO: TASK_X2T_VIDEO,
    TASK_T2I: TASK_T2I,
    TASK_IMAGE_EDIT: TASK_IMAGE_EDIT,
    TASK_X2T_IMAGE: TASK_X2T_IMAGE,
}
# Task groupings; every canonical task is in exactly one of
# GENERATION_TASKS / UNDERSTANDING_TASKS and one of IMAGE_TASKS / VIDEO_TASKS.
GENERATION_TASKS = {TASK_T2V, TASK_T2I, TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
UNDERSTANDING_TASKS = {TASK_X2T_VIDEO, TASK_X2T_IMAGE}
IMAGE_TASKS = {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
VIDEO_TASKS = {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
EDIT_TASKS = {TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
VIDEO_RESOLUTION_CHOICES = [DEFAULT_RESOLUTION]
IMAGE_RESOLUTION_CHOICES = [DEFAULT_IMAGE_RESOLUTION]
RESOLUTION_CHOICES = VIDEO_RESOLUTION_CHOICES + IMAGE_RESOLUTION_CHOICES
# Caption-style system prompt, specialized per media type below.
CAPTION_SYSTEM_PROMPT_TEMPLATE = (
    "Describe the key features of the input {vision_type}, including color, shape, size, texture, objects, background."
)
V2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="video")
I2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="image")
# Question-answering system prompts; these are the ones returned by
# get_understanding_system_prompt_choices().
V2T_QA_SYSTEM_PROMPT = "View the video attentively and provide a suitable answer to the posed question."
I2T_QA_SYSTEM_PROMPT = "View the image attentively and provide a suitable answer to the posed question."
452
-
453
-
454
def get_aspect_ratio_choices_for_task(task: str) -> list[tuple[str, str]]:
    """Build ``(label, value)`` aspect-ratio choices for ``task``.

    The label of the task's default ratio (image default for image tasks,
    video default otherwise) carries a ``" (default)"`` suffix; every other
    label equals its value.

    Raises:
        ValueError: Propagated from ``normalize_task`` for an unknown task.
    """
    if normalize_task(task) in IMAGE_TASKS:
        default_ratio = DEFAULT_IMAGE_ASPECT_RATIO
    else:
        default_ratio = DEFAULT_VIDEO_ASPECT_RATIO

    choices = []
    for ratio in ASPECT_RATIO_CHOICES:
        label = f"{ratio} (default)" if ratio == default_ratio else ratio
        choices.append((label, ratio))
    return choices
462
-
463
def env_flag(name: str, default: bool) -> bool:
    """Read environment variable ``name`` as a boolean.

    Returns ``default`` when the variable is unset. Otherwise the value is
    true only if, stripped and lower-cased, it is one of
    ``1``/``true``/``yes``/``on``.
    """
    raw = os.environ.get(name)
    if raw is not None:
        return raw.strip().lower() in ("1", "true", "yes", "on")
    return default
468
-
469
-
470
def running_on_space() -> bool:
    """Return True when ``SPACE_ID`` or ``SPACE_HOST`` is set to a non-empty value."""
    return any(os.getenv(var) for var in ("SPACE_ID", "SPACE_HOST"))
472
-
473
-
474
def display_path(path: Path) -> str:
    """Render ``path`` as a POSIX-style string, ``./``-prefixed when relative.

    An absolute path under the current working directory is shown relative
    to it; an absolute path elsewhere is returned unchanged. Relative text
    that is not already ``.`` or ``./...`` gets a ``./`` prefix.
    """
    if path.is_absolute():
        try:
            shown = path.relative_to(Path.cwd()).as_posix()
        except ValueError:
            # Not under the working directory: keep the absolute form.
            return path.as_posix()
    else:
        shown = path.as_posix()

    already_anchored = shown == "." or shown.startswith("./")
    return shown if already_anchored else f"./{shown}"
484
-
485
-
486
def get_model_base_dir() -> Path:
    """Pick the directory that holds the model assets.

    Precedence: a non-empty ``LANCE_MODEL_BASE_DIR`` (user-expanded), then
    the local base directory if it exists, then the Space directory when
    running on a Space, otherwise the local base directory.
    """
    override = os.getenv("LANCE_MODEL_BASE_DIR")
    if override:
        return Path(override).expanduser()
    if not LOCAL_MODEL_BASE_DIR.exists() and running_on_space():
        return SPACE_MODEL_BASE_DIR
    return LOCAL_MODEL_BASE_DIR
493
-
494
-
495
def normalize_model_variant(model_variant: Optional[str] = None) -> str:
    """Map a variant name onto ``MODEL_VARIANT_IMAGE`` or ``MODEL_VARIANT_VIDEO``.

    A falsy ``model_variant`` falls back to ``LANCE_MODEL_VARIANT`` and then
    to ``DEFAULT_MODEL_VARIANT``. The aliases ``image``/``t2i``/``i2t``
    (case-insensitive, surrounding whitespace ignored) select the image
    variant; anything else selects the video variant.
    """
    raw = model_variant
    if not raw:
        raw = os.getenv("LANCE_MODEL_VARIANT", DEFAULT_MODEL_VARIANT)
    image_aliases = ("image", "t2i", "i2t")
    if raw.strip().lower() in image_aliases:
        return MODEL_VARIANT_IMAGE
    return MODEL_VARIANT_VIDEO
500
-
501
-
502
def get_model_path(model_variant: Optional[str] = None) -> Path:
    """Resolve the weights directory for ``model_variant``.

    Environment overrides are consulted in order: the variant-specific
    ``LANCE_IMAGE_MODEL_PATH`` / ``LANCE_VIDEO_MODEL_PATH``, then the generic
    ``LANCE_MODEL_PATH``. Without an override the result is the variant's
    sub-directory under ``get_model_base_dir()``.
    """
    variant = normalize_model_variant(model_variant)
    if variant == MODEL_VARIANT_IMAGE:
        env_names = ("LANCE_IMAGE_MODEL_PATH", "LANCE_MODEL_PATH")
    else:
        env_names = ("LANCE_VIDEO_MODEL_PATH", "LANCE_MODEL_PATH")

    for env_name in env_names:
        override = os.getenv(env_name)
        if override:
            return Path(override).expanduser()

    return get_model_base_dir() / MODEL_VARIANT_TO_DIR[variant]
515
-
516
-
517
def get_required_model_asset_paths(model_base_dir: Path, model_path: Path) -> list[Path]:
    """List the four files that must exist for a model to be usable.

    Two live in ``model_path`` (LLM config and weights) and two in
    ``model_base_dir`` (ViT weights and the VAE checkpoint).
    """
    variant_files = [model_path / name for name in ("llm_config.json", "model.safetensors")]
    shared_files = [
        model_base_dir.joinpath("Qwen2.5-VL-ViT", "vit.safetensors"),
        model_base_dir.joinpath("Wan2.2_VAE.pth"),
    ]
    return variant_files + shared_files
524
-
525
-
526
def ensure_model_assets(model_variant: Optional[str] = None) -> Path:
    """Return the model directory for ``model_variant``, downloading if allowed.

    Args:
        model_variant: Variant name passed through to ``get_model_path`` and
            ``normalize_model_variant``.

    Returns:
        The resolved model path. After a download the files are not
        re-checked, so the returned path is not guaranteed to be complete.

    Raises:
        FileNotFoundError: Required files are missing and auto-download is
            disabled (``LANCE_AUTO_DOWNLOAD`` defaults to on only on Spaces).

    Side effects:
        Overwrites ``LANCE_MODEL_BASE_DIR`` in ``os.environ`` and may create
        the base directory and download a repository snapshot into it.
    """
    model_base_dir = get_model_base_dir()
    # Publish the chosen base dir; get_model_base_dir() reads this variable first.
    os.environ["LANCE_MODEL_BASE_DIR"] = display_path(model_base_dir)
    model_path = get_model_path(model_variant)

    required_paths = get_required_model_asset_paths(model_base_dir, model_path)
    if all(path.exists() for path in required_paths):
        return model_path

    # Fallback: when the base dir is the current directory, accept a complete
    # set of assets under ./downloads instead.
    downloads_model_base_dir = Path("downloads")
    if model_base_dir == Path(".") and downloads_model_base_dir.exists():
        downloads_model_path = downloads_model_base_dir / MODEL_VARIANT_TO_DIR[normalize_model_variant(model_variant)]
        downloads_required_paths = get_required_model_asset_paths(downloads_model_base_dir, downloads_model_path)
        if all(path.exists() for path in downloads_required_paths):
            model_base_dir = downloads_model_base_dir
            model_path = downloads_model_path
            required_paths = downloads_required_paths
            os.environ["LANCE_MODEL_BASE_DIR"] = display_path(model_base_dir)
            return model_path

    auto_download = env_flag("LANCE_AUTO_DOWNLOAD", running_on_space())
    if not auto_download:
        # Report exactly which of the required files are absent.
        missing = "\n".join(f"- {display_path(path)}" for path in required_paths if not path.exists())
        raise FileNotFoundError(
            "Lance model assets are missing. Set LANCE_MODEL_BASE_DIR or enable "
            f"LANCE_AUTO_DOWNLOAD=1.\nMissing files:\n{missing}"
        )

    model_base_dir.mkdir(parents=True, exist_ok=True)
    repo_id = os.getenv("LANCE_MODEL_REPO_ID", DEFAULT_MODEL_REPO_ID)
    print(f"[startup] Downloading Lance model assets from {repo_id} to {display_path(model_base_dir)}", flush=True)
    # NOTE(review): local_dir_use_symlinks / resume_download may be deprecated
    # in newer huggingface_hub releases — confirm against the pinned version.
    snapshot_path = Path(
        snapshot_download(
            repo_id=repo_id,
            local_dir=str(model_base_dir),
            local_dir_use_symlinks=False,
            resume_download=True,
        )
    )
    # If the snapshot landed somewhere else and the expected model dir is still
    # missing, re-point the base dir at the snapshot and resolve again.
    if snapshot_path != model_base_dir and not model_path.exists():
        os.environ["LANCE_MODEL_BASE_DIR"] = display_path(snapshot_path)
        model_path = get_model_path(model_variant)
    return model_path
569
-
570
-
571
def ensure_dirs() -> None:
    """Create the temporary input and results directories if they are missing."""
    for directory in (TMP_INPUT_DIR, RESULTS_ROOT):
        directory.mkdir(parents=True, exist_ok=True)
574
-
575
-
576
def save_generation_record(record: dict, save_dir: Path) -> None:
    """Persist ``record`` for one run and append it to the global log.

    Writes an indented JSON file named ``RUN_RECORD_FILENAME`` inside
    ``save_dir``, then appends the record as a single JSON line to
    ``GLOBAL_RECORDS_FILE`` while holding ``RECORD_WRITE_LOCK``.
    """
    ensure_dirs()

    with open(save_dir / RUN_RECORD_FILENAME, "w", encoding="utf-8") as run_file:
        json.dump(record, run_file, ensure_ascii=False, indent=2)

    # The lock keeps concurrent appends to the shared log from interleaving.
    with RECORD_WRITE_LOCK, GLOBAL_RECORDS_FILE.open("a", encoding="utf-8") as log_file:
        log_file.write(f"{json.dumps(record, ensure_ascii=False)}\n")
585
-
586
-
587
def normalize_seed(seed: int) -> int:
    """Return ``seed`` unchanged, or a random value in ``[0, 2**31 - 1]`` when it is ``-1``."""
    if seed != -1:
        return seed
    return random.randint(0, 2**31 - 1)
589
-
590
-
591
def normalize_task(task: str) -> str:
    """Translate a task label or identifier into its canonical internal id.

    A falsy ``task`` is treated as the video-generation label. The stripped
    value is looked up in ``TASK_LABEL_TO_INTERNAL`` as given and, failing
    that, lower-cased.

    Args:
        task: A UI label (e.g. ``"Video Edit"``) or an internal id/alias.

    Returns:
        One of the ids in ``GENERATION_TASKS | UNDERSTANDING_TASKS``.

    Raises:
        ValueError: The value maps to no supported task. The message now
            names the rejected input; previously it interpolated the
            already-overwritten lookup result, which was always empty.
    """
    task_key = (task or TASK_LABEL_VIDEO_GENERATION).strip()
    internal_task = TASK_LABEL_TO_INTERNAL.get(task_key)
    if internal_task is None:
        # Second chance for identifiers typed with different casing.
        internal_task = TASK_LABEL_TO_INTERNAL.get(task_key.lower())
    if internal_task not in GENERATION_TASKS | UNDERSTANDING_TASKS:
        raise ValueError(f"Unsupported task type: {task_key}")
    return internal_task
597
-
598
-
599
def normalize_resolution_for_backend(resolution: str, task: str) -> str:
    """Return the resolution name for ``task``.

    Image tasks get ``DEFAULT_IMAGE_RESOLUTION`` and video tasks get
    ``DEFAULT_RESOLUTION``. ``resolution`` is only stringified and returned
    if the task belongs to neither group.
    """
    internal_task = normalize_task(task)
    fixed_resolutions = (
        (IMAGE_TASKS, DEFAULT_IMAGE_RESOLUTION),
        (VIDEO_TASKS, DEFAULT_RESOLUTION),
    )
    for task_group, backend_resolution in fixed_resolutions:
        if internal_task in task_group:
            return backend_resolution
    return str(resolution)
606
-
607
-
608
def get_default_aspect_ratio(task: str) -> str:
    """Return the default aspect ratio: the image default for image tasks, else the video default."""
    if normalize_task(task) in IMAGE_TASKS:
        return DEFAULT_IMAGE_ASPECT_RATIO
    return DEFAULT_VIDEO_ASPECT_RATIO
611
-
612
-
613
def get_size_for_aspect_ratio(task: str, aspect_ratio: str) -> tuple[int, int]:
    """Look up the size tuple for ``aspect_ratio`` under ``task``.

    An aspect ratio outside ``ASPECT_RATIO_CHOICES`` is replaced by the
    task's default. Image tasks use the image size table, all other tasks
    the video table. Callers unpack the result as ``width, height``.
    """
    internal_task = normalize_task(task)
    if aspect_ratio not in ASPECT_RATIO_CHOICES:
        aspect_ratio = get_default_aspect_ratio(internal_task)
    if internal_task in IMAGE_TASKS:
        return IMAGE_ASPECT_RATIO_TO_SIZE[aspect_ratio]
    return VIDEO_ASPECT_RATIO_TO_SIZE[aspect_ratio]
618
-
619
-
620
def format_size_markdown(task: str, width: int, height: int) -> str:
    """Format the size as ``"<width> x <height>"``, or ``""`` for understanding tasks."""
    show_size = normalize_task(task) not in UNDERSTANDING_TASKS
    return f"{width} x {height}" if show_size else ""
625
-
626
-
627
def normalize_frame_interpolation(value) -> bool:
    """Coerce a frame-interpolation setting to a boolean.

    Booleans pass through. Any other value is stringified (falsy values
    become ``""``), stripped and lower-cased, and counts as enabled only if
    it is one of ``1``/``true``/``yes``/``on``/``open``.
    """
    if not isinstance(value, bool):
        text = str(value or "").strip().lower()
        return text in ("1", "true", "yes", "on", "open")
    return value
631
-
632
-
633
def video_seconds_to_num_frames(seconds: int) -> int:
    """Convert a duration to a frame count: ``12 * s + 1`` with ``s`` clamped to 1..10.

    ``seconds`` is passed through ``int()`` before clamping.
    """
    clamped = int(seconds)
    if clamped < 1:
        clamped = 1
    elif clamped > 10:
        clamped = 10
    return 12 * clamped + 1
636
-
637
-
638
def update_size_from_aspect_ratio(task: str, aspect_ratio: str):
    """Return ``(height, width, size_text)`` for the chosen aspect ratio.

    Note the order: height comes before width in the returned tuple, while
    the size lookup yields ``(width, height)``.
    """
    size = get_size_for_aspect_ratio(task, aspect_ratio)
    width, height = size
    size_text = format_size_markdown(task, width, height)
    return height, width, size_text
641
-
642
-
643
def reset_generation_defaults_for_task(task: str):
    """Return the default generation settings for ``task``.

    The result is a 6-tuple:
    ``(aspect_ratio, height, width, duration_or_one, resolution, size_text)``.
    The fourth item is ``DEFAULT_VIDEO_DURATION_SECONDS`` for the
    text-to-video task and ``1`` for every other task.
    """
    internal_task = normalize_task(task)
    is_image_task = internal_task in IMAGE_TASKS
    aspect_ratio = get_default_aspect_ratio(internal_task)
    width, height = get_size_for_aspect_ratio(internal_task, aspect_ratio)
    return (
        aspect_ratio,
        height,
        width,
        DEFAULT_VIDEO_DURATION_SECONDS if internal_task == TASK_T2V else 1,
        DEFAULT_IMAGE_RESOLUTION if is_image_task else DEFAULT_RESOLUTION,
        format_size_markdown(internal_task, width, height),
    )
650
-
651
-
652
def apply_prompt_example(task: str, evt: gr.SelectData):
    """Turn a selected example into ``(prompt_text, *task_defaults)``.

    The prompt is the first element of ``evt.row_value`` when that is a
    non-empty list, else the first element of ``evt.value`` when that is a
    non-empty list, else ``str(evt.value)`` when it is not ``None``, else
    ``""``. The remaining items come from
    ``reset_generation_defaults_for_task(task)``.
    """

    def _first_as_text(candidate):
        # Only a non-empty list yields a prompt; anything else means "keep looking".
        if isinstance(candidate, list) and candidate:
            return str(candidate[0])
        return None

    prompt_text = _first_as_text(evt.row_value)
    if prompt_text is None:
        prompt_text = _first_as_text(evt.value)
    if prompt_text is None:
        prompt_text = "" if evt.value is None else str(evt.value)

    return (prompt_text, *reset_generation_defaults_for_task(task))
662
-
663
-
664
def get_understanding_system_prompt_choices(task: str) -> list[str]:
    """Return a one-item list: the image QA prompt for image understanding, else the video QA prompt."""
    if normalize_task(task) != TASK_X2T_IMAGE:
        return [V2T_QA_SYSTEM_PROMPT]
    return [I2T_QA_SYSTEM_PROMPT]
669
-
670
-
671
def normalize_understanding_system_prompt(task: str, system_prompt: Optional[str]) -> str:
    """Return the system prompt for ``task``.

    ``system_prompt`` is accepted but not consulted: the result is always the
    first entry of ``get_understanding_system_prompt_choices(task)``.
    """
    choices = get_understanding_system_prompt_choices(task)
    return choices[0]
673
-
674
-
675
def create_request_json(
    task: str,
    prompt: str,
    input_video: Optional[str],
    input_image: Optional[str],
    system_prompt: Optional[str] = None,
) -> Path:
    """Write the request payload for ``task`` to a JSON file and return its path.

    The file is created under ``TMP_INPUT_DIR`` and named
    ``<task>_<timestamp>.json``. Generation tasks map an output file name to
    the prompt; edit and understanding tasks write one ``"000000"`` entry
    describing the interleaved text and media.

    Args:
        task: Internal task id, compared directly against the ``TASK_*`` constants.
        prompt: Text prompt, edit instruction or question.
        input_video: Video path, required for video edit / understanding.
        input_image: Image path, required for image edit / understanding.
        system_prompt: Forwarded to ``normalize_understanding_system_prompt``.

    Raises:
        ValueError: A required input is missing or ``task`` is unsupported.
    """
    ensure_dirs()
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    prompt_file = TMP_INPUT_DIR / f"{task}_{stamp}.json"

    def _require(media: Optional[str], message: str) -> str:
        # Reject missing or empty media paths with the task-specific message.
        if not media:
            raise ValueError(message)
        return media

    def _edit_payload(media: str, kind: str) -> dict:
        # The media appears twice: once as the source, once as the target slot.
        return {
            "000000": {
                "interleave_array": [prompt, media, media],
                "element_dtype_array": ["text", kind, kind],
                "istarget_in_interleave": [0, 0, 1],
            }
        }

    def _understanding_payload(media: str, kind: str) -> dict:
        # The text slot is [system prompt, question, empty answer placeholder].
        resolved_system_prompt = normalize_understanding_system_prompt(task, system_prompt)
        return {
            "000000": {
                "interleave_array": [media, [resolved_system_prompt, prompt, ""]],
                "element_dtype_array": [kind, "text"],
                "istarget_in_interleave": [0, 1],
            }
        }

    if task == TASK_T2V:
        payload = {"000000.mp4": prompt}
    elif task == TASK_T2I:
        payload = {"000000.png": prompt}
    elif task == TASK_VIDEO_EDIT:
        video = _require(input_video, "The video edit task requires an input video.")
        payload = _edit_payload(video, "video")
    elif task == TASK_IMAGE_EDIT:
        image = _require(input_image, "The image edit task requires an input image.")
        payload = _edit_payload(image, "image")
    elif task == TASK_X2T_VIDEO:
        video = _require(input_video, "The video understanding task requires an input video.")
        payload = _understanding_payload(video, "video")
    elif task == TASK_X2T_IMAGE:
        image = _require(input_image, "The image understanding task requires an input image.")
        payload = _understanding_payload(image, "image")
    else:
        raise ValueError(f"Unsupported task type: {task}")

    with prompt_file.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
    return prompt_file
738
-
739
-
740
def resolve_example_path(path: str) -> str:
    """Resolve an example media path to a usable string.

    Absolute paths are returned as-is (not checked for existence). Relative
    paths are tried against ``REPO_ROOT`` first, then against the working
    directory; the first existing one is returned resolved. If neither
    exists the input string is returned unchanged.
    """
    candidate = Path(path)
    if candidate.is_absolute():
        return str(candidate)
    for option in (REPO_ROOT / candidate, candidate):
        if option.exists():
            return str(option.resolve())
    return path
750
-
751
-
752
def resolve_browser_video_example_path(path: str) -> str:
    """Resolve a video example path, preferring a ``*_h264`` sibling file.

    The ``<stem>_h264<suffix>`` variant is tried first, then the path itself.
    Relative candidates are looked up under ``REPO_ROOT``, absolute ones
    directly; the first existing file is returned resolved. If nothing
    matches, the result of ``resolve_example_path(path)`` is returned.
    """
    candidate = Path(path)
    h264_variant = candidate.with_name(f"{candidate.stem}_h264{candidate.suffix}")
    for option in (h264_variant, candidate):
        located = option if option.is_absolute() else REPO_ROOT / option
        if located.exists():
            return str(located.resolve())
    return resolve_example_path(path)
766
-
767
-
768
def load_json_examples(relative_path: str) -> dict:
    """Parse and return the UTF-8 JSON file at ``REPO_ROOT / relative_path``."""
    with open(REPO_ROOT / relative_path, "r", encoding="utf-8") as handle:
        return json.loads(handle.read())
772
-
773
-
774
# One-line summaries keyed by text-to-video example name. Passed as
# ``summaries=`` to make_generation_examples(), which currently does not read it.
T2V_EXAMPLE_SUMMARIES = {
    "000000.mp4": "Red panda surfing on a bright seaside wave.",
    "000002.mp4": "Panda cub skateboarding in a creative loft.",
    "000004.mp4": "Young woman shaping clay in a sunlit pottery workshop.",
    "000005.mp4": "Panda boxing a robot in a luxurious palace ring.",
    "000008.mp4": "Fantasy pastel horse stepping through a glowing cloud valley.",
}
781
-
782
-
783
def make_generation_examples(
    task_label: str,
    relative_path: str,
    limit: int,
    image_task: bool,
    selected_keys: Optional[list[str]] = None,
    summaries: Optional[dict[str, str]] = None,
) -> list[list]:
    """Build single-column example rows (``[prompt]``) from a JSON example file.

    When ``selected_keys`` is non-empty, rows follow that key order (keys
    absent from the file are skipped) and ``limit`` is ignored; otherwise the
    first ``limit`` entries of the file are used. ``task_label``,
    ``image_task`` and ``summaries`` are accepted but not used.
    """
    data = load_json_examples(relative_path)
    if selected_keys:
        prompts = [data[key] for key in selected_keys if key in data]
    else:
        prompts = list(data.values())[:limit]
    return [[prompt] for prompt in prompts]
797
-
798
-
799
def make_edit_examples(task_label: str, relative_path: str, limit: int, media_type: str) -> list[list]:
    """Build ``[prompt, video_path, image_path]`` rows for edit examples.

    Reads the first ``limit`` samples of the JSON file. Each sample's
    ``interleave_array`` supplies the prompt (index 0) and the source media
    (index 1, resolved via ``resolve_example_path``). Only the column
    matching ``media_type`` is filled; the other is ``None``.
    ``task_label`` is accepted but not used.
    """
    is_video = media_type == "video"
    is_image = media_type == "image"
    samples = list(load_json_examples(relative_path).values())[:limit]

    rows = []
    for sample in samples:
        parts = sample["interleave_array"]
        instruction = parts[0]
        media_path = resolve_example_path(parts[1])
        rows.append([
            instruction,
            media_path if is_video else None,
            media_path if is_image else None,
        ])
    return rows
812
-
813
-
814
def make_understanding_examples(task_label: str, relative_path: str, limit: int, media_type: str) -> list[list]:
    """Build ``[question, video_path, image_path]`` rows for understanding examples.

    Reads the first ``limit`` samples of the JSON file. The media path is
    ``interleave_array[0]``; video paths go through
    ``resolve_browser_video_example_path`` and all others through
    ``resolve_example_path``. The question is the second element of
    ``interleave_array[1]`` when that is a list longer than one, else ``""``.
    ``task_label`` is accepted but not used.
    """
    is_video = media_type == "video"
    is_image = media_type == "image"
    resolver = resolve_browser_video_example_path if is_video else resolve_example_path
    samples = list(load_json_examples(relative_path).values())[:limit]

    rows = []
    for sample in samples:
        parts = sample["interleave_array"]
        media_path = resolver(parts[0])
        text_payload = parts[1]
        if isinstance(text_payload, list) and len(text_payload) > 1:
            question = text_payload[1]
        else:
            question = ""
        rows.append([
            question,
            media_path if is_video else None,
            media_path if is_image else None,
        ])
    return rows
832
-
833
-
834
def make_understanding_system_prompt_map(relative_path: str, task: str) -> dict[str, str]:
    """Map each example question to the system prompt used for ``task``.

    For every sample whose ``interleave_array[1]`` is a list with at least
    two items, the second item (the question) becomes the key and the value
    is ``normalize_understanding_system_prompt(task, <first item>)``.
    Samples with any other text payload are skipped.
    """
    prompt_map = {}
    for sample in load_json_examples(relative_path).values():
        text_payload = sample["interleave_array"][1]
        if isinstance(text_payload, list) and len(text_payload) >= 2:
            prompt_map[text_payload[1]] = normalize_understanding_system_prompt(task, text_payload[0])
    return prompt_map
844
-
845
-
846
# Example tables, built once at import time from the JSON files under
# config/examples. A missing or malformed file fails the module import.

# Because selected_keys is given, ``limit`` has no effect for this table.
VIDEO_GENERATION_EXAMPLES = make_generation_examples(
    TASK_LABEL_VIDEO_GENERATION,
    "config/examples/t2v_example.json",
    limit=6,
    image_task=False,
    # Earlier selection, kept for reference:
    #selected_keys=["000000.mp4", "000002.mp4", "000005.mp4", "000004.mp4", "000008.mp4"],
    selected_keys=["000004.mp4", "000002.mp4", "000000.mp4", "000005.mp4", "000008.mp4", "000007.mp4"],
    summaries=T2V_EXAMPLE_SUMMARIES,
)
VIDEO_EDIT_EXAMPLES = make_edit_examples(
    TASK_LABEL_VIDEO_EDIT,
    "config/examples/video_edit_example.json",
    limit=3,
    media_type="video",
)
VIDEO_UNDERSTANDING_EXAMPLES = make_understanding_examples(
    TASK_LABEL_VIDEO_UNDERSTANDING,
    "config/examples/x2t_video_example.json",
    limit=3,
    media_type="video",
)
# Question -> system prompt lookup for the video understanding examples.
VIDEO_UNDERSTANDING_SYSTEM_PROMPTS = make_understanding_system_prompt_map(
    "config/examples/x2t_video_example.json",
    TASK_X2T_VIDEO,
)
IMAGE_GENERATION_EXAMPLES = make_generation_examples(
    TASK_LABEL_IMAGE_GENERATION,
    "config/examples/t2i_example.json",
    limit=5,
    image_task=True,
    selected_keys=["000000.png", "000003.png", "000006.png", "000008.png", "000009.png"],
)
IMAGE_EDIT_EXAMPLES = make_edit_examples(
    TASK_LABEL_IMAGE_EDIT,
    "config/examples/image_edit_example.json",
    limit=5,
    media_type="image",
)
IMAGE_UNDERSTANDING_EXAMPLES = make_understanding_examples(
    TASK_LABEL_IMAGE_UNDERSTANDING,
    "config/examples/x2t_image_example.json",
    limit=3,
    media_type="image",
)
# Question -> system prompt lookup for the image understanding examples.
IMAGE_UNDERSTANDING_SYSTEM_PROMPTS = make_understanding_system_prompt_map(
    "config/examples/x2t_image_example.json",
    TASK_X2T_IMAGE,
)
894
-
895
-
896
def build_save_dir(task: str) -> Path:
    """Return a per-run output directory path under ``RESULTS_ROOT``.

    The name is ``<task>_<YYYYmmdd_HHMMSS>_<mmm>``, where ``mmm`` is the
    zero-padded millisecond part. Both parts come from a single clock read;
    the previous version read the clock twice, so the millisecond suffix
    could belong to a different second than the timestamp. The directory
    itself is not created here, only the roots via ``ensure_dirs()``.

    Args:
        task: Task identifier used as the directory-name prefix.
    """
    ensure_dirs()
    now = datetime.now()
    milliseconds = now.microsecond // 1000
    return RESULTS_ROOT / f"{task}_{now:%Y%m%d_%H%M%S}_{milliseconds:03d}"
900
-
901
-
902
def find_generated_video(save_dir: Path) -> Optional[Path]:
    """Return the most recently modified ``*.mp4`` directly inside *save_dir*.

    Returns ``None`` when the directory contains no ``.mp4`` file.
    """
    candidates = save_dir.glob("*.mp4")
    return max(candidates, key=lambda path: path.stat().st_mtime, default=None)
905
-
906
-
907
def find_generated_image(save_dir: Path) -> Optional[Path]:
    """Return the most recently modified ``*.png`` directly inside *save_dir*.

    Returns ``None`` when the directory contains no ``.png`` file.
    """
    candidates = save_dir.glob("*.png")
    return max(candidates, key=lambda path: path.stat().st_mtime, default=None)
910
-
911
-
912
def run_rife_interpolation(video_path: Path, device_id: int, exp: int = 1) -> tuple[Path, str]:
    """Run the vendored RIFE ``inference_video.py`` on *video_path* in a subprocess.

    Args:
        video_path: Video to interpolate. The output is written next to it as
            ``<stem>_rife_<2**exp>x<suffix>``.
        device_id: GPU index as seen by *this* process.
        exp: Value passed to the script's ``--exp`` option.

    Returns:
        ``(output_path, log)`` where ``log`` contains the command line, the
        elapsed time and the child's captured stdout/stderr.

    Raises:
        FileNotFoundError: If the RIFE script is missing, or the child exits
            successfully without producing the expected output file.
        RuntimeError: If the child exits with a non-zero status.
    """
    import sys  # local import: not part of the module's top-level imports

    rife_dir = REPO_ROOT / "RIFE"
    rife_script = rife_dir / "inference_video.py"
    if not rife_script.exists():
        raise FileNotFoundError(f"RIFE inference script not found: {rife_script}")

    output_path = video_path.with_name(f"{video_path.stem}_rife_{2 ** exp}x{video_path.suffix}")

    # device_id is an index into the GPUs visible to this process. If this
    # process is itself running under CUDA_VISIBLE_DEVICES (e.g. "2,3"), the
    # child must receive the corresponding entry ("2"), not the bare index
    # ("0"), otherwise it would run on a different physical GPU.
    inherited_devices = [
        entry.strip()
        for entry in os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")
        if entry.strip()
    ]
    if 0 <= device_id < len(inherited_devices):
        child_visible_device = inherited_devices[device_id]
    else:
        child_visible_device = str(device_id)

    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = child_visible_device
    command = [
        # Use the interpreter running this app so the child sees the same
        # installed packages; fall back to PATH lookup if it is unknown.
        sys.executable or "python3",
        str(rife_script),
        "--exp",
        str(exp),
        "--video",
        str(video_path),
        "--output",
        str(output_path),
        "--model",
        str(rife_dir / "train_log"),
    ]
    command_line = f"command=CUDA_VISIBLE_DEVICES={child_visible_device} {' '.join(command)}"
    rife_start = time.perf_counter()
    try:
        completed = subprocess.run(
            command,
            cwd=str(video_path.parent),
            env=env,
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as exc:
        raise RuntimeError(
            "\n".join(
                [
                    f"RIFE failed with exit code {exc.returncode}.",
                    command_line,
                    exc.stdout.strip() if exc.stdout else "",
                    exc.stderr.strip() if exc.stderr else "",
                ]
            ).strip()
        ) from exc
    if not output_path.exists():
        raise FileNotFoundError(f"RIFE completed but output video was not found: {output_path}")
    elapsed = time.perf_counter() - rife_start
    log = "\n".join(
        [
            "[rife] Frame interpolation finished.",
            command_line,
            f"elapsed={elapsed:.2f}s",
            f"output={output_path}",
            completed.stdout.strip(),
            completed.stderr.strip(),
        ]
    ).strip()
    return output_path, log
968
-
969
-
970
def extract_text_result(save_dir: Path) -> str:
    """Return the first value stored in the prompt-results JSON of *save_dir*.

    An empty string is returned when the file does not exist or holds an empty
    JSON value. A string value is returned as-is; any other value is returned
    as its JSON serialization (non-ASCII characters kept verbatim).
    """
    result_file = save_dir / PROMPT_JSON_FILENAME
    if not result_file.exists():
        return ""

    with result_file.open("r", encoding="utf-8") as handle:
        payload = json.load(handle)
    if not payload:
        return ""

    leading_value = next(iter(payload.values()))
    if isinstance(leading_value, str):
        return leading_value
    return json.dumps(leading_value, ensure_ascii=False)
980
-
981
-
982
- class LanceT2VV2TPipeline:
983
    def __init__(self, device_id: int, model_variant: str = MODEL_VARIANT_VIDEO) -> None:
        """Create an unloaded pipeline bound to one GPU index.

        No model is loaded here; every heavy attribute stays ``None`` until
        ``initialize()`` populates it.

        Args:
            device_id: GPU index this pipeline will use.
            model_variant: Variant name; passed through ``normalize_model_variant``.
        """
        # Serializes initialize()/unload() against each other.
        self._init_lock = threading.Lock()
        # Serializes the inference section of generate() on this pipeline.
        self._generate_lock = threading.Lock()
        self.initialized = False
        self.device = device_id
        self.model_variant = normalize_model_variant(model_variant)
        self.logger = get_logger(f"lance_{self.model_variant}_gpu{device_id}")

        # Populated by initialize(), reset to None by unload().
        self.model: Optional[Lance] = None
        self.vae_model: Optional[WanVideoVAE] = None
        self.vae_config: Optional[AutoEncoderParams] = None
        self.tokenizer: Optional[Qwen2Tokenizer] = None
        self.new_token_ids: Optional[dict] = None
        self.image_token_id: Optional[int] = None
        # Baseline argument objects; generate() deep-copies them per request.
        self.base_model_args: Optional[ModelArguments] = None
        self.base_data_args: Optional[DataArguments] = None
        self.base_inference_args: Optional[InferenceArguments] = None
1000
-
1001
- def _log_stage(self, stage_name: str, start_time: float, extra: str = "") -> None:
1002
- elapsed = time.perf_counter() - start_time
1003
- suffix = f" | {extra}" if extra else ""
1004
- print(f"[startup][gpu:{self.device}] {stage_name} done in {elapsed:.2f}s{suffix}", flush=True)
1005
-
1006
- def _build_base_model_args(self) -> ModelArguments:
1007
- model_path = str(get_model_path(self.model_variant))
1008
- return ModelArguments(
1009
- model_path=model_path,
1010
- vit_type=DEFAULT_VIT_TYPE,
1011
- llm_qk_norm=True,
1012
- llm_qk_norm_und=True,
1013
- llm_qk_norm_gen=True,
1014
- tie_word_embeddings=False,
1015
- max_num_frames=121,
1016
- max_latent_size=64,
1017
- latent_patch_size=[1, 1, 1],
1018
- )
1019
-
1020
- def _build_base_inference_args(self) -> InferenceArguments:
1021
- return InferenceArguments(
1022
- validation_num_timesteps=DEFAULT_TIMESTEPS,
1023
- validation_timestep_shift=DEFAULT_TIMESTEP_SHIFT,
1024
- copy_init_moe=True,
1025
- visual_und=True,
1026
- visual_gen=True,
1027
- vae_model_type="wan",
1028
- apply_qwen_2_5_vl_pos_emb=True,
1029
- apply_chat_template=False,
1030
- cfg_type=0,
1031
- validation_data_seed=42,
1032
- video_height=DEFAULT_HEIGHT,
1033
- video_width=DEFAULT_WIDTH,
1034
- num_frames=DEFAULT_NUM_FRAMES,
1035
- task=DEFAULT_TASK,
1036
- save_path_gen=str(RESULTS_ROOT),
1037
- resolution=DEFAULT_RESOLUTION,
1038
- text_template=TEXT_TEMPLATE,
1039
- use_KVcache=USE_KVCACHE,
1040
- )
1041
-
1042
    def initialize(self) -> None:
        """Load the model stack onto this pipeline's GPU (no-op if already loaded).

        Runs under ``_init_lock``. On success the model, VAE, tokenizer, token-id
        map and baseline argument objects are stored on ``self`` and
        ``self.initialized`` is set to ``True``.

        Raises:
            RuntimeError: If CUDA is unavailable or ``self.device`` is not a
                valid GPU index.
            ValueError: If ``model_args.vit_type`` is not a supported value.
        """
        with self._init_lock:
            if self.initialized:
                return

            ensure_dirs()
            resolved_model_path = ensure_model_assets(self.model_variant)
            print(
                f"[startup][gpu:{self.device}][{self.model_variant}] Using Lance model path: {resolved_model_path}",
                flush=True,
            )
            if not torch.cuda.is_available():
                raise RuntimeError("CUDA is unavailable. Lance T2V/V2T Gradio requires a GPU environment.")
            if self.device >= torch.cuda.device_count():
                raise RuntimeError(
                    f"GPU {self.device} is unavailable. Detected {torch.cuda.device_count()} GPU(s)."
                )
            torch.cuda.set_device(self.device)

            model_args = self._build_base_model_args()
            data_args = DataArguments()
            inference_args = self._build_base_inference_args()
            apply_inference_defaults(model_args, data_args, inference_args)
            # Noise seed starts out equal to the data seed; generate() overrides both per request.
            inference_args.validation_noise_seed = inference_args.validation_data_seed

            # NOTE(review): these are stored before the heavy loading below, so a
            # failure later in this method leaves them set while `initialized`
            # stays False.
            self.base_model_args = model_args
            self.base_data_args = data_args
            self.base_inference_args = inference_args

            set_seed(inference_args.global_seed)

            stage_start = time.perf_counter()
            print(
                f"[startup][gpu:{self.device}] Loading LLM config: {Path(model_args.model_path) / 'llm_config.json'}",
                flush=True,
            )
            llm_config: Qwen2Config = Qwen2Config.from_json_file(str(Path(model_args.model_path) / "llm_config.json"))
            self._log_stage("LLM config load", stage_start)

            # Copy model/inference switches onto the LLM config before the LLM is built.
            llm_config.layer_module = model_args.layer_module
            llm_config.qk_norm = model_args.llm_qk_norm
            llm_config.qk_norm_und = model_args.llm_qk_norm_und
            llm_config.qk_norm_gen = model_args.llm_qk_norm_gen
            llm_config.tie_word_embeddings = model_args.tie_word_embeddings
            llm_config.freeze_und = inference_args.freeze_und
            llm_config.apply_qwen_2_5_vl_pos_emb = inference_args.apply_qwen_2_5_vl_pos_emb

            stage_start = time.perf_counter()
            print(f"[startup][gpu:{self.device}] Initializing LLM weights: {model_args.model_path}", flush=True)
            # Built from the config only; no checkpoint is read on this line.
            language_model: Qwen2ForCausalLM = Qwen2ForCausalLM(llm_config)
            self._log_stage("LLM weight init", stage_start)

            vit_model = None
            vit_config = None
            if inference_args.visual_und:
                if model_args.vit_type not in ("qwen2_5_vl", "qwen_2_5_vl_original"):
                    raise ValueError(f"Unsupported vit_type: {model_args.vit_type}")
                stage_start = time.perf_counter()
                # NOTE(review): vit_path is not set in _build_base_model_args();
                # presumably filled by apply_inference_defaults() or a default - confirm.
                print(f"[startup][gpu:{self.device}] Loading VIT config: {model_args.vit_path}", flush=True)
                vit_config = Qwen2_5_VLVisionConfig.from_pretrained(model_args.vit_path)
                self._log_stage("VIT config load", stage_start)

                stage_start = time.perf_counter()
                print(
                    f"[startup][gpu:{self.device}] Loading VIT weights: {Path(model_args.vit_path) / 'vit.safetensors'}",
                    flush=True,
                )
                vit_model = Qwen2_5_VisionTransformerPretrainedModel(vit_config)
                vit_weights = load_file(str(Path(model_args.vit_path) / "vit.safetensors"))
                vit_model.load_state_dict(vit_weights, strict=True)
                self._log_stage("VIT weight load", stage_start)
                # The state dict has been copied into the module; hand it to clean_memory().
                clean_memory(vit_weights)

            if inference_args.visual_gen:
                stage_start = time.perf_counter()
                print(f"[startup][gpu:{self.device}] Initializing VAE", flush=True)
                vae_model = WanVideoVAE()
                # Private copy of the VAE's config for use in LanceConfig and request building.
                vae_config = deepcopy(vae_model.vae_config)
                self._log_stage("VAE init", stage_start)
            else:
                vae_model = None
                vae_config = None

            config = LanceConfig(
                visual_gen=inference_args.visual_gen,
                visual_und=inference_args.visual_und,
                llm_config=llm_config,
                vit_config=vit_config if inference_args.visual_und else None,
                vae_config=vae_config if inference_args.visual_gen else None,
                latent_patch_size=model_args.latent_patch_size,
                max_num_frames=model_args.max_num_frames,
                max_latent_size=model_args.max_latent_size,
                vit_max_num_patch_per_side=model_args.vit_max_num_patch_per_side,
                connector_act=model_args.connector_act,
                interpolate_pos=model_args.interpolate_pos,
                timestep_shift=inference_args.timestep_shift,
            )
            model: Lance = Lance(
                language_model=language_model,
                vit_model=vit_model if inference_args.visual_und else None,
                vit_type=model_args.vit_type,
                config=config,
                training_args=inference_args,
            )

            stage_start = time.perf_counter()
            print(f"[startup][gpu:{self.device}] Moving Lance model to GPU {self.device}", flush=True)
            model = model.to(self.device)
            self._log_stage("Lance model move to GPU", stage_start)

            stage_start = time.perf_counter()
            print(f"[startup][gpu:{self.device}] Loading tokenizer: {model_args.model_path}", flush=True)
            tokenizer: Qwen2Tokenizer = Qwen2Tokenizer.from_pretrained(model_args.model_path)
            tokenizer, new_token_ids, num_new_tokens = add_special_tokens(tokenizer)
            self._log_stage("tokenizer load and special token init", stage_start, extra=f"num_new_tokens={num_new_tokens}")

            if inference_args.copy_init_moe:
                language_model.init_moe()

            # NOTE(review): presumably loads checkpoint weights from
            # model_args.model_path into `model` - confirm in inference_lance.
            init_from_model_path_if_needed(model, model_args)

            if num_new_tokens > 0:
                # Grow the embedding table to cover the added special tokens and
                # keep both config copies of vocab_size in sync with it.
                model.language_model.resize_token_embeddings(len(tokenizer))
                model.config.llm_config.vocab_size = len(tokenizer)
                model.language_model.config.vocab_size = len(tokenizer)

            if model_args.vit_type.lower() == "qwen2_5_vl":
                from common.model.hacks import hack_qwen2_5_vl_config

                language_model = hack_qwen2_5_vl_config(language_model)

            # The "image" token id used downstream is the config's video token id.
            # NOTE(review): for vit_type "qwen_2_5_vl_original" the hack above is
            # skipped - confirm config.video_token_id is still defined in that case.
            image_token_id = language_model.config.video_token_id
            new_token_ids.update({"image_token_id": image_token_id})
            model.update_tokenizer(tokenizer=tokenizer)

            if model_args.tie_word_embeddings:
                model.language_model.untie_lm_head()
                model.language_model.copy_new_token_rows_to_lm_head(num_new_tokens)
                model_args.tie_word_embeddings = False
                llm_config.tie_word_embeddings = False
            else:
                # Untied configuration: input and output embeddings must not share storage.
                assert (
                    model.language_model.get_input_embeddings().weight.data.data_ptr()
                    != model.language_model.get_output_embeddings().weight.data.data_ptr()
                ), "tie_word_embeddings conflict"

            model = model.to(device=self.device, dtype=torch.bfloat16)
            model.eval()
            if vae_model is not None and hasattr(vae_model, "eval"):
                vae_model.eval()

            # Publish the loaded objects; `initialized` is set only after all of them.
            self.model = model
            self.vae_model = vae_model
            self.vae_config = vae_config
            self.tokenizer = tokenizer
            self.new_token_ids = new_token_ids
            self.image_token_id = image_token_id
            self.initialized = True
            print(
                f"[startup][gpu:{self.device}][{self.model_variant}] Lance multimodal Gradio model loaded and ready for reuse.",
                flush=True,
            )
1204
-
1205
- def unload(self) -> None:
1206
- with self._init_lock:
1207
- if self.model is not None:
1208
- self.model.cpu()
1209
- if self.vae_model is not None and hasattr(self.vae_model, "vae"):
1210
- vae_inner = self.vae_model.vae
1211
- if hasattr(vae_inner, "model"):
1212
- vae_inner.model.cpu()
1213
-
1214
- self.model = None
1215
- self.vae_model = None
1216
- self.vae_config = None
1217
- self.tokenizer = None
1218
- self.new_token_ids = None
1219
- self.image_token_id = None
1220
- self.base_model_args = None
1221
- self.base_data_args = None
1222
- self.base_inference_args = None
1223
- self.initialized = False
1224
- gc.collect()
1225
- if torch.cuda.is_available():
1226
- with torch.cuda.device(self.device):
1227
- torch.cuda.empty_cache()
1228
- torch.cuda.ipc_collect()
1229
-
1230
    def _build_request_batch(
        self,
        prompt_file: Path,
        model_args: ModelArguments,
        data_args: DataArguments,
        inference_args: InferenceArguments,
    ):
        """Build the single-sample batch for one request.

        A ``DataConfig`` is loaded from *prompt_file*, overridden with the
        request's model/inference settings, and used to construct a
        ``ValidationDataset`` over the same file. Only sample 0 is collated.

        Args:
            prompt_file: Request file; used both as the dataset config source
                and as the dataset's ``jsonl_path``.
            model_args: Per-request model arguments.
            data_args: Per-request data arguments.
            inference_args: Per-request inference arguments.

        Returns:
            The result of ``simple_custom_collate`` on the first dataset sample.
        """
        # Requires initialize() to have run.
        # NOTE(review): vae_config is asserted unconditionally, although it is
        # only read in the visual_gen branch below.
        assert self.tokenizer is not None
        assert self.new_token_ids is not None
        assert self.vae_config is not None

        dataset_config = DataConfig.from_yaml(str(prompt_file))
        if inference_args.visual_und:
            dataset_config.vit_patch_size = model_args.vit_patch_size
            dataset_config.vit_patch_size_temporal = model_args.vit_patch_size_temporal
            dataset_config.vit_max_num_patch_per_side = model_args.vit_max_num_patch_per_side
        if inference_args.visual_gen:
            # Combine the latent patch size with the VAE's (temporal, spatial, spatial)
            # downsample factors via tuple_mul.
            vae_downsample = tuple_mul(
                tuple(model_args.latent_patch_size),
                (
                    self.vae_config.downsample_temporal,
                    self.vae_config.downsample_spatial,
                    self.vae_config.downsample_spatial,
                ),
            )
            dataset_config.latent_patch_size = model_args.latent_patch_size
            dataset_config.vae_downsample = vae_downsample
            dataset_config.max_latent_size = model_args.max_latent_size
            dataset_config.max_num_frames = model_args.max_num_frames

        dataset_config.text_cond_dropout_prob = model_args.text_cond_dropout_prob
        dataset_config.vae_cond_dropout_prob = model_args.vae_cond_dropout_prob
        dataset_config.vit_cond_dropout_prob = model_args.vit_cond_dropout_prob

        # Per-request geometry, task and template.
        dataset_config.num_frames = inference_args.num_frames
        dataset_config.H = inference_args.video_height
        dataset_config.W = inference_args.video_width
        dataset_config.task = inference_args.task
        dataset_config.resolution = inference_args.resolution
        dataset_config.text_template = inference_args.text_template

        val_dataset = ValidationDataset(
            jsonl_path=str(prompt_file),
            tokenizer=self.tokenizer,
            data_args=data_args,
            model_args=model_args,
            training_args=inference_args,
            new_token_ids=self.new_token_ids,
            dataset_config=dataset_config,
            local_rank=0,
            world_size=1,
        )
        # The request file describes a single sample, so only index 0 is used.
        return simple_custom_collate([val_dataset[0]])
1283
-
1284
- def generate(
1285
- self,
1286
- task: str,
1287
- prompt: str,
1288
- system_prompt: Optional[str],
1289
- input_video: Optional[str],
1290
- input_image: Optional[str],
1291
- height: int,
1292
- width: int,
1293
- num_frames: int,
1294
- seed: int,
1295
- resolution: str,
1296
- validation_num_timesteps: int,
1297
- validation_timestep_shift: float,
1298
- cfg_text_scale: float,
1299
- enable_frame_interpolation: bool,
1300
- ):
1301
- self.initialize()
1302
- internal_task = normalize_task(task)
1303
- prompt = (prompt or "").strip()
1304
- input_video = str(input_video).strip() if input_video else ""
1305
- input_image = str(input_image).strip() if input_image else ""
1306
-
1307
- if internal_task in GENERATION_TASKS and not prompt:
1308
- return None, None, "", "Please enter a prompt.", ""
1309
- if internal_task in UNDERSTANDING_TASKS and not prompt:
1310
- return None, None, "", "Please enter a question.", ""
1311
- if internal_task in {TASK_VIDEO_EDIT, TASK_X2T_VIDEO} and not input_video:
1312
- return None, None, "", "Please upload an input video.", ""
1313
- if internal_task in {TASK_IMAGE_EDIT, TASK_X2T_IMAGE} and not input_image:
1314
- return None, None, "", "Please upload an input image.", ""
1315
- if height <= 0 or width <= 0:
1316
- return None, None, "", "Height and width must be greater than 0.", ""
1317
- if num_frames <= 0:
1318
- return None, None, "", "The number of frames must be greater than 0.", ""
1319
-
1320
- assert self.model is not None
1321
- assert self.tokenizer is not None
1322
- assert self.new_token_ids is not None
1323
- assert self.image_token_id is not None
1324
- assert self.base_model_args is not None
1325
- assert self.base_data_args is not None
1326
- assert self.base_inference_args is not None
1327
- active_model_path = self.base_model_args.model_path
1328
-
1329
- with self._generate_lock:
1330
- torch.cuda.set_device(self.device)
1331
- actual_seed = normalize_seed(int(seed))
1332
- prompt_file = create_request_json(
1333
- task=internal_task,
1334
- prompt=prompt,
1335
- input_video=input_video,
1336
- input_image=input_image,
1337
- system_prompt=system_prompt,
1338
- )
1339
- save_dir = build_save_dir(internal_task)
1340
- save_dir.mkdir(parents=True, exist_ok=True)
1341
- request_started_at = datetime.now().isoformat(timespec="seconds")
1342
-
1343
- request_model_args = deepcopy(self.base_model_args)
1344
- request_model_args.cfg_text_scale = float(cfg_text_scale)
1345
-
1346
- request_data_args = deepcopy(self.base_data_args)
1347
- request_data_args.val_dataset_config_file = str(prompt_file)
1348
-
1349
- request_inference_args = deepcopy(self.base_inference_args)
1350
- request_inference_args.validation_num_timesteps = int(validation_num_timesteps)
1351
- request_inference_args.validation_timestep_shift = float(validation_timestep_shift)
1352
- request_inference_args.validation_data_seed = actual_seed
1353
- request_inference_args.validation_noise_seed = actual_seed
1354
- request_inference_args.video_height = int(height)
1355
- request_inference_args.video_width = int(width)
1356
- request_inference_args.num_frames = int(num_frames)
1357
- display_resolution = str(resolution)
1358
- backend_resolution = normalize_resolution_for_backend(display_resolution, internal_task)
1359
- request_inference_args.resolution = backend_resolution
1360
- request_inference_args.save_path_gen = str(save_dir)
1361
- request_inference_args.task = internal_task
1362
- request_inference_args.text_template = TEXT_TEMPLATE
1363
- request_inference_args.prompt_data_dict = {}
1364
-
1365
- try:
1366
- print(
1367
- "[lance_gradio_t2v_v2t] Start generation "
1368
- f"| task={internal_task} | gpu={self.device} | seed={actual_seed} | "
1369
- f"size={height}x{width} | frames={num_frames} | resolution={display_resolution}",
1370
- flush=True,
1371
- )
1372
- val_data_cpu = self._build_request_batch(
1373
- prompt_file=prompt_file,
1374
- model_args=request_model_args,
1375
- data_args=request_data_args,
1376
- inference_args=request_inference_args,
1377
- )
1378
- generate_start = time.perf_counter()
1379
- validate_on_fixed_batch(
1380
- fsdp_model=self.model,
1381
- vae_model=self.vae_model,
1382
- tokenizer=self.tokenizer,
1383
- val_data_cpu=val_data_cpu,
1384
- training_args=request_inference_args,
1385
- model_args=request_model_args,
1386
- inference_args=request_inference_args,
1387
- new_token_ids=self.new_token_ids,
1388
- image_token_id=self.image_token_id,
1389
- device=self.device,
1390
- save_source_video=False,
1391
- save_path_gen=request_inference_args.save_path_gen,
1392
- save_path_gt="",
1393
- )
1394
- elapsed = time.perf_counter() - generate_start
1395
- save_prompt_results(request_inference_args.prompt_data_dict, request_inference_args.save_path_gen, self.logger)
1396
- clean_memory()
1397
-
1398
- video_path = find_generated_video(save_dir) if internal_task in {TASK_T2V, TASK_VIDEO_EDIT} else None
1399
- original_video_path = video_path
1400
- rife_log = ""
1401
- rife_error = ""
1402
- frame_interpolation_enabled = normalize_frame_interpolation(enable_frame_interpolation) and internal_task in {TASK_T2V, TASK_VIDEO_EDIT}
1403
- if frame_interpolation_enabled and video_path is not None:
1404
- try:
1405
- clean_memory()
1406
- print(
1407
- "[rife] Start frame interpolation "
1408
- f"| task={internal_task} | gpu={self.device} | input={video_path}",
1409
- flush=True,
1410
- )
1411
- video_path, rife_log = run_rife_interpolation(video_path, self.device, exp=1)
1412
- except Exception:
1413
- rife_error = traceback.format_exc()
1414
- print(rife_error, flush=True)
1415
- image_path = find_generated_image(save_dir) if internal_task in {TASK_T2I, TASK_IMAGE_EDIT} else None
1416
- text_result = extract_text_result(save_dir) if internal_task in UNDERSTANDING_TASKS else ""
1417
- record = {
1418
- "request_started_at": request_started_at,
1419
- "request_finished_at": datetime.now().isoformat(timespec="seconds"),
1420
- "status": "success",
1421
- "task": internal_task,
1422
- "model_variant": self.model_variant,
1423
- "model_path": active_model_path,
1424
- "gpu": self.device,
1425
- "prompt": prompt,
1426
- "system_prompt": normalize_understanding_system_prompt(internal_task, system_prompt)
1427
- if internal_task in UNDERSTANDING_TASKS
1428
- else "",
1429
- "input_video": input_video,
1430
- "input_image": input_image,
1431
- "seed": actual_seed,
1432
- "height": int(height),
1433
- "width": int(width),
1434
- "num_frames": int(num_frames),
1435
- "resolution": display_resolution,
1436
- "backend_resolution": backend_resolution,
1437
- "validation_num_timesteps": int(validation_num_timesteps),
1438
- "validation_timestep_shift": float(validation_timestep_shift),
1439
- "cfg_text_scale": float(cfg_text_scale),
1440
- "frame_interpolation": frame_interpolation_enabled,
1441
- "elapsed_seconds": round(elapsed, 3),
1442
- "prompt_file": str(prompt_file),
1443
- "output_dir": str(save_dir),
1444
- "original_video_path": str(original_video_path) if original_video_path is not None else "",
1445
- "video_path": str(video_path) if video_path is not None else "",
1446
- "image_path": str(image_path) if image_path is not None else "",
1447
- "text_result": text_result,
1448
- "rife_error": rife_error,
1449
- }
1450
- if internal_task in {TASK_T2V, TASK_VIDEO_EDIT} and video_path is None:
1451
- record["status"] = "completed_without_video"
1452
- if internal_task in {TASK_T2I, TASK_IMAGE_EDIT} and image_path is None:
1453
- record["status"] = "completed_without_image"
1454
- if internal_task in UNDERSTANDING_TASKS and not text_result:
1455
- record["status"] = "completed_without_text"
1456
- save_generation_record(record, save_dir)
1457
-
1458
- logs = "\n".join(
1459
- [
1460
- "[lance_gradio_t2v_v2t] Inference finished in-process.",
1461
- f"task={internal_task}",
1462
- f"model_variant={self.model_variant}",
1463
- f"model_path={active_model_path}",
1464
- f"gpu={self.device}",
1465
- f"seed={actual_seed}",
1466
- f"height={height}",
1467
- f"width={width}",
1468
- f"num_frames={num_frames}",
1469
- f"resolution={display_resolution}",
1470
- f"backend_resolution={backend_resolution}",
1471
- f"validation_num_timesteps={validation_num_timesteps}",
1472
- f"validation_timestep_shift={validation_timestep_shift}",
1473
- f"cfg_text_scale={cfg_text_scale}",
1474
- f"frame_interpolation={frame_interpolation_enabled}",
1475
- f"original_video_path={original_video_path or ''}",
1476
- f"rife_error={rife_error.strip() if rife_error else ''}",
1477
- f"elapsed={elapsed:.2f}s",
1478
- f"output_dir={save_dir}",
1479
- rife_log,
1480
- ]
1481
- )
1482
-
1483
- if internal_task in {TASK_T2V, TASK_VIDEO_EDIT}:
1484
- if video_path is None:
1485
- status = (
1486
- "Inference completed, but no output video was found.\n\n"
1487
- f"- Task: `{internal_task}`\n"
1488
- f"- Model: `{self.model_variant}`\n"
1489
- f"- Model path: `{active_model_path}`\n"
1490
- f"- GPU: `{self.device}`\n"
1491
- f"- Actual seed: `{actual_seed}`\n"
1492
- f"- Output directory: `{save_dir}`"
1493
- )
1494
- return None, None, "", status, logs
1495
- # status = (
1496
- # "Inference completed.\n\n"
1497
- # f"- Task: `{internal_task}`\n"
1498
- # f"- Model: `{self.model_variant}`\n"
1499
- # f"- Model path: `{active_model_path}`\n"
1500
- # f"- GPU: `{self.device}`\n"
1501
- # f"- Actual seed: `{actual_seed}`\n"
1502
- # f"- Output directory: `{save_dir}`\n"
1503
- # f"- Result file: `{video_path}`"
1504
- # )
1505
- status = ""
1506
- return str(video_path), None, "", status, logs
1507
-
1508
- if internal_task in {TASK_T2I, TASK_IMAGE_EDIT}:
1509
- if image_path is None:
1510
- status = (
1511
- "Inference completed, but no output image was found.\n\n"
1512
- f"- Task: `{internal_task}`\n"
1513
- f"- Model: `{self.model_variant}`\n"
1514
- f"- Model path: `{active_model_path}`\n"
1515
- f"- GPU: `{self.device}`\n"
1516
- f"- Actual seed: `{actual_seed}`\n"
1517
- f"- Output directory: `{save_dir}`"
1518
- )
1519
- return None, None, "", status, logs
1520
- # status = (
1521
- # "Inference completed.\n\n"
1522
- # f"- Task: `{internal_task}`\n"
1523
- # f"- Model: `{self.model_variant}`\n"
1524
- # f"- Model path: `{active_model_path}`\n"
1525
- # f"- GPU: `{self.device}`\n"
1526
- # f"- Actual seed: `{actual_seed}`\n"
1527
- # f"- Output directory: `{save_dir}`\n"
1528
- # f"- Result file: `{image_path}`"
1529
- # )
1530
- status = ""
1531
- return None, str(image_path), "", status, logs
1532
-
1533
- # status = (
1534
- # "Understanding completed.\n\n"
1535
- # f"- Task: `{task}`\n"
1536
- # f"- Model: `{self.model_variant}`\n"
1537
- # f"- Model path: `{active_model_path}`\n"
1538
- # f"- GPU: `{self.device}`\n"
1539
- # f"- Actual seed: `{actual_seed}`\n"
1540
- # f"- Output directory: `{save_dir}`"
1541
- # )
1542
- status = ""
1543
- return None, None, text_result, status, logs
1544
- except Exception:
1545
- error_trace = traceback.format_exc()
1546
- print(error_trace, flush=True)
1547
- record = {
1548
- "request_started_at": request_started_at,
1549
- "request_finished_at": datetime.now().isoformat(timespec="seconds"),
1550
- "status": "failed",
1551
- "task": internal_task,
1552
- "model_variant": self.model_variant,
1553
- "model_path": active_model_path,
1554
- "gpu": self.device,
1555
- "prompt": prompt,
1556
- "input_video": input_video,
1557
- "input_image": input_image,
1558
- "seed": actual_seed,
1559
- "height": int(height),
1560
- "width": int(width),
1561
- "num_frames": int(num_frames),
1562
- "resolution": display_resolution,
1563
- "backend_resolution": backend_resolution,
1564
- "validation_num_timesteps": int(validation_num_timesteps),
1565
- "validation_timestep_shift": float(validation_timestep_shift),
1566
- "cfg_text_scale": float(cfg_text_scale),
1567
- "prompt_file": str(prompt_file),
1568
- "output_dir": str(save_dir),
1569
- "video_path": "",
1570
- "image_path": "",
1571
- "text_result": "",
1572
- "error": error_trace,
1573
- }
1574
- save_generation_record(record, save_dir)
1575
- status = (
1576
- "Inference failed.\n\n"
1577
- f"- Task: `{internal_task}`\n"
1578
- f"- Model: `{self.model_variant}`\n"
1579
- f"- Model path: `{active_model_path}`\n"
1580
- f"- GPU: `{self.device}`\n"
1581
- f"- Actual seed: `{actual_seed}`\n"
1582
- f"- Resolution: `{display_resolution}`\n"
1583
- f"- Output directory: `{save_dir}`"
1584
- )
1585
- return None, None, "", status, error_trace
1586
-
1587
-
1588
class PipelinePool:
    """Fixed pool of ``LanceT2VV2TPipeline`` objects, one per configured GPU.

    Callers borrow a pipeline with ``acquire()`` and hand it back with
    ``release()``; ``generate()`` wraps that pair around a single request.
    Idle pipelines are kept in a deque guarded by a ``threading.Condition``.
    """

    def __init__(self, gpu_ids: list[int], model_variant: str = MODEL_VARIANT_VIDEO) -> None:
        """Create one (not yet loaded) pipeline per GPU id.

        Raises:
            ValueError: If *gpu_ids* is empty.
        """
        if not gpu_ids:
            raise ValueError("At least one GPU must be configured.")
        self.gpu_ids = gpu_ids
        self.model_variant = normalize_model_variant(model_variant)
        self.pipelines = [
            LanceT2VV2TPipeline(device_id=gpu_id, model_variant=self.model_variant)
            for gpu_id in gpu_ids
        ]
        # Idle pipelines; guarded by self._condition.
        self._available = deque(self.pipelines)
        self._condition = threading.Condition()

    @property
    def size(self) -> int:
        """Number of pipelines in the pool."""
        return len(self.pipelines)

    @property
    def gpu_summary(self) -> str:
        """Comma-separated list of the configured GPU ids."""
        return ",".join(str(gpu_id) for gpu_id in self.gpu_ids)

    def initialize_all(self) -> None:
        """Initialize every pipeline in parallel, one worker thread per pipeline.

        Raises:
            RuntimeError: If at least one pipeline failed to initialize; the
                first collected exception is attached as the cause.
        """
        print(f"[startup][{self.model_variant}] Preparing parallel GPU preload: {self.gpu_ids}", flush=True)
        exceptions: list[Exception] = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.size) as executor:
            futures = {
                executor.submit(pipeline.initialize): pipeline.device for pipeline in self.pipelines
            }
            for future in concurrent.futures.as_completed(futures):
                gpu_id = futures[future]
                try:
                    future.result()
                except Exception as exc:
                    # Keep collecting so every failing GPU is reported before raising.
                    print(f"[startup][gpu:{gpu_id}][{self.model_variant}] Preload failed: {exc}", flush=True)
                    exceptions.append(exc)
        if exceptions:
            raise RuntimeError(
                f"{self.model_variant} preload failed on {len(exceptions)} GPU(s). Please check the terminal logs."
            ) from exceptions[0]
        print(
            f"[startup][{self.model_variant}] GPU preload finished. Ready to handle {self.size} concurrent request(s).",
            flush=True,
        )

    def acquire(self) -> LanceT2VV2TPipeline:
        """Block until a pipeline is idle, then remove it from the pool and return it."""
        with self._condition:
            while not self._available:
                self._condition.wait()
            return self._available.popleft()

    def release(self, pipeline: LanceT2VV2TPipeline) -> None:
        """Return *pipeline* to the pool and wake one waiting ``acquire()``."""
        with self._condition:
            self._available.append(pipeline)
            self._condition.notify()

    def unload_all(self) -> None:
        """Wait until every pipeline is idle, then unload all of them.

        The pipelines are checked out of the pool while they are being
        unloaded, so a concurrent ``acquire()`` blocks instead of receiving a
        pipeline whose model is being torn down. They are put back afterwards
        (even if an unload raises); a later request reloads them lazily.
        """
        print(f"[runtime][{self.model_variant}] Unloading model pool from GPU(s): {self.gpu_ids}", flush=True)
        with self._condition:
            while len(self._available) != len(self.pipelines):
                self._condition.wait()
            # Take every pipeline out while still holding the condition; merely
            # waiting and then dropping the lock would let another thread
            # acquire a pipeline between the wait and the unload.
            checked_out = list(self._available)
            self._available.clear()

        try:
            for pipeline in checked_out:
                pipeline.unload()
        finally:
            with self._condition:
                self._available.extend(checked_out)
                # Several acquirers may have queued up while the pool was empty.
                self._condition.notify_all()

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        print(f"[runtime][{self.model_variant}] Model pool unloaded.", flush=True)

    def generate(
        self,
        task: str,
        prompt: str,
        system_prompt: Optional[str],
        input_video: Optional[str],
        input_image: Optional[str],
        height: int,
        width: int,
        num_frames: int,
        seed: int,
        resolution: str,
        validation_num_timesteps: int,
        validation_timestep_shift: float,
        cfg_text_scale: float,
        enable_frame_interpolation: bool,
    ):
        """Run one request on the next idle pipeline and return its result.

        All arguments are forwarded unchanged to ``LanceT2VV2TPipeline.generate``.
        The call additionally holds the per-GPU runtime lock returned by
        ``get_gpu_runtime_lock`` for the pipeline's device, and the pipeline is
        always returned to the pool afterwards.
        """
        pipeline = self.acquire()
        try:
            with get_gpu_runtime_lock(pipeline.device):
                return pipeline.generate(
                    task=task,
                    prompt=prompt,
                    system_prompt=system_prompt,
                    input_video=input_video,
                    input_image=input_image,
                    height=height,
                    width=width,
                    num_frames=num_frames,
                    seed=seed,
                    resolution=resolution,
                    validation_num_timesteps=validation_num_timesteps,
                    validation_timestep_shift=validation_timestep_shift,
                    cfg_text_scale=cfg_text_scale,
                    enable_frame_interpolation=enable_frame_interpolation,
                )
        finally:
            self.release(pipeline)
1696
-
1697
-
1698
# Loaded pipeline pools keyed by model variant. get_pipeline_pool() reads and
# writes this mapping while holding ACTIVE_POOL_LOCK.
ACTIVE_PIPELINE_POOLS: dict[str, PipelinePool] = {}
ACTIVE_POOL_LOCK = threading.Lock()
# One lock per GPU index, created on demand by get_gpu_runtime_lock(); the
# mapping itself is guarded by GPU_RUNTIME_LOCKS_LOCK.
GPU_RUNTIME_LOCKS: dict[int, threading.Lock] = {}
GPU_RUNTIME_LOCKS_LOCK = threading.Lock()
QUEUE_MAX_SIZE = DEFAULT_QUEUE_SIZE
# Model variants listed for preloading.
# NOTE(review): presumably passed to preload_pipeline_pools() at startup - confirm at the call site.
PRELOAD_MODEL_VARIANTS = [MODEL_VARIANT_VIDEO, MODEL_VARIANT_IMAGE]
1704
-
1705
-
1706
- def get_gpu_runtime_lock(device_id: int) -> threading.Lock:
1707
- with GPU_RUNTIME_LOCKS_LOCK:
1708
- lock = GPU_RUNTIME_LOCKS.get(device_id)
1709
- if lock is None:
1710
- lock = threading.Lock()
1711
- GPU_RUNTIME_LOCKS[device_id] = lock
1712
- return lock
1713
-
1714
-
1715
- def get_task_model_variant(task: str) -> str:
1716
- internal_task = normalize_task(task)
1717
- return MODEL_VARIANT_IMAGE if internal_task in IMAGE_TASKS else MODEL_VARIANT_VIDEO
1718
-
1719
-
1720
- def get_pipeline_pool(task: str) -> PipelinePool:
1721
- model_variant = get_task_model_variant(task)
1722
- with ACTIVE_POOL_LOCK:
1723
- pipeline_pool = ACTIVE_PIPELINE_POOLS.get(model_variant)
1724
- if pipeline_pool is not None:
1725
- return pipeline_pool
1726
-
1727
- gpu_ids = parse_gpu_ids(os.getenv("LANCE_GPUS", DEFAULT_GPUS))
1728
- print(
1729
- f"[runtime] Loading Lance {model_variant} model pool without unloading existing pools.",
1730
- flush=True,
1731
- )
1732
- pipeline_pool = PipelinePool(gpu_ids, model_variant=model_variant)
1733
- pipeline_pool.initialize_all()
1734
- ACTIVE_PIPELINE_POOLS[model_variant] = pipeline_pool
1735
- return pipeline_pool
1736
-
1737
-
1738
- def preload_pipeline_pools(gpu_ids: list[int], model_variants: list[str]) -> None:
1739
- for model_variant in model_variants:
1740
- normalized_variant = normalize_model_variant(model_variant)
1741
- if normalized_variant in ACTIVE_PIPELINE_POOLS:
1742
- continue
1743
- resolved_model_path = ensure_model_assets(normalized_variant)
1744
- print(
1745
- f"[startup][{normalized_variant}] Using Lance model path: {resolved_model_path}",
1746
- flush=True,
1747
- )
1748
- pipeline_pool = PipelinePool(gpu_ids, model_variant=normalized_variant)
1749
- pipeline_pool.initialize_all()
1750
- ACTIVE_PIPELINE_POOLS[normalized_variant] = pipeline_pool
1751
-
1752
-
1753
- def run_task(
1754
- task: str,
1755
- prompt: str,
1756
- system_prompt: Optional[str],
1757
- input_video: Optional[str],
1758
- input_image: Optional[str],
1759
- height: int,
1760
- width: int,
1761
- num_frames: int,
1762
- seed: int,
1763
- resolution: str,
1764
- validation_num_timesteps: int,
1765
- validation_timestep_shift: float,
1766
- cfg_text_scale: float,
1767
- enable_frame_interpolation: bool,
1768
- ):
1769
- internal_task = normalize_task(task)
1770
- if internal_task == TASK_T2V:
1771
- num_frames = video_seconds_to_num_frames(num_frames)
1772
- pipeline_pool = get_pipeline_pool(task)
1773
- return pipeline_pool.generate(
1774
- task=task,
1775
- prompt=prompt,
1776
- system_prompt=system_prompt,
1777
- input_video=input_video,
1778
- input_image=input_image,
1779
- height=height,
1780
- width=width,
1781
- num_frames=num_frames,
1782
- seed=seed,
1783
- resolution=resolution,
1784
- validation_num_timesteps=validation_num_timesteps,
1785
- validation_timestep_shift=validation_timestep_shift,
1786
- cfg_text_scale=cfg_text_scale,
1787
- enable_frame_interpolation=enable_frame_interpolation,
1788
- )
1789
-
1790
-
1791
- def build_status_markdown() -> str:
1792
- gpu_text = "unknown"
1793
- concurrency = 1
1794
- loaded_variants = "none"
1795
- if ACTIVE_PIPELINE_POOLS:
1796
- loaded_variants = ",".join(sorted(ACTIVE_PIPELINE_POOLS))
1797
- gpu_ids = sorted({gpu_id for pool in ACTIVE_PIPELINE_POOLS.values() for gpu_id in pool.gpu_ids})
1798
- gpu_text = ",".join(str(gpu_id) for gpu_id in gpu_ids)
1799
- concurrency = len(gpu_ids)
1800
- return (
1801
- f"**Status** GPU: `{gpu_text}` | Max concurrency: `{concurrency}` | "
1802
- f"Queue limit: `{QUEUE_MAX_SIZE}` | Loaded models: `{loaded_variants}` | "
1803
- f"Switch mode: `dual resident`"
1804
- )
1805
-
1806
-
1807
- def get_logo_data_uri() -> str:
1808
- if not LANCE_LOGO_PATH.exists():
1809
- return ""
1810
- encoded_logo = base64.b64encode(LANCE_LOGO_PATH.read_bytes()).decode("ascii")
1811
- return f"data:image/webp;base64,{encoded_logo}"
1812
-
1813
-
1814
- def build_header_html() -> str:
1815
- logo_data_uri = get_logo_data_uri()
1816
- logo_html = (
1817
- f'<img class="lance-logo" src="{logo_data_uri}" alt="Lance logo">'
1818
- if logo_data_uri
1819
- else ""
1820
- )
1821
- return f"""
1822
- <div class="lance-hero">
1823
- {logo_html}
1824
- <h1 class="lance-title">Lance: Unified Multimodal Modeling by Multi-Task Synergy</h1>
1825
- <div class="lance-authors">
1826
- <strong>
1827
- <a href="https://scholar.google.com.hk/citations?user=FXxoQlsAAAAJ&hl=zh-CN&oi=ao" target="_blank">Fengyi Fu</a><sup>*</sup>,
1828
- <a href="https://corleone-huang.github.io/" target="_blank">Mengqi Huang</a><sup>*,✉</sup>,
1829
- <a href="https://scholar.google.com.hk/citations?user=9ER6nVkAAAAJ&hl=zh-CN&oi=ao" target="_blank">Shaojin Wu</a><sup>*</sup>,
1830
- Yunsheng Jiang<sup>*</sup>,
1831
- Yufei Huo,
1832
- <a href="https://guojianzhu.com/" target="_blank">Jianzhu Guo</a><sup>✉,§</sup>
1833
- </strong><br>
1834
- Hao Li, Yinghang Song, Fei Ding, Qian He, Zheren Fu, Zhendong Mao, Yongdong Zhang<br>
1835
- <em>ByteDance</em>
1836
- </div>
1837
- <div class="lance-badges">
1838
- <a href="{LANCE_HOMEPAGE_URL}" target="_blank" rel="noopener noreferrer">
1839
- <img alt="Homepage" src="https://img.shields.io/badge/Homepage-Lance-blue?style=flat">
1840
- </a>
1841
- <a href="{LANCE_PAPER_URL}" target="_blank" rel="noopener noreferrer">
1842
- <img alt="Paper" src="https://img.shields.io/badge/Paper-arXiv-red?style=flat&logo=arxiv">
1843
- </a>
1844
- <a href="{LANCE_HUGGING_FACE_URL}" target="_blank" rel="noopener noreferrer">
1845
- <img alt="Hugging Face" src="https://img.shields.io/badge/Model-HuggingFace-yellow?style=flat&logo=huggingface">
1846
- </a>
1847
- <a href="{LANCE_GITHUB_URL}" target="_blank" rel="noopener noreferrer">
1848
- <img alt="GitHub" src="https://img.shields.io/badge/Code-GitHub-536af5?color=536af5&logo=github">
1849
- </a>
1850
- </div>
1851
- </div>
1852
- """
1853
-
1854
-
1855
- def update_task_ui(task: str):
1856
- internal_task = normalize_task(task)
1857
- is_image_task = internal_task in IMAGE_TASKS
1858
- is_video_task = internal_task in VIDEO_TASKS
1859
- is_edit_task = internal_task in EDIT_TASKS
1860
- is_understanding_task = internal_task in UNDERSTANDING_TASKS
1861
- is_generation_task = internal_task in GENERATION_TASKS
1862
- show_media_input = is_edit_task or is_understanding_task
1863
- resolution_choices = IMAGE_RESOLUTION_CHOICES if is_image_task else VIDEO_RESOLUTION_CHOICES
1864
- resolution_value = DEFAULT_IMAGE_RESOLUTION if is_image_task else DEFAULT_RESOLUTION
1865
- aspect_ratio_value = DEFAULT_IMAGE_ASPECT_RATIO if is_image_task else DEFAULT_VIDEO_ASPECT_RATIO
1866
- width_value, height_value = get_size_for_aspect_ratio(internal_task, aspect_ratio_value)
1867
- size_markdown = format_size_markdown(internal_task, width_value, height_value)
1868
- system_prompt_choices = get_understanding_system_prompt_choices(internal_task)
1869
-
1870
- if is_generation_task:
1871
- text_label = "Prompt"
1872
- text_placeholder = "Describe what you want to generate..."
1873
- elif is_edit_task:
1874
- text_label = "Instruction"
1875
- text_placeholder = "Describe the edit you want..."
1876
- else:
1877
- text_label = "Question"
1878
- text_placeholder = "Ask a question about the input..."
1879
-
1880
- return (
1881
- gr.update(
1882
- label=text_label,
1883
- placeholder=text_placeholder,
1884
- visible=True,
1885
- ),
1886
- gr.update(
1887
- choices=system_prompt_choices,
1888
- value=system_prompt_choices[0],
1889
- visible=False,
1890
- ),
1891
- gr.update(label="Input Video", visible=show_media_input and is_video_task),
1892
- gr.update(label="Input Image", visible=show_media_input and is_image_task),
1893
- gr.update(value=aspect_ratio_value, visible=is_generation_task or is_edit_task),
1894
- gr.update(value=height_value),
1895
- gr.update(value=width_value),
1896
- gr.update(value=size_markdown, visible=is_generation_task or is_edit_task),
1897
- gr.update(visible=internal_task == TASK_T2V, value=DEFAULT_VIDEO_DURATION_SECONDS if internal_task == TASK_T2V else 1),
1898
- gr.update(visible=internal_task in {TASK_T2V, TASK_VIDEO_EDIT}, value=DEFAULT_FRAME_INTERPOLATION),
1899
- gr.update(choices=resolution_choices, value=resolution_value, visible=False),
1900
- gr.update(visible=internal_task in {TASK_T2V, TASK_VIDEO_EDIT}),
1901
- gr.update(visible=internal_task in {TASK_T2I, TASK_IMAGE_EDIT}),
1902
- gr.update(visible=is_understanding_task, value=""),
1903
- gr.update(visible=internal_task == TASK_T2V),
1904
- gr.update(visible=internal_task == TASK_VIDEO_EDIT),
1905
- gr.update(visible=internal_task == TASK_X2T_VIDEO),
1906
- gr.update(visible=internal_task == TASK_T2I),
1907
- gr.update(visible=internal_task == TASK_IMAGE_EDIT),
1908
- gr.update(visible=internal_task == TASK_X2T_IMAGE),
1909
- )
1910
-
1911
-
1912
- def keep_example_clicks_from_changing_visibility(*examples_components) -> None:
1913
- for examples_component in examples_components:
1914
- dataset = getattr(examples_component, "dataset", None)
1915
- component_props = getattr(dataset, "component_props", None)
1916
- if not component_props:
1917
- continue
1918
- for props in component_props:
1919
- props.pop("visible", None)
1920
-
1921
-
1922
- def build_demo() -> gr.Blocks:
1923
- with gr.Blocks(title="Lance", css=APP_CSS) as demo:
1924
- gr.HTML(build_header_html())
1925
- gr.Markdown(build_status_markdown(), elem_classes=["lance-status"], visible=False)
1926
-
1927
- with gr.Row(elem_classes=["lance-main-row"]):
1928
- with gr.Column(scale=1, elem_classes=["lance-main-column"]):
1929
- task = gr.Radio(
1930
- label="Task",
1931
- choices=TASK_CHOICES,
1932
- value=TASK_LABEL_VIDEO_GENERATION,
1933
- elem_classes=["task-selector"],
1934
- )
1935
- prompt = gr.Textbox(
1936
- label="Prompt",
1937
- lines=6,
1938
- placeholder="Describe the video you want to generate...",
1939
- )
1940
- system_prompt = gr.Dropdown(
1941
- label="System Prompt",
1942
- choices=get_understanding_system_prompt_choices(TASK_X2T_VIDEO),
1943
- value=V2T_QA_SYSTEM_PROMPT,
1944
- visible=False,
1945
- )
1946
- input_video = gr.Video(label="Input Video", visible=False, elem_classes=["lance-display-frame"])
1947
- input_image = gr.Image(label="Input Image", type="filepath", visible=False, elem_classes=["lance-display-frame"])
1948
- with gr.Row(elem_classes=["generation-controls-row"]):
1949
- enable_frame_interpolation = gr.Dropdown(
1950
- label="Frame Interpolation",
1951
- choices=[FRAME_INTERPOLATION_YES, FRAME_INTERPOLATION_NO],
1952
- value=DEFAULT_FRAME_INTERPOLATION,
1953
- elem_classes=["generation-control", "generation-dropdown-control"],
1954
- min_width=0,
1955
- )
1956
- seed = gr.Number(
1957
- label="Seed (-1 for random seed)",
1958
- value=DEFAULT_BASIC_SEED,
1959
- precision=0,
1960
- elem_classes=["generation-control", "generation-value-control"],
1961
- min_width=0,
1962
- # info="-1 for random seed",
1963
- )
1964
- aspect_ratio = gr.Dropdown(
1965
- label="Aspect Ratio",
1966
- # choices=ASPECT_RATIO_CHOICES, # 原始版本,不显示 是否为 default
1967
- choices=get_aspect_ratio_choices_for_task(TASK_T2V),
1968
- value=DEFAULT_VIDEO_ASPECT_RATIO,
1969
- elem_classes=["generation-control", "generation-dropdown-control"],
1970
- min_width=0,
1971
- )
1972
- # real_size = gr.Markdown(format_size_markdown(TASK_T2V, DEFAULT_WIDTH, DEFAULT_HEIGHT))
1973
- real_size = gr.Textbox(
1974
- label="Output Resolution",
1975
- value=format_size_markdown(TASK_T2V, DEFAULT_WIDTH, DEFAULT_HEIGHT),
1976
- interactive=False,
1977
- elem_classes=["generation-control", "generation-value-control"],
1978
- min_width=0,
1979
- )
1980
- resolution = gr.Dropdown(
1981
- label="Resolution",
1982
- choices=RESOLUTION_CHOICES,
1983
- value=DEFAULT_RESOLUTION,
1984
- visible=False,
1985
- )
1986
- height = gr.Number(value=DEFAULT_HEIGHT, precision=0, visible=False)
1987
- width = gr.Number(value=DEFAULT_WIDTH, precision=0, visible=False)
1988
- num_frames = gr.Slider(
1989
- minimum=1,
1990
- maximum=10,
1991
- step=1,
1992
- value=DEFAULT_VIDEO_DURATION_SECONDS,
1993
- label="Video Duration (seconds)",
1994
- )
1995
- # seed = gr.Number(
1996
- # label="Seed",
1997
- # value=DEFAULT_BASIC_SEED,
1998
- # precision=0,
1999
- # info="-1 means using a random seed each time",
2000
- # )
2001
-
2002
- with gr.Accordion("Advanced Parameters", open=False):
2003
- validation_num_timesteps = gr.Slider(
2004
- minimum=1,
2005
- maximum=100,
2006
- step=1,
2007
- value=DEFAULT_TIMESTEPS,
2008
- label="Validation Num Timesteps",
2009
- )
2010
- with gr.Row():
2011
- validation_timestep_shift = gr.Number(
2012
- label="Validation Timestep Shift",
2013
- value=DEFAULT_TIMESTEP_SHIFT,
2014
- )
2015
- cfg_text_scale = gr.Number(
2016
- label="CFG Text Scale",
2017
- value=DEFAULT_CFG_TEXT_SCALE,
2018
- )
2019
-
2020
- generation_example_inputs = [
2021
- prompt,
2022
- input_video,
2023
- input_image,
2024
- ]
2025
-
2026
- with gr.Column(scale=1, elem_classes=["lance-main-column"]):
2027
- output_video = gr.Video(label="Output Video", elem_classes=["lance-display-frame"])
2028
- output_image = gr.Image(label="Output Image", type="filepath", visible=False, elem_classes=["lance-display-frame"])
2029
- output_text = gr.Textbox(label="Output Text", lines=8, visible=False, elem_classes=["lance-display-frame"])
2030
- status = gr.Markdown("WAITING TO RUN.")
2031
- logs = gr.Textbox(label="Run Logs", lines=22, max_lines=30)
2032
-
2033
- run_button = gr.Button("🚀 Generate", variant="primary", elem_classes=["lance-run-button"])
2034
-
2035
- with gr.Group(visible=True, elem_classes=["prompt-examples", "example-panel"]) as video_generation_examples_group:
2036
- gr.Markdown("### Video generation recommended cases", elem_classes=["recommended-title"])
2037
- video_generation_examples = gr.Dataset(
2038
- samples=VIDEO_GENERATION_EXAMPLES,
2039
- components=[gr.Textbox(label="Prompt", visible=False)],
2040
- headers=["Prompt"],
2041
- show_label=False,
2042
- type="values",
2043
- layout="table",
2044
- samples_per_page=len(VIDEO_GENERATION_EXAMPLES),
2045
- elem_classes=["prompt-dataset"],
2046
- )
2047
-
2048
- with gr.Group(visible=False, elem_classes=["example-panel"]) as video_edit_examples_group:
2049
- gr.Markdown("### Video edit recommended cases", elem_classes=["recommended-title"])
2050
- video_edit_examples = gr.Examples(
2051
- examples=VIDEO_EDIT_EXAMPLES,
2052
- inputs=generation_example_inputs,
2053
- label="",
2054
- examples_per_page=3,
2055
- cache_examples=False,
2056
- preprocess=False,
2057
- postprocess=False,
2058
- )
2059
-
2060
- with gr.Group(visible=False, elem_classes=["example-panel"]) as video_understanding_examples_group:
2061
- gr.Markdown("### Video understanding recommended cases", elem_classes=["recommended-title"])
2062
- video_understanding_examples = gr.Examples(
2063
- examples=VIDEO_UNDERSTANDING_EXAMPLES,
2064
- inputs=generation_example_inputs,
2065
- label="",
2066
- examples_per_page=4,
2067
- cache_examples=False,
2068
- preprocess=False,
2069
- postprocess=False,
2070
- )
2071
-
2072
- with gr.Group(visible=False, elem_classes=["prompt-examples", "example-panel"]) as image_generation_examples_group:
2073
- gr.Markdown("### Image generation recommended cases", elem_classes=["recommended-title"])
2074
- image_generation_examples = gr.Dataset(
2075
- samples=IMAGE_GENERATION_EXAMPLES,
2076
- components=[gr.Textbox(label="Prompt", visible=False)],
2077
- headers=["Prompt"],
2078
- show_label=False,
2079
- type="values",
2080
- layout="table",
2081
- samples_per_page=len(IMAGE_GENERATION_EXAMPLES),
2082
- elem_classes=["prompt-dataset"],
2083
- )
2084
-
2085
- with gr.Group(visible=False, elem_classes=["example-panel"]) as image_edit_examples_group:
2086
- gr.Markdown("### Image edit recommended cases", elem_classes=["recommended-title"])
2087
- image_edit_examples = gr.Examples(
2088
- examples=IMAGE_EDIT_EXAMPLES,
2089
- inputs=generation_example_inputs,
2090
- label="",
2091
- examples_per_page=5,
2092
- cache_examples=False,
2093
- preprocess=False,
2094
- postprocess=False,
2095
- )
2096
-
2097
- with gr.Group(visible=False, elem_classes=["example-panel"]) as image_understanding_examples_group:
2098
- gr.Markdown("### Image understanding recommended cases", elem_classes=["recommended-title"])
2099
- image_understanding_examples = gr.Examples(
2100
- examples=IMAGE_UNDERSTANDING_EXAMPLES,
2101
- inputs=generation_example_inputs,
2102
- label="",
2103
- examples_per_page=4,
2104
- cache_examples=False,
2105
- preprocess=False,
2106
- postprocess=False,
2107
- )
2108
-
2109
- keep_example_clicks_from_changing_visibility(
2110
- video_generation_examples,
2111
- video_edit_examples,
2112
- video_understanding_examples,
2113
- image_generation_examples,
2114
- image_edit_examples,
2115
- image_understanding_examples,
2116
- )
2117
-
2118
- task.change(
2119
- fn=update_task_ui,
2120
- inputs=[task],
2121
- outputs=[
2122
- prompt,
2123
- system_prompt,
2124
- input_video,
2125
- input_image,
2126
- aspect_ratio,
2127
- height,
2128
- width,
2129
- real_size,
2130
- num_frames,
2131
- enable_frame_interpolation,
2132
- resolution,
2133
- output_video,
2134
- output_image,
2135
- output_text,
2136
- video_generation_examples_group,
2137
- video_edit_examples_group,
2138
- video_understanding_examples_group,
2139
- image_generation_examples_group,
2140
- image_edit_examples_group,
2141
- image_understanding_examples_group,
2142
- ],
2143
- )
2144
-
2145
- aspect_ratio.change(
2146
- fn=update_size_from_aspect_ratio,
2147
- inputs=[task, aspect_ratio],
2148
- outputs=[height, width, real_size],
2149
- queue=False,
2150
- show_api=False,
2151
- )
2152
-
2153
- for examples_component in (video_edit_examples, video_understanding_examples, image_edit_examples, image_understanding_examples):
2154
- examples_component.load_input_event.then(
2155
- fn=reset_generation_defaults_for_task,
2156
- inputs=[task],
2157
- outputs=[aspect_ratio, height, width, num_frames, resolution, real_size],
2158
- queue=False,
2159
- show_api=False,
2160
- )
2161
-
2162
- video_generation_examples.select(
2163
- fn=apply_prompt_example,
2164
- inputs=[task],
2165
- outputs=[prompt, aspect_ratio, height, width, num_frames, resolution, real_size],
2166
- queue=False,
2167
- show_api=False,
2168
- )
2169
- image_generation_examples.select(
2170
- fn=apply_prompt_example,
2171
- inputs=[task],
2172
- outputs=[prompt, aspect_ratio, height, width, num_frames, resolution, real_size],
2173
- queue=False,
2174
- show_api=False,
2175
- )
2176
-
2177
- run_button.click(
2178
- fn=run_task,
2179
- inputs=[
2180
- task,
2181
- prompt,
2182
- system_prompt,
2183
- input_video,
2184
- input_image,
2185
- height,
2186
- width,
2187
- num_frames,
2188
- seed,
2189
- resolution,
2190
- validation_num_timesteps,
2191
- validation_timestep_shift,
2192
- cfg_text_scale,
2193
- enable_frame_interpolation,
2194
- ],
2195
- outputs=[output_video, output_image, output_text, status, logs],
2196
- )
2197
-
2198
- return demo
2199
-
2200
-
2201
- def parse_args() -> argparse.Namespace:
2202
- parser = argparse.ArgumentParser(description="Lance multimodal Gradio")
2203
- parser.add_argument("--server-name", default=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"))
2204
- parser.add_argument("--server-port", type=int, default=int(os.getenv("GRADIO_SERVER_PORT", "7860")))
2205
- parser.add_argument("--share", action="store_true", default=env_flag("GRADIO_SHARE", False))
2206
- parser.add_argument(
2207
- "--gpus",
2208
- default=os.getenv("LANCE_GPUS", DEFAULT_GPUS),
2209
- help="Comma-separated GPU list, for example: 0,1,2,3,4,5,6",
2210
- )
2211
- parser.add_argument(
2212
- "--queue-size",
2213
- type=int,
2214
- default=int(os.getenv("LANCE_QUEUE_SIZE", str(DEFAULT_QUEUE_SIZE))),
2215
- help="Maximum number of queued Gradio requests.",
2216
- )
2217
- return parser.parse_args()
2218
-
2219
-
2220
- def parse_gpu_ids(gpu_string: str) -> list[int]:
2221
- gpu_ids: list[int] = []
2222
- for item in gpu_string.split(","):
2223
- item = item.strip()
2224
- if not item:
2225
- continue
2226
- gpu_ids.append(int(item))
2227
- if not gpu_ids:
2228
- raise ValueError("No valid GPU IDs were parsed.")
2229
- return gpu_ids
2230
-
2231
-
2232
- if __name__ == "__main__":
2233
- args = parse_args()
2234
- os.environ["LANCE_GPUS"] = args.gpus
2235
- QUEUE_MAX_SIZE = args.queue_size
2236
- gpu_ids = parse_gpu_ids(args.gpus)
2237
- preload_pipeline_pools(gpu_ids, PRELOAD_MODEL_VARIANTS)
2238
- default_concurrency_limit = max(1, len(gpu_ids))
2239
- demo = build_demo()
2240
- demo.queue(
2241
- max_size=args.queue_size,
2242
- default_concurrency_limit=default_concurrency_limit,
2243
- ).launch(
2244
- server_name=args.server_name,
2245
- server_port=args.server_port,
2246
- share=args.share,
2247
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
assets/video-understanding/videos/video-understanding-caption-long-01_h264.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7387de84940c96d7ed5e50cd0ee78de3e1b5062903466cb0861f497bd95efc52
3
+ size 679220
assets/video-understanding/videos/video-understanding-caption-short-01_h264.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62a8ca1c0f50dc0ba08ed33814031494b7f6eac9fc889f241b1a52789cff8eed
3
+ size 381609
assets/video-understanding/videos/video-understanding-vqa-01_h264.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b7185e2b75fa656f45b439a01064ee0ac411057449079da4d36fd08306f2dad
3
+ size 284350
config/examples/video_edit_examples/edit_source_car_h264.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e436c2954f3a19be39248ba48c7b98edffbe0f3b7eeaeb3c44d8168e722d433d
3
+ size 220126
config/examples/video_edit_examples/edit_source_woman_h264.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e665d11d1b6b0a45aa44cb930fdc4ea125f67ea692b0882e5fa3e9b282b1b4ba
3
+ size 56974