Spaces:
Running on Zero
Running on Zero
Vendor RIFE into repo
Browse files- app.py +155 -78
- inference_lance.py +36 -35
app.py
CHANGED
|
@@ -21,6 +21,8 @@ from datetime import datetime
|
|
| 21 |
from pathlib import Path
|
| 22 |
from typing import Optional
|
| 23 |
|
|
|
|
|
|
|
| 24 |
try:
|
| 25 |
import spaces
|
| 26 |
except ImportError: # pragma: no cover - keeps local CPU runs working
|
|
@@ -1556,29 +1558,21 @@ UNDERSTANDING_TASKS = {TASK_X2T_VIDEO, TASK_X2T_IMAGE}
|
|
| 1556 |
IMAGE_TASKS = {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
|
| 1557 |
VIDEO_TASKS = {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
|
| 1558 |
EDIT_TASKS = {TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
|
| 1559 |
-
VIDEO_RESOLUTION_CHOICES = [
|
| 1560 |
-
VIDEO_RESOLUTION_DISPLAY_CHOICES = [
|
| 1561 |
-
("video_360p", "video_360p"),
|
| 1562 |
-
("video_480p(Higher quota usage. Use sparingly.)", "video_480p"),
|
| 1563 |
-
]
|
| 1564 |
VIDEO_EDIT_RESOLUTION_CHOICES = [DEFAULT_VIDEO_EDIT_RESOLUTION]
|
| 1565 |
IMAGE_RESOLUTION_CHOICES = [DEFAULT_IMAGE_RESOLUTION]
|
| 1566 |
RESOLUTION_CHOICES = VIDEO_RESOLUTION_CHOICES + IMAGE_RESOLUTION_CHOICES
|
| 1567 |
-
|
| 1568 |
-
"Describe the key features of the input {vision_type}, including color, shape, size, texture, objects, background."
|
| 1569 |
-
)
|
| 1570 |
-
V2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="video")
|
| 1571 |
-
I2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="image")
|
| 1572 |
V2T_QA_SYSTEM_PROMPT = "View the video attentively and provide a suitable answer to the posed question."
|
| 1573 |
I2T_QA_SYSTEM_PROMPT = "View the image attentively and provide a suitable answer to the posed question."
|
| 1574 |
-
|
| 1575 |
-
|
| 1576 |
def get_aspect_ratio_choices_for_task(task: str) -> list[tuple[str, str]]:
|
| 1577 |
"""Get Aspect Ratio choices with default/recommended marker for the given task."""
|
| 1578 |
internal_task = normalize_task(task)
|
| 1579 |
default_ratio = DEFAULT_IMAGE_ASPECT_RATIO if internal_task in IMAGE_TASKS else DEFAULT_VIDEO_ASPECT_RATIO
|
| 1580 |
return [
|
| 1581 |
-
(f"{ratio}" if ratio == default_ratio else ratio, ratio)
|
| 1582 |
for ratio in ASPECT_RATIO_CHOICES
|
| 1583 |
]
|
| 1584 |
|
|
@@ -2817,6 +2811,8 @@ class LanceT2VV2TPipeline:
|
|
| 2817 |
data_args=request_data_args,
|
| 2818 |
inference_args=request_inference_args,
|
| 2819 |
)
|
|
|
|
|
|
|
| 2820 |
generate_start = time.perf_counter()
|
| 2821 |
validate_on_fixed_batch(
|
| 2822 |
fsdp_model=self.model,
|
|
@@ -3047,7 +3043,13 @@ class PipelinePool:
|
|
| 3047 |
def gpu_summary(self) -> str:
|
| 3048 |
return ",".join(str(gpu_id) for gpu_id in self.gpu_ids)
|
| 3049 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3050 |
def initialize_all(self) -> None:
|
|
|
|
|
|
|
| 3051 |
print(f"[startup][{self.model_variant}] Preparing parallel GPU preload: {self.gpu_ids}", flush=True)
|
| 3052 |
exceptions: list[Exception] = []
|
| 3053 |
with concurrent.futures.ThreadPoolExecutor(max_workers=self.size) as executor:
|
|
@@ -3135,6 +3137,7 @@ class PipelinePool:
|
|
| 3135 |
self.release(pipeline)
|
| 3136 |
|
| 3137 |
|
|
|
|
| 3138 |
ACTIVE_PIPELINE_POOL: Optional[PipelinePool] = None
|
| 3139 |
ACTIVE_POOL_LOCK = threading.Lock()
|
| 3140 |
QUEUE_MAX_SIZE = DEFAULT_QUEUE_SIZE
|
|
@@ -3209,9 +3212,53 @@ def clamp_zerogpu_duration(seconds: int) -> int:
|
|
| 3209 |
|
| 3210 |
ZERO_GPU_RUN_TASK_DURATION_SECONDS = get_zerogpu_duration_cap()
|
| 3211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3212 |
def is_pipeline_pool_ready_for_task(task: str) -> bool:
|
| 3213 |
-
|
| 3214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3215 |
|
| 3216 |
|
| 3217 |
def finalize_zerogpu_duration(estimated_seconds: float, task: str) -> int:
|
|
@@ -3258,30 +3305,37 @@ def _estimate_zerogpu_duration_seconds(
|
|
| 3258 |
prompt_length = len((prompt or "").strip())
|
| 3259 |
has_video_input = bool((input_video or "").strip())
|
| 3260 |
has_image_input = bool((input_image or "").strip())
|
|
|
|
| 3261 |
is_video_task = internal_task in {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
|
| 3262 |
is_image_task = internal_task in {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
|
| 3263 |
|
| 3264 |
if internal_task == TASK_T2I:
|
| 3265 |
-
return 150
|
| 3266 |
|
| 3267 |
if internal_task == TASK_IMAGE_EDIT:
|
| 3268 |
-
return 150
|
| 3269 |
|
| 3270 |
if internal_task == TASK_X2T_IMAGE:
|
| 3271 |
-
return 150
|
| 3272 |
|
| 3273 |
if internal_task == TASK_X2T_VIDEO:
|
| 3274 |
-
return 200
|
| 3275 |
|
| 3276 |
if internal_task == TASK_VIDEO_EDIT:
|
| 3277 |
-
base = 300
|
| 3278 |
-
base += min(48, max(0, num_frames - 37) //
|
| 3279 |
-
base +=
|
| 3280 |
-
base +=
|
| 3281 |
-
base +=
|
| 3282 |
return base
|
| 3283 |
|
| 3284 |
if internal_task == TASK_T2V:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3285 |
base = 224 if resolution == "video_360p" else 264
|
| 3286 |
base += min(56, max(0, num_frames - 37) // 2)
|
| 3287 |
base += 28 if enable_frame_interpolation else 0
|
|
@@ -3289,13 +3343,13 @@ def _estimate_zerogpu_duration_seconds(
|
|
| 3289 |
return base
|
| 3290 |
|
| 3291 |
if is_video_task:
|
| 3292 |
-
base = 240
|
| 3293 |
-
base += min(40, max(0, num_frames - 37) //
|
| 3294 |
-
base +=
|
| 3295 |
return base
|
| 3296 |
|
| 3297 |
if is_image_task:
|
| 3298 |
-
return 120
|
| 3299 |
|
| 3300 |
return 160
|
| 3301 |
|
|
@@ -3335,34 +3389,6 @@ def get_run_task_gpu_duration(
|
|
| 3335 |
return finalize_zerogpu_duration(estimated_seconds, task)
|
| 3336 |
|
| 3337 |
|
| 3338 |
-
def get_pipeline_pool(task: str) -> PipelinePool:
|
| 3339 |
-
global ACTIVE_PIPELINE_POOL
|
| 3340 |
-
if not torch.cuda.is_available():
|
| 3341 |
-
raise RuntimeError(
|
| 3342 |
-
"Lance inference requires a GPU. The Gradio UI can start on CPU, but generation is disabled "
|
| 3343 |
-
"until GPU hardware is attached."
|
| 3344 |
-
)
|
| 3345 |
-
model_variant = get_task_model_variant(task)
|
| 3346 |
-
with ACTIVE_POOL_LOCK:
|
| 3347 |
-
if ACTIVE_PIPELINE_POOL is not None and ACTIVE_PIPELINE_POOL.model_variant == model_variant:
|
| 3348 |
-
return ACTIVE_PIPELINE_POOL
|
| 3349 |
-
|
| 3350 |
-
gpu_ids = parse_gpu_ids(os.getenv("LANCE_GPUS", DEFAULT_GPUS))
|
| 3351 |
-
if ACTIVE_PIPELINE_POOL is not None:
|
| 3352 |
-
previous_variant = ACTIVE_PIPELINE_POOL.model_variant
|
| 3353 |
-
print(
|
| 3354 |
-
f"[runtime] Switching Lance model from {previous_variant} to {model_variant}.",
|
| 3355 |
-
flush=True,
|
| 3356 |
-
)
|
| 3357 |
-
ACTIVE_PIPELINE_POOL.unload_all()
|
| 3358 |
-
ACTIVE_PIPELINE_POOL = None
|
| 3359 |
-
|
| 3360 |
-
ACTIVE_PIPELINE_POOL = PipelinePool(gpu_ids, model_variant=model_variant)
|
| 3361 |
-
ACTIVE_PIPELINE_POOL.initialize_all()
|
| 3362 |
-
return ACTIVE_PIPELINE_POOL
|
| 3363 |
-
|
| 3364 |
-
|
| 3365 |
-
@spaces.GPU(size="large", duration=get_run_task_gpu_duration)
|
| 3366 |
def run_task(
|
| 3367 |
task: str,
|
| 3368 |
prompt: str,
|
|
@@ -3380,8 +3406,55 @@ def run_task(
|
|
| 3380 |
enable_frame_interpolation: bool,
|
| 3381 |
):
|
| 3382 |
internal_task = normalize_task(task)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3383 |
if internal_task == TASK_T2V:
|
| 3384 |
num_frames = video_seconds_to_num_frames(num_frames)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3385 |
pipeline_pool = get_pipeline_pool(task)
|
| 3386 |
return pipeline_pool.generate(
|
| 3387 |
task=task,
|
|
@@ -3405,14 +3478,18 @@ def build_status_markdown() -> str:
|
|
| 3405 |
gpu_text = "unknown"
|
| 3406 |
concurrency = 1
|
| 3407 |
active_variant = "none"
|
|
|
|
| 3408 |
if ACTIVE_PIPELINE_POOL is not None:
|
| 3409 |
active_variant = ACTIVE_PIPELINE_POOL.model_variant
|
| 3410 |
gpu_text = ACTIVE_PIPELINE_POOL.gpu_summary
|
| 3411 |
concurrency = ACTIVE_PIPELINE_POOL.size
|
|
|
|
|
|
|
|
|
|
| 3412 |
return (
|
| 3413 |
f"**Status** GPU: `{gpu_text}` | Max concurrency: `{concurrency}` | "
|
| 3414 |
f"Queue limit: `{QUEUE_MAX_SIZE}` | Active model: `{active_variant}` | "
|
| 3415 |
-
f"
|
| 3416 |
)
|
| 3417 |
|
| 3418 |
|
|
@@ -3604,6 +3681,17 @@ def build_demo() -> gr.Blocks:
|
|
| 3604 |
value=DEFAULT_FRAME_INTERPOLATION if RIFE_AVAILABLE else FRAME_INTERPOLATION_NO,
|
| 3605 |
elem_classes=["generation-control", "generation-two-line-label"],
|
| 3606 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3607 |
with gr.Row(elem_classes=["generation-controls-row", "aspect-ratio-row"]) as aspect_ratio_row:
|
| 3608 |
with gr.Column(elem_classes=["lance-control-field"]):
|
| 3609 |
gr.HTML('<div class="lance-generation-label">Aspect Ratio</div>', elem_classes=["lance-label-html"])
|
|
@@ -3615,6 +3703,16 @@ def build_demo() -> gr.Blocks:
|
|
| 3615 |
value=DEFAULT_VIDEO_ASPECT_RATIO,
|
| 3616 |
elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
|
| 3617 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3618 |
with gr.Row(elem_classes=["generation-controls-row", "output-resolution-row"], visible=False) as output_resolution_row:
|
| 3619 |
with gr.Column(elem_classes=["lance-control-field"]):
|
| 3620 |
gr.HTML('<div class="lance-generation-label">Output Resolution</div>', elem_classes=["lance-label-html"])
|
|
@@ -3627,27 +3725,6 @@ def build_demo() -> gr.Blocks:
|
|
| 3627 |
visible=False,
|
| 3628 |
elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
|
| 3629 |
)
|
| 3630 |
-
with gr.Row(elem_classes=["generation-controls-row", "video-duration-row"]) as video_duration_row:
|
| 3631 |
-
with gr.Column(elem_classes=["lance-control-field"]):
|
| 3632 |
-
gr.HTML(build_lance_label_html("Video Duration (seconds)", "lance-generation-label"), elem_classes=["lance-label-html"])
|
| 3633 |
-
num_frames = gr.Radio(
|
| 3634 |
-
label="Video Duration (seconds)",
|
| 3635 |
-
show_label=False,
|
| 3636 |
-
choices=get_video_duration_choices(),
|
| 3637 |
-
value=DEFAULT_VIDEO_DURATION_SECONDS,
|
| 3638 |
-
elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
|
| 3639 |
-
)
|
| 3640 |
-
with gr.Row(elem_classes=["generation-controls-row", "video-resolution-row"]) as video_resolution_row:
|
| 3641 |
-
with gr.Column(elem_classes=["lance-control-field"]):
|
| 3642 |
-
gr.HTML(build_lance_label_html("Video Resolution", "lance-generation-label"), elem_classes=["lance-label-html"])
|
| 3643 |
-
resolution = gr.Dropdown(
|
| 3644 |
-
label="Video Resolution",
|
| 3645 |
-
show_label=False,
|
| 3646 |
-
choices=VIDEO_RESOLUTION_DISPLAY_CHOICES,
|
| 3647 |
-
value=DEFAULT_RESOLUTION,
|
| 3648 |
-
allow_custom_value=True,
|
| 3649 |
-
elem_classes=["generation-control"],
|
| 3650 |
-
)
|
| 3651 |
height = gr.Number(value=DEFAULT_HEIGHT, precision=0, visible=False)
|
| 3652 |
width = gr.Number(value=DEFAULT_WIDTH, precision=0, visible=False)
|
| 3653 |
|
|
|
|
| 21 |
from pathlib import Path
|
| 22 |
from typing import Optional
|
| 23 |
|
| 24 |
+
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:128")
|
| 25 |
+
|
| 26 |
try:
|
| 27 |
import spaces
|
| 28 |
except ImportError: # pragma: no cover - keeps local CPU runs working
|
|
|
|
| 1558 |
IMAGE_TASKS = {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
|
| 1559 |
VIDEO_TASKS = {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
|
| 1560 |
EDIT_TASKS = {TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
|
| 1561 |
+
VIDEO_RESOLUTION_CHOICES = [DEFAULT_RESOLUTION]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1562 |
VIDEO_EDIT_RESOLUTION_CHOICES = [DEFAULT_VIDEO_EDIT_RESOLUTION]
|
| 1563 |
IMAGE_RESOLUTION_CHOICES = [DEFAULT_IMAGE_RESOLUTION]
|
| 1564 |
RESOLUTION_CHOICES = VIDEO_RESOLUTION_CHOICES + IMAGE_RESOLUTION_CHOICES
|
| 1565 |
+
VIDEO_RESOLUTION_DISPLAY_CHOICES = [("360p", "video_360p"), ("480p", "video_480p")]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1566 |
V2T_QA_SYSTEM_PROMPT = "View the video attentively and provide a suitable answer to the posed question."
|
| 1567 |
I2T_QA_SYSTEM_PROMPT = "View the image attentively and provide a suitable answer to the posed question."
|
| 1568 |
+
|
| 1569 |
+
|
| 1570 |
def get_aspect_ratio_choices_for_task(task: str) -> list[tuple[str, str]]:
    """Build the ``(label, value)`` aspect-ratio choices for ``task``.

    The ratio matching the task family's default (image default for tasks in
    ``IMAGE_TASKS``, video default otherwise) has " (default)" appended to its
    label. The value half of every pair is always the unmodified ratio.
    """
    if normalize_task(task) in IMAGE_TASKS:
        preferred_ratio = DEFAULT_IMAGE_ASPECT_RATIO
    else:
        preferred_ratio = DEFAULT_VIDEO_ASPECT_RATIO

    choices: list[tuple[str, str]] = []
    for ratio in ASPECT_RATIO_CHOICES:
        # Only the label is decorated; the backend value stays the raw ratio.
        label = f"{ratio} (default)" if ratio == preferred_ratio else ratio
        choices.append((label, ratio))
    return choices
|
| 1578 |
|
|
|
|
| 2811 |
data_args=request_data_args,
|
| 2812 |
inference_args=request_inference_args,
|
| 2813 |
)
|
| 2814 |
+
# Keep the allocator from fragmenting before the heavy forward pass.
|
| 2815 |
+
clean_memory()
|
| 2816 |
generate_start = time.perf_counter()
|
| 2817 |
validate_on_fixed_batch(
|
| 2818 |
fsdp_model=self.model,
|
|
|
|
| 3043 |
def gpu_summary(self) -> str:
|
| 3044 |
return ",".join(str(gpu_id) for gpu_id in self.gpu_ids)
|
| 3045 |
|
| 3046 |
+
@property
def is_initialized(self) -> bool:
    """Whether every pipeline in ``self.pipelines`` reports ``initialized``.

    An empty ``self.pipelines`` yields ``True`` (nothing is left uninitialized).
    """
    for pipeline in self.pipelines:
        if not pipeline.initialized:
            return False
    return True
|
| 3049 |
+
|
| 3050 |
def initialize_all(self) -> None:
|
| 3051 |
+
if self.is_initialized:
|
| 3052 |
+
return
|
| 3053 |
print(f"[startup][{self.model_variant}] Preparing parallel GPU preload: {self.gpu_ids}", flush=True)
|
| 3054 |
exceptions: list[Exception] = []
|
| 3055 |
with concurrent.futures.ThreadPoolExecutor(max_workers=self.size) as executor:
|
|
|
|
| 3137 |
self.release(pipeline)
|
| 3138 |
|
| 3139 |
|
| 3140 |
+
PIPELINE_POOLS: dict[str, PipelinePool] = {}
|
| 3141 |
ACTIVE_PIPELINE_POOL: Optional[PipelinePool] = None
|
| 3142 |
ACTIVE_POOL_LOCK = threading.Lock()
|
| 3143 |
QUEUE_MAX_SIZE = DEFAULT_QUEUE_SIZE
|
|
|
|
| 3212 |
|
| 3213 |
ZERO_GPU_RUN_TASK_DURATION_SECONDS = get_zerogpu_duration_cap()
|
| 3214 |
|
| 3215 |
+
|
| 3216 |
+
def get_other_model_variant(model_variant: str) -> str:
    """Return the counterpart of ``model_variant``.

    The input is normalized first. The video variant maps to the image
    variant; every other normalized value maps to the video variant.
    """
    if normalize_model_variant(model_variant) == MODEL_VARIANT_VIDEO:
        return MODEL_VARIANT_IMAGE
    return MODEL_VARIANT_VIDEO
|
| 3219 |
+
|
| 3220 |
+
|
| 3221 |
+
def is_pipeline_pool_ready_for_variant(model_variant: str) -> bool:
    """Report whether an initialized pool is cached for ``model_variant``.

    Looks up the normalized variant in ``PIPELINE_POOLS`` while holding
    ``ACTIVE_POOL_LOCK``. Returns ``False`` when no pool is cached or when the
    cached pool's ``is_initialized`` is falsy.
    """
    variant_key = normalize_model_variant(model_variant)
    with ACTIVE_POOL_LOCK:
        cached_pool = PIPELINE_POOLS.get(variant_key)
        if cached_pool is None:
            return False
        return bool(cached_pool.is_initialized)
|
| 3226 |
+
|
| 3227 |
+
|
| 3228 |
def is_pipeline_pool_ready_for_task(task: str) -> bool:
    """Report whether the pool for the model variant serving ``task`` is ready."""
    variant_for_task = get_task_model_variant(task)
    return is_pipeline_pool_ready_for_variant(variant_for_task)
|
| 3230 |
+
|
| 3231 |
+
|
| 3232 |
+
def get_or_create_pipeline_pool(model_variant: str) -> PipelinePool:
    """Return the cached pool for ``model_variant``, creating it if absent.

    The pool is only constructed and registered in ``PIPELINE_POOLS`` here;
    ``initialize_all`` is not called by this function.

    Raises:
        RuntimeError: if ``torch.cuda.is_available()`` is false.
    """
    if not torch.cuda.is_available():
        raise RuntimeError(
            "Lance inference requires a GPU. The Gradio UI can start on CPU, but generation is disabled "
            "until GPU hardware is attached."
        )

    variant_key = normalize_model_variant(model_variant)
    # GPU ids are parsed on every call, before the lock is taken.
    requested_gpu_ids = parse_gpu_ids(os.getenv("LANCE_GPUS", DEFAULT_GPUS))

    with ACTIVE_POOL_LOCK:
        existing_pool = PIPELINE_POOLS.get(variant_key)
        if existing_pool is not None:
            return existing_pool
        new_pool = PipelinePool(requested_gpu_ids, model_variant=variant_key)
        PIPELINE_POOLS[variant_key] = new_pool
        return new_pool
|
| 3246 |
+
|
| 3247 |
+
|
| 3248 |
+
def ensure_pipeline_pool_ready(model_variant: str) -> PipelinePool:
    """Return the pool for ``model_variant``, initializing it when needed.

    The pool is obtained from ``get_or_create_pipeline_pool``;
    ``initialize_all`` is invoked only when ``is_initialized`` is falsy.
    """
    variant_pool = get_or_create_pipeline_pool(model_variant)
    if variant_pool.is_initialized:
        return variant_pool
    variant_pool.initialize_all()
    return variant_pool
|
| 3253 |
+
|
| 3254 |
+
|
| 3255 |
+
def get_pipeline_pool(task: str) -> PipelinePool:
    """Return the ready pool that serves ``task`` and record it as active.

    Resolves the task's model variant, ensures its pool is ready, then stores
    that pool in the module-level ``ACTIVE_PIPELINE_POOL`` under
    ``ACTIVE_POOL_LOCK``.
    """
    global ACTIVE_PIPELINE_POOL
    ready_pool = ensure_pipeline_pool_ready(get_task_model_variant(task))
    with ACTIVE_POOL_LOCK:
        ACTIVE_PIPELINE_POOL = ready_pool
    return ready_pool
|
| 3262 |
|
| 3263 |
|
| 3264 |
def finalize_zerogpu_duration(estimated_seconds: float, task: str) -> int:
|
|
|
|
| 3305 |
prompt_length = len((prompt or "").strip())
|
| 3306 |
has_video_input = bool((input_video or "").strip())
|
| 3307 |
has_image_input = bool((input_image or "").strip())
|
| 3308 |
+
pool_ready = is_pipeline_pool_ready_for_task(internal_task)
|
| 3309 |
is_video_task = internal_task in {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
|
| 3310 |
is_image_task = internal_task in {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
|
| 3311 |
|
| 3312 |
if internal_task == TASK_T2I:
|
| 3313 |
+
return 90 if pool_ready else 150
|
| 3314 |
|
| 3315 |
if internal_task == TASK_IMAGE_EDIT:
|
| 3316 |
+
return 100 if pool_ready else 150
|
| 3317 |
|
| 3318 |
if internal_task == TASK_X2T_IMAGE:
|
| 3319 |
+
return 90 if pool_ready else 150
|
| 3320 |
|
| 3321 |
if internal_task == TASK_X2T_VIDEO:
|
| 3322 |
+
return 120 if pool_ready else 200
|
| 3323 |
|
| 3324 |
if internal_task == TASK_VIDEO_EDIT:
|
| 3325 |
+
base = 170 if pool_ready else 300
|
| 3326 |
+
base += min(30 if pool_ready else 48, max(0, num_frames - 37) // 3)
|
| 3327 |
+
base += 24 if enable_frame_interpolation else 0
|
| 3328 |
+
base += 16 if has_video_input else 0
|
| 3329 |
+
base += 10 if resolution == "video_480p" else 0
|
| 3330 |
return base
|
| 3331 |
|
| 3332 |
if internal_task == TASK_T2V:
|
| 3333 |
+
if pool_ready:
|
| 3334 |
+
base = 130 if resolution == "video_360p" else 150
|
| 3335 |
+
base += min(36, max(0, num_frames - 37) // 3)
|
| 3336 |
+
base += 18 if enable_frame_interpolation else 0
|
| 3337 |
+
base += min(12, prompt_length // 320)
|
| 3338 |
+
return base
|
| 3339 |
base = 224 if resolution == "video_360p" else 264
|
| 3340 |
base += min(56, max(0, num_frames - 37) // 2)
|
| 3341 |
base += 28 if enable_frame_interpolation else 0
|
|
|
|
| 3343 |
return base
|
| 3344 |
|
| 3345 |
if is_video_task:
|
| 3346 |
+
base = 150 if pool_ready else 240
|
| 3347 |
+
base += min(28 if pool_ready else 40, max(0, num_frames - 37) // 3)
|
| 3348 |
+
base += 18 if enable_frame_interpolation else 0
|
| 3349 |
return base
|
| 3350 |
|
| 3351 |
if is_image_task:
|
| 3352 |
+
return 100 if pool_ready else 120
|
| 3353 |
|
| 3354 |
return 160
|
| 3355 |
|
|
|
|
| 3389 |
return finalize_zerogpu_duration(estimated_seconds, task)
|
| 3390 |
|
| 3391 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3392 |
def run_task(
|
| 3393 |
task: str,
|
| 3394 |
prompt: str,
|
|
|
|
| 3406 |
enable_frame_interpolation: bool,
|
| 3407 |
):
|
| 3408 |
internal_task = normalize_task(task)
|
| 3409 |
+
if internal_task in UNDERSTANDING_TASKS and not prompt:
|
| 3410 |
+
return None, None, "", "Please enter a question.", ""
|
| 3411 |
+
if internal_task in {TASK_VIDEO_EDIT, TASK_X2T_VIDEO} and not input_video:
|
| 3412 |
+
return None, None, "", "Please upload an input video.", ""
|
| 3413 |
+
if internal_task in {TASK_IMAGE_EDIT, TASK_X2T_IMAGE} and not input_image:
|
| 3414 |
+
return None, None, "", "Please upload an input image.", ""
|
| 3415 |
+
if height <= 0 or width <= 0:
|
| 3416 |
+
return None, None, "", "Height and width must be greater than 0.", ""
|
| 3417 |
+
if num_frames <= 0:
|
| 3418 |
+
return None, None, "", "The number of frames must be greater than 0.", ""
|
| 3419 |
+
|
| 3420 |
if internal_task == TASK_T2V:
|
| 3421 |
num_frames = video_seconds_to_num_frames(num_frames)
|
| 3422 |
+
normalized_resolution = normalize_resolution_for_backend(str(resolution), internal_task)
|
| 3423 |
+
return run_task_gpu(
|
| 3424 |
+
task=task,
|
| 3425 |
+
prompt=prompt,
|
| 3426 |
+
system_prompt=system_prompt,
|
| 3427 |
+
input_video=input_video,
|
| 3428 |
+
input_image=input_image,
|
| 3429 |
+
height=height,
|
| 3430 |
+
width=width,
|
| 3431 |
+
num_frames=num_frames,
|
| 3432 |
+
seed=seed,
|
| 3433 |
+
resolution=normalized_resolution,
|
| 3434 |
+
validation_num_timesteps=validation_num_timesteps,
|
| 3435 |
+
validation_timestep_shift=validation_timestep_shift,
|
| 3436 |
+
cfg_text_scale=cfg_text_scale,
|
| 3437 |
+
enable_frame_interpolation=enable_frame_interpolation,
|
| 3438 |
+
)
|
| 3439 |
+
|
| 3440 |
+
|
| 3441 |
+
@spaces.GPU(size="large", duration=get_run_task_gpu_duration)
|
| 3442 |
+
def run_task_gpu(
|
| 3443 |
+
task: str,
|
| 3444 |
+
prompt: str,
|
| 3445 |
+
system_prompt: Optional[str],
|
| 3446 |
+
input_video: Optional[str],
|
| 3447 |
+
input_image: Optional[str],
|
| 3448 |
+
height: int,
|
| 3449 |
+
width: int,
|
| 3450 |
+
num_frames: int,
|
| 3451 |
+
seed: int,
|
| 3452 |
+
resolution: str,
|
| 3453 |
+
validation_num_timesteps: int,
|
| 3454 |
+
validation_timestep_shift: float,
|
| 3455 |
+
cfg_text_scale: float,
|
| 3456 |
+
enable_frame_interpolation: bool,
|
| 3457 |
+
):
|
| 3458 |
pipeline_pool = get_pipeline_pool(task)
|
| 3459 |
return pipeline_pool.generate(
|
| 3460 |
task=task,
|
|
|
|
| 3478 |
gpu_text = "unknown"
|
| 3479 |
concurrency = 1
|
| 3480 |
active_variant = "none"
|
| 3481 |
+
cached_variants = "none"
|
| 3482 |
if ACTIVE_PIPELINE_POOL is not None:
|
| 3483 |
active_variant = ACTIVE_PIPELINE_POOL.model_variant
|
| 3484 |
gpu_text = ACTIVE_PIPELINE_POOL.gpu_summary
|
| 3485 |
concurrency = ACTIVE_PIPELINE_POOL.size
|
| 3486 |
+
with ACTIVE_POOL_LOCK:
|
| 3487 |
+
if PIPELINE_POOLS:
|
| 3488 |
+
cached_variants = ",".join(sorted(PIPELINE_POOLS.keys()))
|
| 3489 |
return (
|
| 3490 |
f"**Status** GPU: `{gpu_text}` | Max concurrency: `{concurrency}` | "
|
| 3491 |
f"Queue limit: `{QUEUE_MAX_SIZE}` | Active model: `{active_variant}` | "
|
| 3492 |
+
f"Cached variants: `{cached_variants}`"
|
| 3493 |
)
|
| 3494 |
|
| 3495 |
|
|
|
|
| 3681 |
value=DEFAULT_FRAME_INTERPOLATION if RIFE_AVAILABLE else FRAME_INTERPOLATION_NO,
|
| 3682 |
elem_classes=["generation-control", "generation-two-line-label"],
|
| 3683 |
)
|
| 3684 |
+
with gr.Row(elem_classes=["generation-controls-row", "video-resolution-row"]) as video_resolution_row:
|
| 3685 |
+
with gr.Column(elem_classes=["lance-control-field"]):
|
| 3686 |
+
gr.HTML(build_lance_label_html("Video Resolution", "lance-generation-label"), elem_classes=["lance-label-html"])
|
| 3687 |
+
resolution = gr.Dropdown(
|
| 3688 |
+
label="Video Resolution",
|
| 3689 |
+
show_label=False,
|
| 3690 |
+
choices=VIDEO_RESOLUTION_DISPLAY_CHOICES,
|
| 3691 |
+
value=DEFAULT_RESOLUTION,
|
| 3692 |
+
allow_custom_value=True,
|
| 3693 |
+
elem_classes=["generation-control"],
|
| 3694 |
+
)
|
| 3695 |
with gr.Row(elem_classes=["generation-controls-row", "aspect-ratio-row"]) as aspect_ratio_row:
|
| 3696 |
with gr.Column(elem_classes=["lance-control-field"]):
|
| 3697 |
gr.HTML('<div class="lance-generation-label">Aspect Ratio</div>', elem_classes=["lance-label-html"])
|
|
|
|
| 3703 |
value=DEFAULT_VIDEO_ASPECT_RATIO,
|
| 3704 |
elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
|
| 3705 |
)
|
| 3706 |
+
with gr.Row(elem_classes=["generation-controls-row", "video-duration-row"]) as video_duration_row:
|
| 3707 |
+
with gr.Column(elem_classes=["lance-control-field"]):
|
| 3708 |
+
gr.HTML(build_lance_label_html("Video Duration (seconds)", "lance-generation-label"), elem_classes=["lance-label-html"])
|
| 3709 |
+
num_frames = gr.Radio(
|
| 3710 |
+
label="Video Duration (seconds)",
|
| 3711 |
+
show_label=False,
|
| 3712 |
+
choices=get_video_duration_choices(),
|
| 3713 |
+
value=DEFAULT_VIDEO_DURATION_SECONDS,
|
| 3714 |
+
elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
|
| 3715 |
+
)
|
| 3716 |
with gr.Row(elem_classes=["generation-controls-row", "output-resolution-row"], visible=False) as output_resolution_row:
|
| 3717 |
with gr.Column(elem_classes=["lance-control-field"]):
|
| 3718 |
gr.HTML('<div class="lance-generation-label">Output Resolution</div>', elem_classes=["lance-label-html"])
|
|
|
|
| 3725 |
visible=False,
|
| 3726 |
elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
|
| 3727 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3728 |
height = gr.Number(value=DEFAULT_HEIGHT, precision=0, visible=False)
|
| 3729 |
width = gr.Number(value=DEFAULT_WIDTH, precision=0, visible=False)
|
| 3730 |
|
inference_lance.py
CHANGED
|
@@ -344,33 +344,37 @@ def validate_on_fixed_batch(
|
|
| 344 |
clean_memory()
|
| 345 |
|
| 346 |
elif inference_args.task in UNDERSTANDING_TASKS:
|
| 347 |
-
|
| 348 |
-
val_packed_text_ids
|
| 349 |
-
val_packed_text_indexes
|
| 350 |
-
val_packed_position_ids
|
| 351 |
-
val_sample_N_target
|
| 352 |
-
val_split_lens
|
| 353 |
-
val_attn_modes
|
| 354 |
-
val_sample_lens
|
| 355 |
-
val_sample_type
|
| 356 |
-
val_packed_vit_tokens
|
| 357 |
-
val_vit_video_grid_thw
|
| 358 |
-
val_ce_loss_indexes
|
| 359 |
-
max_samples
|
| 360 |
-
max_length
|
| 361 |
-
device
|
| 362 |
-
dtype
|
| 363 |
-
new_token_ids
|
| 364 |
-
pad_token_id
|
| 365 |
-
vocab_size
|
| 366 |
-
caption
|
| 367 |
-
tokenizer
|
| 368 |
-
apply_chat_template
|
| 369 |
-
apply_qwen_2_5_vl_pos_emb
|
| 370 |
-
do_sample
|
| 371 |
-
image_token_id
|
| 372 |
-
index
|
| 373 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
|
| 375 |
for i_val, generated_sequence in enumerate(generated_sequence_all):
|
| 376 |
cap = tokenizer.decode(generated_sequence[:, 0])
|
|
@@ -378,7 +382,7 @@ def validate_on_fixed_batch(
|
|
| 378 |
inference_args.prompt_data_dict[index] = f"{cap}"
|
| 379 |
del generated_sequence
|
| 380 |
|
| 381 |
-
del generated_sequence_all, captions
|
| 382 |
clean_memory()
|
| 383 |
|
| 384 |
del val_data
|
|
@@ -495,9 +499,9 @@ def main():
|
|
| 495 |
training_args=training_args,
|
| 496 |
)
|
| 497 |
stage_start = time.perf_counter()
|
| 498 |
-
log_rank0("[startup]
|
| 499 |
-
model = model.to(
|
| 500 |
-
log_stage("Lance model
|
| 501 |
|
| 502 |
# Setup tokenizer for model:
|
| 503 |
stage_start = time.perf_counter()
|
|
@@ -538,10 +542,7 @@ def main():
|
|
| 538 |
else: # HACK!!!
|
| 539 |
assert model.language_model.get_input_embeddings().weight.data.data_ptr() != model.language_model.get_output_embeddings().weight.data.data_ptr(), 'tie_word_embeddings conflict'
|
| 540 |
|
| 541 |
-
|
| 542 |
-
log_rank0(f"[startup] Moving Lance model to GPU {DEVICE}")
|
| 543 |
-
model = model.to(device=DEVICE)
|
| 544 |
-
log_stage("Lance model move to GPU", stage_start)
|
| 545 |
model.eval()
|
| 546 |
if vae_model is not None and hasattr(vae_model, "eval"):
|
| 547 |
vae_model.eval()
|
|
|
|
| 344 |
clean_memory()
|
| 345 |
|
| 346 |
elif inference_args.task in UNDERSTANDING_TASKS:
|
| 347 |
+
params = {
|
| 348 |
+
"val_packed_text_ids": val_data["packed_text_ids"],
|
| 349 |
+
"val_packed_text_indexes": val_data["packed_text_indexes"],
|
| 350 |
+
"val_packed_position_ids": val_data["packed_position_ids"],
|
| 351 |
+
"val_sample_N_target": val_data["sample_N_target"],
|
| 352 |
+
"val_split_lens": val_data["split_lens"],
|
| 353 |
+
"val_attn_modes": val_data["attn_modes"],
|
| 354 |
+
"val_sample_lens": val_data["sample_lens"],
|
| 355 |
+
"val_sample_type": val_data["sample_type"],
|
| 356 |
+
"val_packed_vit_tokens": val_data["packed_vit_tokens"],
|
| 357 |
+
"val_vit_video_grid_thw": val_data["vit_video_grid_thw"],
|
| 358 |
+
"val_ce_loss_indexes": val_data["ce_loss_indexes"],
|
| 359 |
+
"max_samples": training_args.validation_max_samples,
|
| 360 |
+
"max_length": MAX_GENERATION_LENGTH,
|
| 361 |
+
"device": device,
|
| 362 |
+
"dtype": torch.bfloat16,
|
| 363 |
+
"new_token_ids": new_token_ids,
|
| 364 |
+
"pad_token_id": tokenizer.pad_token_id,
|
| 365 |
+
"vocab_size": len(tokenizer),
|
| 366 |
+
"caption": val_data.get("caption_cn", None),
|
| 367 |
+
"tokenizer": tokenizer,
|
| 368 |
+
"apply_chat_template": training_args.apply_chat_template,
|
| 369 |
+
"apply_qwen_2_5_vl_pos_emb": training_args.apply_qwen_2_5_vl_pos_emb,
|
| 370 |
+
"do_sample": False,
|
| 371 |
+
"image_token_id": image_token_id,
|
| 372 |
+
"index": val_data["index"],
|
| 373 |
+
}
|
| 374 |
+
if inference_args.use_KVcache:
|
| 375 |
+
generated_sequence_all, captions, index = fsdp_model.validation_und_KVcache(**params)
|
| 376 |
+
else:
|
| 377 |
+
generated_sequence_all, captions, index = fsdp_model.validation_video_to_text(**params)
|
| 378 |
|
| 379 |
for i_val, generated_sequence in enumerate(generated_sequence_all):
|
| 380 |
cap = tokenizer.decode(generated_sequence[:, 0])
|
|
|
|
| 382 |
inference_args.prompt_data_dict[index] = f"{cap}"
|
| 383 |
del generated_sequence
|
| 384 |
|
| 385 |
+
del generated_sequence_all, captions, params
|
| 386 |
clean_memory()
|
| 387 |
|
| 388 |
del val_data
|
|
|
|
| 499 |
training_args=training_args,
|
| 500 |
)
|
| 501 |
stage_start = time.perf_counter()
|
| 502 |
+
log_rank0(f"[startup] Moving Lance model to GPU {DEVICE}")
|
| 503 |
+
model = model.to(DEVICE)
|
| 504 |
+
log_stage("Lance model move to GPU", stage_start)
|
| 505 |
|
| 506 |
# Setup tokenizer for model:
|
| 507 |
stage_start = time.perf_counter()
|
|
|
|
| 542 |
else: # HACK!!!
|
| 543 |
assert model.language_model.get_input_embeddings().weight.data.data_ptr() != model.language_model.get_output_embeddings().weight.data.data_ptr(), 'tie_word_embeddings conflict'
|
| 544 |
|
| 545 |
+
model = model.to(device=DEVICE, dtype=torch.bfloat16)
|
|
|
|
|
|
|
|
|
|
| 546 |
model.eval()
|
| 547 |
if vae_model is not None and hasattr(vae_model, "eval"):
|
| 548 |
vae_model.eval()
|