ffy2000 committed on
Commit
96c5ff4
·
1 Parent(s): 35616fd

Vendor RIFE into repo

Browse files
Files changed (2) hide show
  1. app.py +155 -78
  2. inference_lance.py +36 -35
app.py CHANGED
@@ -21,6 +21,8 @@ from datetime import datetime
21
  from pathlib import Path
22
  from typing import Optional
23
 
 
 
24
  try:
25
  import spaces
26
  except ImportError: # pragma: no cover - keeps local CPU runs working
@@ -1556,29 +1558,21 @@ UNDERSTANDING_TASKS = {TASK_X2T_VIDEO, TASK_X2T_IMAGE}
1556
  IMAGE_TASKS = {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
1557
  VIDEO_TASKS = {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
1558
  EDIT_TASKS = {TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
1559
- VIDEO_RESOLUTION_CHOICES = ["video_360p", "video_480p"]
1560
- VIDEO_RESOLUTION_DISPLAY_CHOICES = [
1561
- ("video_360p", "video_360p"),
1562
- ("video_480p(Higher quota usage. Use sparingly.)", "video_480p"),
1563
- ]
1564
  VIDEO_EDIT_RESOLUTION_CHOICES = [DEFAULT_VIDEO_EDIT_RESOLUTION]
1565
  IMAGE_RESOLUTION_CHOICES = [DEFAULT_IMAGE_RESOLUTION]
1566
  RESOLUTION_CHOICES = VIDEO_RESOLUTION_CHOICES + IMAGE_RESOLUTION_CHOICES
1567
- CAPTION_SYSTEM_PROMPT_TEMPLATE = (
1568
- "Describe the key features of the input {vision_type}, including color, shape, size, texture, objects, background."
1569
- )
1570
- V2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="video")
1571
- I2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="image")
1572
  V2T_QA_SYSTEM_PROMPT = "View the video attentively and provide a suitable answer to the posed question."
1573
  I2T_QA_SYSTEM_PROMPT = "View the image attentively and provide a suitable answer to the posed question."
1574
-
1575
-
1576
  def get_aspect_ratio_choices_for_task(task: str) -> list[tuple[str, str]]:
1577
  """Get Aspect Ratio choices with default/recommended marker for the given task."""
1578
  internal_task = normalize_task(task)
1579
  default_ratio = DEFAULT_IMAGE_ASPECT_RATIO if internal_task in IMAGE_TASKS else DEFAULT_VIDEO_ASPECT_RATIO
1580
  return [
1581
- (f"{ratio}" if ratio == default_ratio else ratio, ratio)
1582
  for ratio in ASPECT_RATIO_CHOICES
1583
  ]
1584
 
@@ -2817,6 +2811,8 @@ class LanceT2VV2TPipeline:
2817
  data_args=request_data_args,
2818
  inference_args=request_inference_args,
2819
  )
 
 
2820
  generate_start = time.perf_counter()
2821
  validate_on_fixed_batch(
2822
  fsdp_model=self.model,
@@ -3047,7 +3043,13 @@ class PipelinePool:
3047
  def gpu_summary(self) -> str:
3048
  return ",".join(str(gpu_id) for gpu_id in self.gpu_ids)
3049
 
 
 
 
 
3050
  def initialize_all(self) -> None:
 
 
3051
  print(f"[startup][{self.model_variant}] Preparing parallel GPU preload: {self.gpu_ids}", flush=True)
3052
  exceptions: list[Exception] = []
3053
  with concurrent.futures.ThreadPoolExecutor(max_workers=self.size) as executor:
@@ -3135,6 +3137,7 @@ class PipelinePool:
3135
  self.release(pipeline)
3136
 
3137
 
 
3138
  ACTIVE_PIPELINE_POOL: Optional[PipelinePool] = None
3139
  ACTIVE_POOL_LOCK = threading.Lock()
3140
  QUEUE_MAX_SIZE = DEFAULT_QUEUE_SIZE
@@ -3209,9 +3212,53 @@ def clamp_zerogpu_duration(seconds: int) -> int:
3209
 
3210
  ZERO_GPU_RUN_TASK_DURATION_SECONDS = get_zerogpu_duration_cap()
3211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3212
  def is_pipeline_pool_ready_for_task(task: str) -> bool:
3213
- """Retained for compatibility with earlier duration logic."""
3214
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3215
 
3216
 
3217
  def finalize_zerogpu_duration(estimated_seconds: float, task: str) -> int:
@@ -3258,30 +3305,37 @@ def _estimate_zerogpu_duration_seconds(
3258
  prompt_length = len((prompt or "").strip())
3259
  has_video_input = bool((input_video or "").strip())
3260
  has_image_input = bool((input_image or "").strip())
 
3261
  is_video_task = internal_task in {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
3262
  is_image_task = internal_task in {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
3263
 
3264
  if internal_task == TASK_T2I:
3265
- return 150
3266
 
3267
  if internal_task == TASK_IMAGE_EDIT:
3268
- return 150
3269
 
3270
  if internal_task == TASK_X2T_IMAGE:
3271
- return 150
3272
 
3273
  if internal_task == TASK_X2T_VIDEO:
3274
- return 200
3275
 
3276
  if internal_task == TASK_VIDEO_EDIT:
3277
- base = 300
3278
- base += min(48, max(0, num_frames - 37) // 2)
3279
- base += 32 if enable_frame_interpolation else 0
3280
- base += 20 if has_video_input else 0
3281
- base += 16 if resolution == "video_480p" else 0
3282
  return base
3283
 
3284
  if internal_task == TASK_T2V:
 
 
 
 
 
 
3285
  base = 224 if resolution == "video_360p" else 264
3286
  base += min(56, max(0, num_frames - 37) // 2)
3287
  base += 28 if enable_frame_interpolation else 0
@@ -3289,13 +3343,13 @@ def _estimate_zerogpu_duration_seconds(
3289
  return base
3290
 
3291
  if is_video_task:
3292
- base = 240
3293
- base += min(40, max(0, num_frames - 37) // 2)
3294
- base += 24 if enable_frame_interpolation else 0
3295
  return base
3296
 
3297
  if is_image_task:
3298
- return 120
3299
 
3300
  return 160
3301
 
@@ -3335,34 +3389,6 @@ def get_run_task_gpu_duration(
3335
  return finalize_zerogpu_duration(estimated_seconds, task)
3336
 
3337
 
3338
- def get_pipeline_pool(task: str) -> PipelinePool:
3339
- global ACTIVE_PIPELINE_POOL
3340
- if not torch.cuda.is_available():
3341
- raise RuntimeError(
3342
- "Lance inference requires a GPU. The Gradio UI can start on CPU, but generation is disabled "
3343
- "until GPU hardware is attached."
3344
- )
3345
- model_variant = get_task_model_variant(task)
3346
- with ACTIVE_POOL_LOCK:
3347
- if ACTIVE_PIPELINE_POOL is not None and ACTIVE_PIPELINE_POOL.model_variant == model_variant:
3348
- return ACTIVE_PIPELINE_POOL
3349
-
3350
- gpu_ids = parse_gpu_ids(os.getenv("LANCE_GPUS", DEFAULT_GPUS))
3351
- if ACTIVE_PIPELINE_POOL is not None:
3352
- previous_variant = ACTIVE_PIPELINE_POOL.model_variant
3353
- print(
3354
- f"[runtime] Switching Lance model from {previous_variant} to {model_variant}.",
3355
- flush=True,
3356
- )
3357
- ACTIVE_PIPELINE_POOL.unload_all()
3358
- ACTIVE_PIPELINE_POOL = None
3359
-
3360
- ACTIVE_PIPELINE_POOL = PipelinePool(gpu_ids, model_variant=model_variant)
3361
- ACTIVE_PIPELINE_POOL.initialize_all()
3362
- return ACTIVE_PIPELINE_POOL
3363
-
3364
-
3365
- @spaces.GPU(size="large", duration=get_run_task_gpu_duration)
3366
  def run_task(
3367
  task: str,
3368
  prompt: str,
@@ -3380,8 +3406,55 @@ def run_task(
3380
  enable_frame_interpolation: bool,
3381
  ):
3382
  internal_task = normalize_task(task)
 
 
 
 
 
 
 
 
 
 
 
3383
  if internal_task == TASK_T2V:
3384
  num_frames = video_seconds_to_num_frames(num_frames)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3385
  pipeline_pool = get_pipeline_pool(task)
3386
  return pipeline_pool.generate(
3387
  task=task,
@@ -3405,14 +3478,18 @@ def build_status_markdown() -> str:
3405
  gpu_text = "unknown"
3406
  concurrency = 1
3407
  active_variant = "none"
 
3408
  if ACTIVE_PIPELINE_POOL is not None:
3409
  active_variant = ACTIVE_PIPELINE_POOL.model_variant
3410
  gpu_text = ACTIVE_PIPELINE_POOL.gpu_summary
3411
  concurrency = ACTIVE_PIPELINE_POOL.size
 
 
 
3412
  return (
3413
  f"**Status** GPU: `{gpu_text}` | Max concurrency: `{concurrency}` | "
3414
  f"Queue limit: `{QUEUE_MAX_SIZE}` | Active model: `{active_variant}` | "
3415
- f"Switch mode: `unload then load`"
3416
  )
3417
 
3418
 
@@ -3604,6 +3681,17 @@ def build_demo() -> gr.Blocks:
3604
  value=DEFAULT_FRAME_INTERPOLATION if RIFE_AVAILABLE else FRAME_INTERPOLATION_NO,
3605
  elem_classes=["generation-control", "generation-two-line-label"],
3606
  )
 
 
 
 
 
 
 
 
 
 
 
3607
  with gr.Row(elem_classes=["generation-controls-row", "aspect-ratio-row"]) as aspect_ratio_row:
3608
  with gr.Column(elem_classes=["lance-control-field"]):
3609
  gr.HTML('<div class="lance-generation-label">Aspect Ratio</div>', elem_classes=["lance-label-html"])
@@ -3615,6 +3703,16 @@ def build_demo() -> gr.Blocks:
3615
  value=DEFAULT_VIDEO_ASPECT_RATIO,
3616
  elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
3617
  )
 
 
 
 
 
 
 
 
 
 
3618
  with gr.Row(elem_classes=["generation-controls-row", "output-resolution-row"], visible=False) as output_resolution_row:
3619
  with gr.Column(elem_classes=["lance-control-field"]):
3620
  gr.HTML('<div class="lance-generation-label">Output Resolution</div>', elem_classes=["lance-label-html"])
@@ -3627,27 +3725,6 @@ def build_demo() -> gr.Blocks:
3627
  visible=False,
3628
  elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
3629
  )
3630
- with gr.Row(elem_classes=["generation-controls-row", "video-duration-row"]) as video_duration_row:
3631
- with gr.Column(elem_classes=["lance-control-field"]):
3632
- gr.HTML(build_lance_label_html("Video Duration (seconds)", "lance-generation-label"), elem_classes=["lance-label-html"])
3633
- num_frames = gr.Radio(
3634
- label="Video Duration (seconds)",
3635
- show_label=False,
3636
- choices=get_video_duration_choices(),
3637
- value=DEFAULT_VIDEO_DURATION_SECONDS,
3638
- elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
3639
- )
3640
- with gr.Row(elem_classes=["generation-controls-row", "video-resolution-row"]) as video_resolution_row:
3641
- with gr.Column(elem_classes=["lance-control-field"]):
3642
- gr.HTML(build_lance_label_html("Video Resolution", "lance-generation-label"), elem_classes=["lance-label-html"])
3643
- resolution = gr.Dropdown(
3644
- label="Video Resolution",
3645
- show_label=False,
3646
- choices=VIDEO_RESOLUTION_DISPLAY_CHOICES,
3647
- value=DEFAULT_RESOLUTION,
3648
- allow_custom_value=True,
3649
- elem_classes=["generation-control"],
3650
- )
3651
  height = gr.Number(value=DEFAULT_HEIGHT, precision=0, visible=False)
3652
  width = gr.Number(value=DEFAULT_WIDTH, precision=0, visible=False)
3653
 
 
21
  from pathlib import Path
22
  from typing import Optional
23
 
24
+ os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:128")
25
+
26
  try:
27
  import spaces
28
  except ImportError: # pragma: no cover - keeps local CPU runs working
 
1558
  IMAGE_TASKS = {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
1559
  VIDEO_TASKS = {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
1560
  EDIT_TASKS = {TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
1561
+ VIDEO_RESOLUTION_CHOICES = [DEFAULT_RESOLUTION]
 
 
 
 
1562
  VIDEO_EDIT_RESOLUTION_CHOICES = [DEFAULT_VIDEO_EDIT_RESOLUTION]
1563
  IMAGE_RESOLUTION_CHOICES = [DEFAULT_IMAGE_RESOLUTION]
1564
  RESOLUTION_CHOICES = VIDEO_RESOLUTION_CHOICES + IMAGE_RESOLUTION_CHOICES
1565
+ VIDEO_RESOLUTION_DISPLAY_CHOICES = [("360p", "video_360p"), ("480p", "video_480p")]
 
 
 
 
1566
  V2T_QA_SYSTEM_PROMPT = "View the video attentively and provide a suitable answer to the posed question."
1567
  I2T_QA_SYSTEM_PROMPT = "View the image attentively and provide a suitable answer to the posed question."
1568
+
1569
+
1570
def get_aspect_ratio_choices_for_task(task: str) -> list[tuple[str, str]]:
    """Build the (label, value) aspect-ratio choices for ``task``.

    The ratio matching the task's default (image default for tasks in
    ``IMAGE_TASKS``, video default otherwise) gets a `` (default)`` suffix on
    its label; the value is always the bare ratio string.
    """
    if normalize_task(task) in IMAGE_TASKS:
        recommended = DEFAULT_IMAGE_ASPECT_RATIO
    else:
        recommended = DEFAULT_VIDEO_ASPECT_RATIO

    choices = []
    for ratio in ASPECT_RATIO_CHOICES:
        label = f"{ratio} (default)" if ratio == recommended else ratio
        choices.append((label, ratio))
    return choices
1578
 
 
2811
  data_args=request_data_args,
2812
  inference_args=request_inference_args,
2813
  )
2814
+ # Keep the allocator from fragmenting before the heavy forward pass.
2815
+ clean_memory()
2816
  generate_start = time.perf_counter()
2817
  validate_on_fixed_batch(
2818
  fsdp_model=self.model,
 
3043
  def gpu_summary(self) -> str:
3044
  return ",".join(str(gpu_id) for gpu_id in self.gpu_ids)
3045
 
3046
@property
def is_initialized(self) -> bool:
    """Report whether every pipeline in ``self.pipelines`` is initialized.

    Returns ``True`` when no pipeline has a falsy ``initialized`` attribute
    (which includes the case of an empty ``self.pipelines``).
    """
    for pipeline in self.pipelines:
        if not pipeline.initialized:
            return False
    return True
3049
+
3050
  def initialize_all(self) -> None:
3051
+ if self.is_initialized:
3052
+ return
3053
  print(f"[startup][{self.model_variant}] Preparing parallel GPU preload: {self.gpu_ids}", flush=True)
3054
  exceptions: list[Exception] = []
3055
  with concurrent.futures.ThreadPoolExecutor(max_workers=self.size) as executor:
 
3137
  self.release(pipeline)
3138
 
3139
 
3140
+ PIPELINE_POOLS: dict[str, PipelinePool] = {}
3141
  ACTIVE_PIPELINE_POOL: Optional[PipelinePool] = None
3142
  ACTIVE_POOL_LOCK = threading.Lock()
3143
  QUEUE_MAX_SIZE = DEFAULT_QUEUE_SIZE
 
3212
 
3213
  ZERO_GPU_RUN_TASK_DURATION_SECONDS = get_zerogpu_duration_cap()
3214
 
3215
+
3216
def get_other_model_variant(model_variant: str) -> str:
    """Return the counterpart of ``model_variant``.

    The input is normalized first. The video variant maps to the image
    variant; any other normalized value maps to the video variant.
    """
    if normalize_model_variant(model_variant) == MODEL_VARIANT_VIDEO:
        return MODEL_VARIANT_IMAGE
    return MODEL_VARIANT_VIDEO
3219
+
3220
+
3221
def is_pipeline_pool_ready_for_variant(model_variant: str) -> bool:
    """Tell whether a cached, initialized pool exists for ``model_variant``.

    The variant is normalized and looked up in ``PIPELINE_POOLS`` while
    holding ``ACTIVE_POOL_LOCK``. A missing entry yields ``False``; otherwise
    the pool's ``is_initialized`` flag decides the result.
    """
    variant_key = normalize_model_variant(model_variant)
    with ACTIVE_POOL_LOCK:
        cached_pool = PIPELINE_POOLS.get(variant_key)
    if cached_pool is None:
        return False
    return bool(cached_pool.is_initialized)
3226
+
3227
+
3228
def is_pipeline_pool_ready_for_task(task: str) -> bool:
    """Tell whether the pool for ``task``'s model variant is ready.

    Resolves the task to its model variant and delegates to
    ``is_pipeline_pool_ready_for_variant``.
    """
    variant = get_task_model_variant(task)
    return is_pipeline_pool_ready_for_variant(variant)
3230
+
3231
+
3232
def get_or_create_pipeline_pool(model_variant: str) -> PipelinePool:
    """Return the cached pool for ``model_variant``, creating it if absent.

    The variant is normalized and used as the key into ``PIPELINE_POOLS``.
    A newly created pool is built from the GPU ids parsed out of the
    ``LANCE_GPUS`` environment variable (falling back to ``DEFAULT_GPUS``)
    and stored in the cache. This function does not call ``initialize_all``
    on the pool.

    Raises:
        RuntimeError: if ``torch.cuda.is_available()`` is false.
    """
    if not torch.cuda.is_available():
        no_gpu_message = (
            "Lance inference requires a GPU. The Gradio UI can start on CPU, but generation is disabled "
            "until GPU hardware is attached."
        )
        raise RuntimeError(no_gpu_message)

    variant_key = normalize_model_variant(model_variant)
    gpu_ids = parse_gpu_ids(os.getenv("LANCE_GPUS", DEFAULT_GPUS))

    # Lookup and insertion happen under one lock acquisition so that two
    # callers cannot both register a pool for the same variant.
    with ACTIVE_POOL_LOCK:
        existing = PIPELINE_POOLS.get(variant_key)
        if existing is not None:
            return existing
        created = PipelinePool(gpu_ids, model_variant=variant_key)
        PIPELINE_POOLS[variant_key] = created
        return created
3246
+
3247
+
3248
def ensure_pipeline_pool_ready(model_variant: str) -> PipelinePool:
    """Return the pool for ``model_variant``, initializing it when needed.

    The pool comes from ``get_or_create_pipeline_pool``; ``initialize_all``
    is called only when the pool does not already report ``is_initialized``.
    """
    ready_pool = get_or_create_pipeline_pool(model_variant)
    if ready_pool.is_initialized:
        return ready_pool
    ready_pool.initialize_all()
    return ready_pool
3253
+
3254
+
3255
def get_pipeline_pool(task: str) -> PipelinePool:
    """Return a ready pool for ``task`` and record it as the active pool.

    The task is resolved to its model variant, the matching pool is obtained
    through ``ensure_pipeline_pool_ready``, and ``ACTIVE_PIPELINE_POOL`` is
    rebound to that pool while holding ``ACTIVE_POOL_LOCK``.
    """
    global ACTIVE_PIPELINE_POOL
    selected_pool = ensure_pipeline_pool_ready(get_task_model_variant(task))
    with ACTIVE_POOL_LOCK:
        ACTIVE_PIPELINE_POOL = selected_pool
    return selected_pool
3262
 
3263
 
3264
  def finalize_zerogpu_duration(estimated_seconds: float, task: str) -> int:
 
3305
  prompt_length = len((prompt or "").strip())
3306
  has_video_input = bool((input_video or "").strip())
3307
  has_image_input = bool((input_image or "").strip())
3308
+ pool_ready = is_pipeline_pool_ready_for_task(internal_task)
3309
  is_video_task = internal_task in {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
3310
  is_image_task = internal_task in {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
3311
 
3312
  if internal_task == TASK_T2I:
3313
+ return 90 if pool_ready else 150
3314
 
3315
  if internal_task == TASK_IMAGE_EDIT:
3316
+ return 100 if pool_ready else 150
3317
 
3318
  if internal_task == TASK_X2T_IMAGE:
3319
+ return 90 if pool_ready else 150
3320
 
3321
  if internal_task == TASK_X2T_VIDEO:
3322
+ return 120 if pool_ready else 200
3323
 
3324
  if internal_task == TASK_VIDEO_EDIT:
3325
+ base = 170 if pool_ready else 300
3326
+ base += min(30 if pool_ready else 48, max(0, num_frames - 37) // 3)
3327
+ base += 24 if enable_frame_interpolation else 0
3328
+ base += 16 if has_video_input else 0
3329
+ base += 10 if resolution == "video_480p" else 0
3330
  return base
3331
 
3332
  if internal_task == TASK_T2V:
3333
+ if pool_ready:
3334
+ base = 130 if resolution == "video_360p" else 150
3335
+ base += min(36, max(0, num_frames - 37) // 3)
3336
+ base += 18 if enable_frame_interpolation else 0
3337
+ base += min(12, prompt_length // 320)
3338
+ return base
3339
  base = 224 if resolution == "video_360p" else 264
3340
  base += min(56, max(0, num_frames - 37) // 2)
3341
  base += 28 if enable_frame_interpolation else 0
 
3343
  return base
3344
 
3345
  if is_video_task:
3346
+ base = 150 if pool_ready else 240
3347
+ base += min(28 if pool_ready else 40, max(0, num_frames - 37) // 3)
3348
+ base += 18 if enable_frame_interpolation else 0
3349
  return base
3350
 
3351
  if is_image_task:
3352
+ return 100 if pool_ready else 120
3353
 
3354
  return 160
3355
 
 
3389
  return finalize_zerogpu_duration(estimated_seconds, task)
3390
 
3391
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3392
  def run_task(
3393
  task: str,
3394
  prompt: str,
 
3406
  enable_frame_interpolation: bool,
3407
  ):
3408
  internal_task = normalize_task(task)
3409
+ if internal_task in UNDERSTANDING_TASKS and not prompt:
3410
+ return None, None, "", "Please enter a question.", ""
3411
+ if internal_task in {TASK_VIDEO_EDIT, TASK_X2T_VIDEO} and not input_video:
3412
+ return None, None, "", "Please upload an input video.", ""
3413
+ if internal_task in {TASK_IMAGE_EDIT, TASK_X2T_IMAGE} and not input_image:
3414
+ return None, None, "", "Please upload an input image.", ""
3415
+ if height <= 0 or width <= 0:
3416
+ return None, None, "", "Height and width must be greater than 0.", ""
3417
+ if num_frames <= 0:
3418
+ return None, None, "", "The number of frames must be greater than 0.", ""
3419
+
3420
  if internal_task == TASK_T2V:
3421
  num_frames = video_seconds_to_num_frames(num_frames)
3422
+ normalized_resolution = normalize_resolution_for_backend(str(resolution), internal_task)
3423
+ return run_task_gpu(
3424
+ task=task,
3425
+ prompt=prompt,
3426
+ system_prompt=system_prompt,
3427
+ input_video=input_video,
3428
+ input_image=input_image,
3429
+ height=height,
3430
+ width=width,
3431
+ num_frames=num_frames,
3432
+ seed=seed,
3433
+ resolution=normalized_resolution,
3434
+ validation_num_timesteps=validation_num_timesteps,
3435
+ validation_timestep_shift=validation_timestep_shift,
3436
+ cfg_text_scale=cfg_text_scale,
3437
+ enable_frame_interpolation=enable_frame_interpolation,
3438
+ )
3439
+
3440
+
3441
+ @spaces.GPU(size="large", duration=get_run_task_gpu_duration)
3442
+ def run_task_gpu(
3443
+ task: str,
3444
+ prompt: str,
3445
+ system_prompt: Optional[str],
3446
+ input_video: Optional[str],
3447
+ input_image: Optional[str],
3448
+ height: int,
3449
+ width: int,
3450
+ num_frames: int,
3451
+ seed: int,
3452
+ resolution: str,
3453
+ validation_num_timesteps: int,
3454
+ validation_timestep_shift: float,
3455
+ cfg_text_scale: float,
3456
+ enable_frame_interpolation: bool,
3457
+ ):
3458
  pipeline_pool = get_pipeline_pool(task)
3459
  return pipeline_pool.generate(
3460
  task=task,
 
3478
  gpu_text = "unknown"
3479
  concurrency = 1
3480
  active_variant = "none"
3481
+ cached_variants = "none"
3482
  if ACTIVE_PIPELINE_POOL is not None:
3483
  active_variant = ACTIVE_PIPELINE_POOL.model_variant
3484
  gpu_text = ACTIVE_PIPELINE_POOL.gpu_summary
3485
  concurrency = ACTIVE_PIPELINE_POOL.size
3486
+ with ACTIVE_POOL_LOCK:
3487
+ if PIPELINE_POOLS:
3488
+ cached_variants = ",".join(sorted(PIPELINE_POOLS.keys()))
3489
  return (
3490
  f"**Status** GPU: `{gpu_text}` | Max concurrency: `{concurrency}` | "
3491
  f"Queue limit: `{QUEUE_MAX_SIZE}` | Active model: `{active_variant}` | "
3492
+ f"Cached variants: `{cached_variants}`"
3493
  )
3494
 
3495
 
 
3681
  value=DEFAULT_FRAME_INTERPOLATION if RIFE_AVAILABLE else FRAME_INTERPOLATION_NO,
3682
  elem_classes=["generation-control", "generation-two-line-label"],
3683
  )
3684
+ with gr.Row(elem_classes=["generation-controls-row", "video-resolution-row"]) as video_resolution_row:
3685
+ with gr.Column(elem_classes=["lance-control-field"]):
3686
+ gr.HTML(build_lance_label_html("Video Resolution", "lance-generation-label"), elem_classes=["lance-label-html"])
3687
+ resolution = gr.Dropdown(
3688
+ label="Video Resolution",
3689
+ show_label=False,
3690
+ choices=VIDEO_RESOLUTION_DISPLAY_CHOICES,
3691
+ value=DEFAULT_RESOLUTION,
3692
+ allow_custom_value=True,
3693
+ elem_classes=["generation-control"],
3694
+ )
3695
  with gr.Row(elem_classes=["generation-controls-row", "aspect-ratio-row"]) as aspect_ratio_row:
3696
  with gr.Column(elem_classes=["lance-control-field"]):
3697
  gr.HTML('<div class="lance-generation-label">Aspect Ratio</div>', elem_classes=["lance-label-html"])
 
3703
  value=DEFAULT_VIDEO_ASPECT_RATIO,
3704
  elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
3705
  )
3706
+ with gr.Row(elem_classes=["generation-controls-row", "video-duration-row"]) as video_duration_row:
3707
+ with gr.Column(elem_classes=["lance-control-field"]):
3708
+ gr.HTML(build_lance_label_html("Video Duration (seconds)", "lance-generation-label"), elem_classes=["lance-label-html"])
3709
+ num_frames = gr.Radio(
3710
+ label="Video Duration (seconds)",
3711
+ show_label=False,
3712
+ choices=get_video_duration_choices(),
3713
+ value=DEFAULT_VIDEO_DURATION_SECONDS,
3714
+ elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
3715
+ )
3716
  with gr.Row(elem_classes=["generation-controls-row", "output-resolution-row"], visible=False) as output_resolution_row:
3717
  with gr.Column(elem_classes=["lance-control-field"]):
3718
  gr.HTML('<div class="lance-generation-label">Output Resolution</div>', elem_classes=["lance-label-html"])
 
3725
  visible=False,
3726
  elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
3727
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3728
  height = gr.Number(value=DEFAULT_HEIGHT, precision=0, visible=False)
3729
  width = gr.Number(value=DEFAULT_WIDTH, precision=0, visible=False)
3730
 
inference_lance.py CHANGED
@@ -344,33 +344,37 @@ def validate_on_fixed_batch(
344
  clean_memory()
345
 
346
  elif inference_args.task in UNDERSTANDING_TASKS:
347
- generated_sequence_all, captions, index = fsdp_model.validation_video_to_text(
348
- val_packed_text_ids=val_data["packed_text_ids"],
349
- val_packed_text_indexes=val_data["packed_text_indexes"],
350
- val_packed_position_ids=val_data["packed_position_ids"],
351
- val_sample_N_target=val_data["sample_N_target"],
352
- val_split_lens=val_data["split_lens"],
353
- val_attn_modes=val_data["attn_modes"],
354
- val_sample_lens=val_data["sample_lens"],
355
- val_sample_type=val_data["sample_type"],
356
- val_packed_vit_tokens=val_data["packed_vit_tokens"],
357
- val_vit_video_grid_thw=val_data["vit_video_grid_thw"],
358
- val_ce_loss_indexes=val_data["ce_loss_indexes"],
359
- max_samples=training_args.validation_max_samples,
360
- max_length=MAX_GENERATION_LENGTH,
361
- device=device,
362
- dtype=torch.bfloat16,
363
- new_token_ids=new_token_ids,
364
- pad_token_id=tokenizer.pad_token_id,
365
- vocab_size=len(tokenizer),
366
- caption=val_data.get("caption_cn", None),
367
- tokenizer=tokenizer,
368
- apply_chat_template=training_args.apply_chat_template,
369
- apply_qwen_2_5_vl_pos_emb=training_args.apply_qwen_2_5_vl_pos_emb,
370
- do_sample=False,
371
- image_token_id=image_token_id,
372
- index=val_data["index"],
373
- )
 
 
 
 
374
 
375
  for i_val, generated_sequence in enumerate(generated_sequence_all):
376
  cap = tokenizer.decode(generated_sequence[:, 0])
@@ -378,7 +382,7 @@ def validate_on_fixed_batch(
378
  inference_args.prompt_data_dict[index] = f"{cap}"
379
  del generated_sequence
380
 
381
- del generated_sequence_all, captions
382
  clean_memory()
383
 
384
  del val_data
@@ -495,9 +499,9 @@ def main():
495
  training_args=training_args,
496
  )
497
  stage_start = time.perf_counter()
498
- log_rank0("[startup] Casting Lance model to bf16 on CPU")
499
- model = model.to(dtype=torch.bfloat16)
500
- log_stage("Lance model bf16 cast", stage_start)
501
 
502
  # Setup tokenizer for model:
503
  stage_start = time.perf_counter()
@@ -538,10 +542,7 @@ def main():
538
  else: # HACK!!!
539
  assert model.language_model.get_input_embeddings().weight.data.data_ptr() != model.language_model.get_output_embeddings().weight.data.data_ptr(), 'tie_word_embeddings conflict'
540
 
541
- stage_start = time.perf_counter()
542
- log_rank0(f"[startup] Moving Lance model to GPU {DEVICE}")
543
- model = model.to(device=DEVICE)
544
- log_stage("Lance model move to GPU", stage_start)
545
  model.eval()
546
  if vae_model is not None and hasattr(vae_model, "eval"):
547
  vae_model.eval()
 
344
  clean_memory()
345
 
346
  elif inference_args.task in UNDERSTANDING_TASKS:
347
+ params = {
348
+ "val_packed_text_ids": val_data["packed_text_ids"],
349
+ "val_packed_text_indexes": val_data["packed_text_indexes"],
350
+ "val_packed_position_ids": val_data["packed_position_ids"],
351
+ "val_sample_N_target": val_data["sample_N_target"],
352
+ "val_split_lens": val_data["split_lens"],
353
+ "val_attn_modes": val_data["attn_modes"],
354
+ "val_sample_lens": val_data["sample_lens"],
355
+ "val_sample_type": val_data["sample_type"],
356
+ "val_packed_vit_tokens": val_data["packed_vit_tokens"],
357
+ "val_vit_video_grid_thw": val_data["vit_video_grid_thw"],
358
+ "val_ce_loss_indexes": val_data["ce_loss_indexes"],
359
+ "max_samples": training_args.validation_max_samples,
360
+ "max_length": MAX_GENERATION_LENGTH,
361
+ "device": device,
362
+ "dtype": torch.bfloat16,
363
+ "new_token_ids": new_token_ids,
364
+ "pad_token_id": tokenizer.pad_token_id,
365
+ "vocab_size": len(tokenizer),
366
+ "caption": val_data.get("caption_cn", None),
367
+ "tokenizer": tokenizer,
368
+ "apply_chat_template": training_args.apply_chat_template,
369
+ "apply_qwen_2_5_vl_pos_emb": training_args.apply_qwen_2_5_vl_pos_emb,
370
+ "do_sample": False,
371
+ "image_token_id": image_token_id,
372
+ "index": val_data["index"],
373
+ }
374
+ if inference_args.use_KVcache:
375
+ generated_sequence_all, captions, index = fsdp_model.validation_und_KVcache(**params)
376
+ else:
377
+ generated_sequence_all, captions, index = fsdp_model.validation_video_to_text(**params)
378
 
379
  for i_val, generated_sequence in enumerate(generated_sequence_all):
380
  cap = tokenizer.decode(generated_sequence[:, 0])
 
382
  inference_args.prompt_data_dict[index] = f"{cap}"
383
  del generated_sequence
384
 
385
+ del generated_sequence_all, captions, params
386
  clean_memory()
387
 
388
  del val_data
 
499
  training_args=training_args,
500
  )
501
  stage_start = time.perf_counter()
502
+ log_rank0(f"[startup] Moving Lance model to GPU {DEVICE}")
503
+ model = model.to(DEVICE)
504
+ log_stage("Lance model move to GPU", stage_start)
505
 
506
  # Setup tokenizer for model:
507
  stage_start = time.perf_counter()
 
542
  else: # HACK!!!
543
  assert model.language_model.get_input_embeddings().weight.data.data_ptr() != model.language_model.get_output_embeddings().weight.data.data_ptr(), 'tie_word_embeddings conflict'
544
 
545
+ model = model.to(device=DEVICE, dtype=torch.bfloat16)
 
 
 
546
  model.eval()
547
  if vae_model is not None and hasattr(vae_model, "eval"):
548
  vae_model.eval()