Spaces:

bytedance-research
/

Lance

Running on Zero

App Files Community

ffy2000 committed 3 days ago

Commit

96c5ff4

1 Parent(s): 35616fd

Vendor RIFE into repo

Browse files

Files changed (2) hide show

app.py +155 -78
inference_lance.py +36 -35

app.py CHANGED Viewed

@@ -21,6 +21,8 @@ from datetime import datetime
 from pathlib import Path
 from typing import Optional
 try:
     import spaces
 except ImportError:  # pragma: no cover - keeps local CPU runs working
@@ -1556,29 +1558,21 @@ UNDERSTANDING_TASKS = {TASK_X2T_VIDEO, TASK_X2T_IMAGE}
 IMAGE_TASKS = {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
 VIDEO_TASKS = {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
 EDIT_TASKS = {TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
-VIDEO_RESOLUTION_CHOICES = ["video_360p", "video_480p"]
-VIDEO_RESOLUTION_DISPLAY_CHOICES = [
-    ("video_360p", "video_360p"),
-    ("video_480p（Higher quota usage. Use sparingly.）", "video_480p"),
-]
 VIDEO_EDIT_RESOLUTION_CHOICES = [DEFAULT_VIDEO_EDIT_RESOLUTION]
 IMAGE_RESOLUTION_CHOICES = [DEFAULT_IMAGE_RESOLUTION]
 RESOLUTION_CHOICES = VIDEO_RESOLUTION_CHOICES + IMAGE_RESOLUTION_CHOICES
-CAPTION_SYSTEM_PROMPT_TEMPLATE = (
-    "Describe the key features of the input {vision_type}, including color, shape, size, texture, objects, background."
-)
-V2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="video")
-I2T_CAPTION_SYSTEM_PROMPT = CAPTION_SYSTEM_PROMPT_TEMPLATE.format(vision_type="image")
 V2T_QA_SYSTEM_PROMPT = "View the video  attentively and provide a suitable answer to the posed question."
 I2T_QA_SYSTEM_PROMPT = "View the image attentively and provide a suitable answer to the posed question."
 def get_aspect_ratio_choices_for_task(task: str) -> list[tuple[str, str]]:
     """Get Aspect Ratio choices with default/recommended marker for the given task."""
     internal_task = normalize_task(task)
     default_ratio = DEFAULT_IMAGE_ASPECT_RATIO if internal_task in IMAGE_TASKS else DEFAULT_VIDEO_ASPECT_RATIO
     return [
-        (f"{ratio}" if ratio == default_ratio else ratio, ratio)
         for ratio in ASPECT_RATIO_CHOICES
     ]
@@ -2817,6 +2811,8 @@ class LanceT2VV2TPipeline:
                     data_args=request_data_args,
                     inference_args=request_inference_args,
                 )
                 generate_start = time.perf_counter()
                 validate_on_fixed_batch(
                     fsdp_model=self.model,
@@ -3047,7 +3043,13 @@ class PipelinePool:
     def gpu_summary(self) -> str:
         return ",".join(str(gpu_id) for gpu_id in self.gpu_ids)
     def initialize_all(self) -> None:
         print(f"[startup][{self.model_variant}] Preparing parallel GPU preload: {self.gpu_ids}", flush=True)
         exceptions: list[Exception] = []
         with concurrent.futures.ThreadPoolExecutor(max_workers=self.size) as executor:
@@ -3135,6 +3137,7 @@ class PipelinePool:
             self.release(pipeline)
 ACTIVE_PIPELINE_POOL: Optional[PipelinePool] = None
 ACTIVE_POOL_LOCK = threading.Lock()
 QUEUE_MAX_SIZE = DEFAULT_QUEUE_SIZE
@@ -3209,9 +3212,53 @@ def clamp_zerogpu_duration(seconds: int) -> int:
 ZERO_GPU_RUN_TASK_DURATION_SECONDS = get_zerogpu_duration_cap()
 def is_pipeline_pool_ready_for_task(task: str) -> bool:
-    """Retained for compatibility with earlier duration logic."""
-    return False
 def finalize_zerogpu_duration(estimated_seconds: float, task: str) -> int:
@@ -3258,30 +3305,37 @@ def _estimate_zerogpu_duration_seconds(
     prompt_length = len((prompt or "").strip())
     has_video_input = bool((input_video or "").strip())
     has_image_input = bool((input_image or "").strip())
     is_video_task = internal_task in {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
     is_image_task = internal_task in {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
     if internal_task == TASK_T2I:
-        return 150
     if internal_task == TASK_IMAGE_EDIT:
-        return 150
     if internal_task == TASK_X2T_IMAGE:
-        return 150
     if internal_task == TASK_X2T_VIDEO:
-        return 200
     if internal_task == TASK_VIDEO_EDIT:
-        base = 300
-        base += min(48, max(0, num_frames - 37) // 2)
-        base += 32 if enable_frame_interpolation else 0
-        base += 20 if has_video_input else 0
-        base += 16 if resolution == "video_480p" else 0
         return base
     if internal_task == TASK_T2V:
         base = 224 if resolution == "video_360p" else 264
         base += min(56, max(0, num_frames - 37) // 2)
         base += 28 if enable_frame_interpolation else 0
@@ -3289,13 +3343,13 @@ def _estimate_zerogpu_duration_seconds(
         return base
     if is_video_task:
-        base = 240
-        base += min(40, max(0, num_frames - 37) // 2)
-        base += 24 if enable_frame_interpolation else 0
         return base
     if is_image_task:
-        return 120
     return 160
@@ -3335,34 +3389,6 @@ def get_run_task_gpu_duration(
     return finalize_zerogpu_duration(estimated_seconds, task)
-def get_pipeline_pool(task: str) -> PipelinePool:
-    global ACTIVE_PIPELINE_POOL
-    if not torch.cuda.is_available():
-        raise RuntimeError(
-            "Lance inference requires a GPU. The Gradio UI can start on CPU, but generation is disabled "
-            "until GPU hardware is attached."
-        )
-    model_variant = get_task_model_variant(task)
-    with ACTIVE_POOL_LOCK:
-        if ACTIVE_PIPELINE_POOL is not None and ACTIVE_PIPELINE_POOL.model_variant == model_variant:
-            return ACTIVE_PIPELINE_POOL
-        gpu_ids = parse_gpu_ids(os.getenv("LANCE_GPUS", DEFAULT_GPUS))
-        if ACTIVE_PIPELINE_POOL is not None:
-            previous_variant = ACTIVE_PIPELINE_POOL.model_variant
-            print(
-                f"[runtime] Switching Lance model from {previous_variant} to {model_variant}.",
-                flush=True,
-            )
-            ACTIVE_PIPELINE_POOL.unload_all()
-            ACTIVE_PIPELINE_POOL = None
-        ACTIVE_PIPELINE_POOL = PipelinePool(gpu_ids, model_variant=model_variant)
-        ACTIVE_PIPELINE_POOL.initialize_all()
-        return ACTIVE_PIPELINE_POOL
-@spaces.GPU(size="large", duration=get_run_task_gpu_duration)
 def run_task(
     task: str,
     prompt: str,
@@ -3380,8 +3406,55 @@ def run_task(
     enable_frame_interpolation: bool,
 ):
     internal_task = normalize_task(task)
     if internal_task == TASK_T2V:
         num_frames = video_seconds_to_num_frames(num_frames)
     pipeline_pool = get_pipeline_pool(task)
     return pipeline_pool.generate(
         task=task,
@@ -3405,14 +3478,18 @@ def build_status_markdown() -> str:
     gpu_text = "unknown"
     concurrency = 1
     active_variant = "none"
     if ACTIVE_PIPELINE_POOL is not None:
         active_variant = ACTIVE_PIPELINE_POOL.model_variant
         gpu_text = ACTIVE_PIPELINE_POOL.gpu_summary
         concurrency = ACTIVE_PIPELINE_POOL.size
     return (
         f"**Status**  GPU: `{gpu_text}`  |  Max concurrency: `{concurrency}`  |  "
         f"Queue limit: `{QUEUE_MAX_SIZE}`  |  Active model: `{active_variant}`  |  "
-        f"Switch mode: `unload then load`"
     )
@@ -3604,6 +3681,17 @@ def build_demo() -> gr.Blocks:
                                 value=DEFAULT_FRAME_INTERPOLATION if RIFE_AVAILABLE else FRAME_INTERPOLATION_NO,
                                 elem_classes=["generation-control", "generation-two-line-label"],
                             )
                     with gr.Row(elem_classes=["generation-controls-row", "aspect-ratio-row"]) as aspect_ratio_row:
                         with gr.Column(elem_classes=["lance-control-field"]):
                             gr.HTML('<div class="lance-generation-label">Aspect Ratio</div>', elem_classes=["lance-label-html"])
@@ -3615,6 +3703,16 @@ def build_demo() -> gr.Blocks:
                                 value=DEFAULT_VIDEO_ASPECT_RATIO,
                                 elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
                             )
                     with gr.Row(elem_classes=["generation-controls-row", "output-resolution-row"], visible=False) as output_resolution_row:
                         with gr.Column(elem_classes=["lance-control-field"]):
                             gr.HTML('<div class="lance-generation-label">Output Resolution</div>', elem_classes=["lance-label-html"])
@@ -3627,27 +3725,6 @@ def build_demo() -> gr.Blocks:
                                 visible=False,
                                 elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
                             )
-                with gr.Row(elem_classes=["generation-controls-row", "video-duration-row"]) as video_duration_row:
-                    with gr.Column(elem_classes=["lance-control-field"]):
-                        gr.HTML(build_lance_label_html("Video Duration (seconds)", "lance-generation-label"), elem_classes=["lance-label-html"])
-                        num_frames = gr.Radio(
-                            label="Video Duration (seconds)",
-                            show_label=False,
-                            choices=get_video_duration_choices(),
-                            value=DEFAULT_VIDEO_DURATION_SECONDS,
-                            elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
-                        )
-                with gr.Row(elem_classes=["generation-controls-row", "video-resolution-row"]) as video_resolution_row:
-                    with gr.Column(elem_classes=["lance-control-field"]):
-                        gr.HTML(build_lance_label_html("Video Resolution", "lance-generation-label"), elem_classes=["lance-label-html"])
-                        resolution = gr.Dropdown(
-                            label="Video Resolution",
-                            show_label=False,
-                            choices=VIDEO_RESOLUTION_DISPLAY_CHOICES,
-                            value=DEFAULT_RESOLUTION,
-                            allow_custom_value=True,
-                            elem_classes=["generation-control"],
-                        )
                 height = gr.Number(value=DEFAULT_HEIGHT, precision=0, visible=False)
                 width = gr.Number(value=DEFAULT_WIDTH, precision=0, visible=False)

 from pathlib import Path
 from typing import Optional
+os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:128")
 try:
     import spaces
 except ImportError:  # pragma: no cover - keeps local CPU runs working
 IMAGE_TASKS = {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
 VIDEO_TASKS = {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
 EDIT_TASKS = {TASK_IMAGE_EDIT, TASK_VIDEO_EDIT}
+VIDEO_RESOLUTION_CHOICES = [DEFAULT_RESOLUTION]
 VIDEO_EDIT_RESOLUTION_CHOICES = [DEFAULT_VIDEO_EDIT_RESOLUTION]
 IMAGE_RESOLUTION_CHOICES = [DEFAULT_IMAGE_RESOLUTION]
 RESOLUTION_CHOICES = VIDEO_RESOLUTION_CHOICES + IMAGE_RESOLUTION_CHOICES
+VIDEO_RESOLUTION_DISPLAY_CHOICES = [("360p", "video_360p"), ("480p", "video_480p")]
 V2T_QA_SYSTEM_PROMPT = "View the video  attentively and provide a suitable answer to the posed question."
 I2T_QA_SYSTEM_PROMPT = "View the image attentively and provide a suitable answer to the posed question."
 def get_aspect_ratio_choices_for_task(task: str) -> list[tuple[str, str]]:
     """Get Aspect Ratio choices with default/recommended marker for the given task."""
     internal_task = normalize_task(task)
     default_ratio = DEFAULT_IMAGE_ASPECT_RATIO if internal_task in IMAGE_TASKS else DEFAULT_VIDEO_ASPECT_RATIO
     return [
+        (f"{ratio} (default)" if ratio == default_ratio else ratio, ratio)
         for ratio in ASPECT_RATIO_CHOICES
     ]
                     data_args=request_data_args,
                     inference_args=request_inference_args,
                 )
+                # Keep the allocator from fragmenting before the heavy forward pass.
+                clean_memory()
                 generate_start = time.perf_counter()
                 validate_on_fixed_batch(
                     fsdp_model=self.model,
     def gpu_summary(self) -> str:
         return ",".join(str(gpu_id) for gpu_id in self.gpu_ids)
+    @property
+    def is_initialized(self) -> bool:
+        return all(pipeline.initialized for pipeline in self.pipelines)
     def initialize_all(self) -> None:
+        if self.is_initialized:
+            return
         print(f"[startup][{self.model_variant}] Preparing parallel GPU preload: {self.gpu_ids}", flush=True)
         exceptions: list[Exception] = []
         with concurrent.futures.ThreadPoolExecutor(max_workers=self.size) as executor:
             self.release(pipeline)
+PIPELINE_POOLS: dict[str, PipelinePool] = {}
 ACTIVE_PIPELINE_POOL: Optional[PipelinePool] = None
 ACTIVE_POOL_LOCK = threading.Lock()
 QUEUE_MAX_SIZE = DEFAULT_QUEUE_SIZE
 ZERO_GPU_RUN_TASK_DURATION_SECONDS = get_zerogpu_duration_cap()
+def get_other_model_variant(model_variant: str) -> str:
+    normalized_variant = normalize_model_variant(model_variant)
+    return MODEL_VARIANT_IMAGE if normalized_variant == MODEL_VARIANT_VIDEO else MODEL_VARIANT_VIDEO
+def is_pipeline_pool_ready_for_variant(model_variant: str) -> bool:
+    normalized_variant = normalize_model_variant(model_variant)
+    with ACTIVE_POOL_LOCK:
+        pool = PIPELINE_POOLS.get(normalized_variant)
+        return bool(pool is not None and pool.is_initialized)
 def is_pipeline_pool_ready_for_task(task: str) -> bool:
+    return is_pipeline_pool_ready_for_variant(get_task_model_variant(task))
+def get_or_create_pipeline_pool(model_variant: str) -> PipelinePool:
+    if not torch.cuda.is_available():
+        raise RuntimeError(
+            "Lance inference requires a GPU. The Gradio UI can start on CPU, but generation is disabled "
+            "until GPU hardware is attached."
+        )
+    normalized_variant = normalize_model_variant(model_variant)
+    gpu_ids = parse_gpu_ids(os.getenv("LANCE_GPUS", DEFAULT_GPUS))
+    with ACTIVE_POOL_LOCK:
+        pool = PIPELINE_POOLS.get(normalized_variant)
+        if pool is None:
+            pool = PipelinePool(gpu_ids, model_variant=normalized_variant)
+            PIPELINE_POOLS[normalized_variant] = pool
+        return pool
+def ensure_pipeline_pool_ready(model_variant: str) -> PipelinePool:
+    pool = get_or_create_pipeline_pool(model_variant)
+    if not pool.is_initialized:
+        pool.initialize_all()
+    return pool
+def get_pipeline_pool(task: str) -> PipelinePool:
+    global ACTIVE_PIPELINE_POOL
+    model_variant = get_task_model_variant(task)
+    pool = ensure_pipeline_pool_ready(model_variant)
+    with ACTIVE_POOL_LOCK:
+        ACTIVE_PIPELINE_POOL = pool
+    return pool
 def finalize_zerogpu_duration(estimated_seconds: float, task: str) -> int:
     prompt_length = len((prompt or "").strip())
     has_video_input = bool((input_video or "").strip())
     has_image_input = bool((input_image or "").strip())
+    pool_ready = is_pipeline_pool_ready_for_task(internal_task)
     is_video_task = internal_task in {TASK_T2V, TASK_VIDEO_EDIT, TASK_X2T_VIDEO}
     is_image_task = internal_task in {TASK_T2I, TASK_IMAGE_EDIT, TASK_X2T_IMAGE}
     if internal_task == TASK_T2I:
+        return 90 if pool_ready else 150
     if internal_task == TASK_IMAGE_EDIT:
+        return 100 if pool_ready else 150
     if internal_task == TASK_X2T_IMAGE:
+        return 90 if pool_ready else 150
     if internal_task == TASK_X2T_VIDEO:
+        return 120 if pool_ready else 200
     if internal_task == TASK_VIDEO_EDIT:
+        base = 170 if pool_ready else 300
+        base += min(30 if pool_ready else 48, max(0, num_frames - 37) // 3)
+        base += 24 if enable_frame_interpolation else 0
+        base += 16 if has_video_input else 0
+        base += 10 if resolution == "video_480p" else 0
         return base
     if internal_task == TASK_T2V:
+        if pool_ready:
+            base = 130 if resolution == "video_360p" else 150
+            base += min(36, max(0, num_frames - 37) // 3)
+            base += 18 if enable_frame_interpolation else 0
+            base += min(12, prompt_length // 320)
+            return base
         base = 224 if resolution == "video_360p" else 264
         base += min(56, max(0, num_frames - 37) // 2)
         base += 28 if enable_frame_interpolation else 0
         return base
     if is_video_task:
+        base = 150 if pool_ready else 240
+        base += min(28 if pool_ready else 40, max(0, num_frames - 37) // 3)
+        base += 18 if enable_frame_interpolation else 0
         return base
     if is_image_task:
+        return 100 if pool_ready else 120
     return 160
     return finalize_zerogpu_duration(estimated_seconds, task)
 def run_task(
     task: str,
     prompt: str,
     enable_frame_interpolation: bool,
 ):
     internal_task = normalize_task(task)
+    if internal_task in UNDERSTANDING_TASKS and not prompt:
+        return None, None, "", "Please enter a question.", ""
+    if internal_task in {TASK_VIDEO_EDIT, TASK_X2T_VIDEO} and not input_video:
+        return None, None, "", "Please upload an input video.", ""
+    if internal_task in {TASK_IMAGE_EDIT, TASK_X2T_IMAGE} and not input_image:
+        return None, None, "", "Please upload an input image.", ""
+    if height <= 0 or width <= 0:
+        return None, None, "", "Height and width must be greater than 0.", ""
+    if num_frames <= 0:
+        return None, None, "", "The number of frames must be greater than 0.", ""
     if internal_task == TASK_T2V:
         num_frames = video_seconds_to_num_frames(num_frames)
+    normalized_resolution = normalize_resolution_for_backend(str(resolution), internal_task)
+    return run_task_gpu(
+        task=task,
+        prompt=prompt,
+        system_prompt=system_prompt,
+        input_video=input_video,
+        input_image=input_image,
+        height=height,
+        width=width,
+        num_frames=num_frames,
+        seed=seed,
+        resolution=normalized_resolution,
+        validation_num_timesteps=validation_num_timesteps,
+        validation_timestep_shift=validation_timestep_shift,
+        cfg_text_scale=cfg_text_scale,
+        enable_frame_interpolation=enable_frame_interpolation,
+    )
+@spaces.GPU(size="large", duration=get_run_task_gpu_duration)
+def run_task_gpu(
+    task: str,
+    prompt: str,
+    system_prompt: Optional[str],
+    input_video: Optional[str],
+    input_image: Optional[str],
+    height: int,
+    width: int,
+    num_frames: int,
+    seed: int,
+    resolution: str,
+    validation_num_timesteps: int,
+    validation_timestep_shift: float,
+    cfg_text_scale: float,
+    enable_frame_interpolation: bool,
+):
     pipeline_pool = get_pipeline_pool(task)
     return pipeline_pool.generate(
         task=task,
     gpu_text = "unknown"
     concurrency = 1
     active_variant = "none"
+    cached_variants = "none"
     if ACTIVE_PIPELINE_POOL is not None:
         active_variant = ACTIVE_PIPELINE_POOL.model_variant
         gpu_text = ACTIVE_PIPELINE_POOL.gpu_summary
         concurrency = ACTIVE_PIPELINE_POOL.size
+    with ACTIVE_POOL_LOCK:
+        if PIPELINE_POOLS:
+            cached_variants = ",".join(sorted(PIPELINE_POOLS.keys()))
     return (
         f"**Status**  GPU: `{gpu_text}`  |  Max concurrency: `{concurrency}`  |  "
         f"Queue limit: `{QUEUE_MAX_SIZE}`  |  Active model: `{active_variant}`  |  "
+        f"Cached variants: `{cached_variants}`"
     )
                                 value=DEFAULT_FRAME_INTERPOLATION if RIFE_AVAILABLE else FRAME_INTERPOLATION_NO,
                                 elem_classes=["generation-control", "generation-two-line-label"],
                             )
+                    with gr.Row(elem_classes=["generation-controls-row", "video-resolution-row"]) as video_resolution_row:
+                        with gr.Column(elem_classes=["lance-control-field"]):
+                            gr.HTML(build_lance_label_html("Video Resolution", "lance-generation-label"), elem_classes=["lance-label-html"])
+                            resolution = gr.Dropdown(
+                                label="Video Resolution",
+                                show_label=False,
+                                choices=VIDEO_RESOLUTION_DISPLAY_CHOICES,
+                                value=DEFAULT_RESOLUTION,
+                                allow_custom_value=True,
+                                elem_classes=["generation-control"],
+                            )
                     with gr.Row(elem_classes=["generation-controls-row", "aspect-ratio-row"]) as aspect_ratio_row:
                         with gr.Column(elem_classes=["lance-control-field"]):
                             gr.HTML('<div class="lance-generation-label">Aspect Ratio</div>', elem_classes=["lance-label-html"])
                                 value=DEFAULT_VIDEO_ASPECT_RATIO,
                                 elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
                             )
+                    with gr.Row(elem_classes=["generation-controls-row", "video-duration-row"]) as video_duration_row:
+                        with gr.Column(elem_classes=["lance-control-field"]):
+                            gr.HTML(build_lance_label_html("Video Duration (seconds)", "lance-generation-label"), elem_classes=["lance-label-html"])
+                            num_frames = gr.Radio(
+                                label="Video Duration (seconds)",
+                                show_label=False,
+                                choices=get_video_duration_choices(),
+                                value=DEFAULT_VIDEO_DURATION_SECONDS,
+                                elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
+                            )
                     with gr.Row(elem_classes=["generation-controls-row", "output-resolution-row"], visible=False) as output_resolution_row:
                         with gr.Column(elem_classes=["lance-control-field"]):
                             gr.HTML('<div class="lance-generation-label">Output Resolution</div>', elem_classes=["lance-label-html"])
                                 visible=False,
                                 elem_classes=["generation-control", "generation-choice-grid", "generation-two-line-label"],
                             )
                 height = gr.Number(value=DEFAULT_HEIGHT, precision=0, visible=False)
                 width = gr.Number(value=DEFAULT_WIDTH, precision=0, visible=False)

inference_lance.py CHANGED Viewed

@@ -344,33 +344,37 @@ def validate_on_fixed_batch(
             clean_memory()
         elif inference_args.task in UNDERSTANDING_TASKS:
-            generated_sequence_all, captions, index = fsdp_model.validation_video_to_text(
-                val_packed_text_ids=val_data["packed_text_ids"],
-                val_packed_text_indexes=val_data["packed_text_indexes"],
-                val_packed_position_ids=val_data["packed_position_ids"],
-                val_sample_N_target=val_data["sample_N_target"],
-                val_split_lens=val_data["split_lens"],
-                val_attn_modes=val_data["attn_modes"],
-                val_sample_lens=val_data["sample_lens"],
-                val_sample_type=val_data["sample_type"],
-                val_packed_vit_tokens=val_data["packed_vit_tokens"],
-                val_vit_video_grid_thw=val_data["vit_video_grid_thw"],
-                val_ce_loss_indexes=val_data["ce_loss_indexes"],
-                max_samples=training_args.validation_max_samples,
-                max_length=MAX_GENERATION_LENGTH,
-                device=device,
-                dtype=torch.bfloat16,
-                new_token_ids=new_token_ids,
-                pad_token_id=tokenizer.pad_token_id,
-                vocab_size=len(tokenizer),
-                caption=val_data.get("caption_cn", None),
-                tokenizer=tokenizer,
-                apply_chat_template=training_args.apply_chat_template,
-                apply_qwen_2_5_vl_pos_emb=training_args.apply_qwen_2_5_vl_pos_emb,
-                do_sample=False,
-                image_token_id=image_token_id,
-                index=val_data["index"],
-            )
             for i_val, generated_sequence in enumerate(generated_sequence_all):
                 cap = tokenizer.decode(generated_sequence[:, 0])
@@ -378,7 +382,7 @@ def validate_on_fixed_batch(
                 inference_args.prompt_data_dict[index] = f"{cap}"
                 del generated_sequence
-            del generated_sequence_all, captions
             clean_memory()
     del val_data
@@ -495,9 +499,9 @@ def main():
         training_args=training_args,
     )
     stage_start = time.perf_counter()
-    log_rank0("[startup] Casting Lance model to bf16 on CPU")
-    model = model.to(dtype=torch.bfloat16)
-    log_stage("Lance model bf16 cast", stage_start)
     # Setup tokenizer for model:
     stage_start = time.perf_counter()
@@ -538,10 +542,7 @@ def main():
     else: # HACK!!!
         assert model.language_model.get_input_embeddings().weight.data.data_ptr() != model.language_model.get_output_embeddings().weight.data.data_ptr(), 'tie_word_embeddings conflict'
-    stage_start = time.perf_counter()
-    log_rank0(f"[startup] Moving Lance model to GPU {DEVICE}")
-    model = model.to(device=DEVICE)
-    log_stage("Lance model move to GPU", stage_start)
     model.eval()
     if vae_model is not None and hasattr(vae_model, "eval"):
         vae_model.eval()

             clean_memory()
         elif inference_args.task in UNDERSTANDING_TASKS:
+            params = {
+                "val_packed_text_ids": val_data["packed_text_ids"],
+                "val_packed_text_indexes": val_data["packed_text_indexes"],
+                "val_packed_position_ids": val_data["packed_position_ids"],
+                "val_sample_N_target": val_data["sample_N_target"],
+                "val_split_lens": val_data["split_lens"],
+                "val_attn_modes": val_data["attn_modes"],
+                "val_sample_lens": val_data["sample_lens"],
+                "val_sample_type": val_data["sample_type"],
+                "val_packed_vit_tokens": val_data["packed_vit_tokens"],
+                "val_vit_video_grid_thw": val_data["vit_video_grid_thw"],
+                "val_ce_loss_indexes": val_data["ce_loss_indexes"],
+                "max_samples": training_args.validation_max_samples,
+                "max_length": MAX_GENERATION_LENGTH,
+                "device": device,
+                "dtype": torch.bfloat16,
+                "new_token_ids": new_token_ids,
+                "pad_token_id": tokenizer.pad_token_id,
+                "vocab_size": len(tokenizer),
+                "caption": val_data.get("caption_cn", None),
+                "tokenizer": tokenizer,
+                "apply_chat_template": training_args.apply_chat_template,
+                "apply_qwen_2_5_vl_pos_emb": training_args.apply_qwen_2_5_vl_pos_emb,
+                "do_sample": False,
+                "image_token_id": image_token_id,
+                "index": val_data["index"],
+            }
+            if inference_args.use_KVcache:
+                generated_sequence_all, captions, index = fsdp_model.validation_und_KVcache(**params)
+            else:
+                generated_sequence_all, captions, index = fsdp_model.validation_video_to_text(**params)
             for i_val, generated_sequence in enumerate(generated_sequence_all):
                 cap = tokenizer.decode(generated_sequence[:, 0])
                 inference_args.prompt_data_dict[index] = f"{cap}"
                 del generated_sequence
+            del generated_sequence_all, captions, params
             clean_memory()
     del val_data
         training_args=training_args,
     )
     stage_start = time.perf_counter()
+    log_rank0(f"[startup] Moving Lance model to GPU {DEVICE}")
+    model = model.to(DEVICE)
+    log_stage("Lance model move to GPU", stage_start)
     # Setup tokenizer for model:
     stage_start = time.perf_counter()
     else: # HACK!!!
         assert model.language_model.get_input_embeddings().weight.data.data_ptr() != model.language_model.get_output_embeddings().weight.data.data_ptr(), 'tie_word_embeddings conflict'
+    model = model.to(device=DEVICE, dtype=torch.bfloat16)
     model.eval()
     if vae_model is not None and hasattr(vae_model, "eval"):
         vae_model.eval()