Spaces:

bytedance-research
/

Lance

Running on Zero

App Files Community

ffy2000 committed 4 days ago

Commit

fddaf5e

1 Parent(s): b7a0fa0

Prepare Lance for Hugging Face Space

Browse files

Files changed (2) hide show

SPACE_DEPLOYMENT.md +11 -4
app.py +36 -75

SPACE_DEPLOYMENT.md CHANGED Viewed

@@ -1,13 +1,15 @@
 # Hugging Face Space Deployment
-This repository is prepared for a Docker-based Hugging Face Space.
 ## Runtime
-- Space SDK: Docker
 - Public port: `7860`
 - Entrypoint: `python app.py`
-- Recommended hardware: GPU, preferably `l40s` or stronger
 ## Model Assets
@@ -17,7 +19,8 @@ Default behavior:
 - Local checkout with `downloads/`: use `./downloads`
 - Hugging Face Space without local assets: download from `bytedance-research/Lance` into `/data/lance_models`
-- Video tasks preload `Lance_3B_Video` at startup.
 - Image tasks unload the active video model first, then load `Lance_3B`.
 - Switching back to a video task unloads `Lance_3B`, then reloads `Lance_3B_Video`.
@@ -33,6 +36,10 @@ Useful environment variables:
 - `LANCE_GPUS`: comma-separated GPU IDs, for example `0` or `0,1`
 - `LANCE_QUEUE_SIZE`: Gradio queue size
 - `LANCE_GRADIO_TMP_ROOT`: output and temporary file directory
 Expected model layout:

 # Hugging Face Space Deployment
+This repository is prepared for a Gradio-based Hugging Face Space with ZeroGPU.
 ## Runtime
+- Space SDK: Gradio
+- Space hardware: ZeroGPU
 - Public port: `7860`
 - Entrypoint: `python app.py`
+- Recommended use: ZeroGPU for request-scoped GPU allocation
+- In the Space settings UI, select `ZeroGPU` as the hardware target
 ## Model Assets
 - Local checkout with `downloads/`: use `./downloads`
 - Hugging Face Space without local assets: download from `bytedance-research/Lance` into `/data/lance_models`
+- Video tasks use the pre-fetched `Lance_3B_Video` assets when available.
+- Startup prefetch downloads the model snapshots on CPU so the first GPU request does not pay that cold-start cost.
 - Image tasks unload the active video model first, then load `Lance_3B`.
 - Switching back to a video task unloads `Lance_3B`, then reloads `Lance_3B_Video`.
 - `LANCE_GPUS`: comma-separated GPU IDs, for example `0` or `0,1`
 - `LANCE_QUEUE_SIZE`: Gradio queue size
 - `LANCE_GRADIO_TMP_ROOT`: output and temporary file directory
+- `LANCE_ZEROGPU_MAX_DURATION_SECONDS`: fixed `@spaces.GPU` duration request in seconds for all tasks (default: 300)
+- `LANCE_INSTALL_FLASH_ATTN_ON_STARTUP`: set to `1` to install flash-attn during startup instead of inside the GPU reservation when not running on a Space; on a Space the startup install is always attempted regardless of this flag
+- `LANCE_PREFETCH_MODEL_ASSETS`: set to `0` to skip CPU-side model prefetch at startup
+- `LANCE_PREFETCH_MODEL_VARIANTS`: comma-separated model variants to prefetch, for example `video,image`
 Expected model layout:

app.py CHANGED Viewed

@@ -2355,7 +2355,7 @@ def run_rife_interpolation(video_path: Path, device_id: int, exp: int = 1) -> tu
             "\n".join(
                 [
                     f"RIFE failed with exit code {exc.returncode}.",
-                    f"command=CUDA_VISIBLE_DEVICES={device_id} {' '.join(command)}",
                     exc.stdout.strip() if exc.stdout else "",
                     exc.stderr.strip() if exc.stderr else "",
                 ]
@@ -2367,7 +2367,7 @@ def run_rife_interpolation(video_path: Path, device_id: int, exp: int = 1) -> tu
     log = "\n".join(
         [
             "[rife] Frame interpolation finished.",
-            f"command=CUDA_VISIBLE_DEVICES={device_id} {' '.join(command)}",
             f"elapsed={elapsed:.2f}s",
             f"output={output_path}",
             completed.stdout.strip(),
@@ -2497,7 +2497,6 @@ class LanceT2VV2TPipeline:
                 return
             ensure_dirs()
-            ensure_flash_attn_installed()
             resolved_model_path = ensure_model_assets(self.model_variant)
             print(
                 f"[startup][gpu:{self.device}][{self.model_variant}] Using Lance model path: {resolved_model_path}",
@@ -3159,6 +3158,22 @@ def get_task_model_variant(task: str) -> str:
     return MODEL_VARIANT_IMAGE if internal_task in IMAGE_TASKS else MODEL_VARIANT_VIDEO
 def ensure_flash_attn_installed() -> None:
     try:
         from importlib.metadata import PackageNotFoundError, version as package_version
@@ -3188,60 +3203,31 @@ def ensure_flash_attn_installed() -> None:
     print(f"[startup] flash-attn {DEFAULT_FLASH_ATTN_VERSION} installed successfully.", flush=True)
-def get_env_int(name: str, default: int) -> int:
-    """Read an integer environment variable, falling back safely on invalid values."""
-    try:
-        return int(os.getenv(name, str(default)))
-    except (TypeError, ValueError):
-        return default
-def get_env_float(name: str, default: float) -> float:
-    """Read a float environment variable, falling back safely on invalid values."""
-    try:
-        return float(os.getenv(name, str(default)))
-    except (TypeError, ValueError):
-        return default
 def get_zerogpu_duration_cap() -> int:
-    """Maximum duration requested from ZeroGPU.
     The duration value is a ZeroGPU reservation/timeout hint. Shorter values can
     improve queue priority and reduce wasted quota, but the value must still cover
     model warm-up plus inference. Override per deployment when needed:
         LANCE_ZEROGPU_MAX_DURATION_SECONDS=300
     """
-    return max(1, get_env_int("LANCE_ZEROGPU_MAX_DURATION_SECONDS", 240))
 def clamp_zerogpu_duration(seconds: int) -> int:
     return max(1, min(int(seconds), get_zerogpu_duration_cap()))
-def is_pipeline_pool_ready_for_task(task: str) -> bool:
-    """Return True when the required model variant is already resident on GPU.
-    ZeroGPU evaluates the dynamic duration before calling the decorated function.
-    If the model is already loaded, we can request a shorter warm-run duration;
-    otherwise we reserve extra time for the first request after startup or model
-    switching. This does not change the UI layout or user-facing controls.
-    """
-    try:
-        pool = ACTIVE_PIPELINE_POOL
-        if pool is None or pool.model_variant != get_task_model_variant(task):
-            return False
-        return all(getattr(pipeline, "initialized", False) for pipeline in pool.pipelines)
-    except Exception:
-        return False
 def finalize_zerogpu_duration(estimated_seconds: float, task: str) -> int:
-    """Add configurable safety margin and clamp the requested ZeroGPU duration."""
-    margin = max(1.0, get_env_float("LANCE_ZEROGPU_DURATION_MARGIN", 1.10))
-    if not is_pipeline_pool_ready_for_task(task):
-        estimated_seconds += max(0, get_env_int("LANCE_ZEROGPU_COLD_START_BUFFER_SECONDS", 120))
-    return clamp_zerogpu_duration(int(estimated_seconds * margin + 0.999))
 def get_run_task_gpu_duration(
@@ -3260,39 +3246,8 @@ def get_run_task_gpu_duration(
     cfg_text_scale: float,
     enable_frame_interpolation: bool,
 ) -> int:
-    """Return a dynamic ZeroGPU reservation duration.
-    The previous implementation used one conservative estimate for both cold and
-    warm runs. This version keeps the first request safe, then asks for shorter
-    durations once the matching Lance model is already loaded, which reduces
-    wasted ZeroGPU quota and improves queue priority without changing the UI.
-    """
-    internal_task = normalize_task(task)
-    timesteps = max(1, int(validation_num_timesteps or DEFAULT_TIMESTEPS))
-    backend_resolution = normalize_resolution_for_backend(str(resolution), internal_task)
-    resolution_multiplier = 1.28 if backend_resolution == "video_480p" else 1.0
-    timestep_extra = max(0, timesteps - 20)
-    if internal_task == TASK_T2V:
-        requested_seconds = max(1, int(num_frames or DEFAULT_VIDEO_DURATION_SECONDS))
-        estimate = 35 + requested_seconds * 10 + timestep_extra * 1.5
-        if normalize_frame_interpolation(enable_frame_interpolation):
-            estimate += min(32, 8 + requested_seconds * 3)
-        return finalize_zerogpu_duration(estimate * resolution_multiplier, internal_task)
-    if internal_task == TASK_VIDEO_EDIT:
-        estimate = 85 + timestep_extra * 1.5
-        if normalize_frame_interpolation(enable_frame_interpolation):
-            estimate += 22
-        return finalize_zerogpu_duration(estimate * resolution_multiplier, internal_task)
-    if internal_task == TASK_X2T_VIDEO:
-        return finalize_zerogpu_duration(32, internal_task)
-    if internal_task == TASK_T2I:
-        return finalize_zerogpu_duration(58, internal_task)
-    if internal_task == TASK_IMAGE_EDIT:
-        return finalize_zerogpu_duration(70, internal_task)
-    return finalize_zerogpu_duration(28, internal_task)
 def get_pipeline_pool(task: str) -> PipelinePool:
@@ -3322,7 +3277,7 @@ def get_pipeline_pool(task: str) -> PipelinePool:
         return ACTIVE_PIPELINE_POOL
-@spaces.GPU(duration=get_run_task_gpu_duration)
 def run_task(
     task: str,
     prompt: str,
@@ -3915,6 +3870,12 @@ def prefetch_model_assets_before_launch() -> None:
     the visible UI unchanged. Set LANCE_PREFETCH_MODEL_ASSETS=0 to skip this at
     Space startup, or LANCE_PREFETCH_MODEL_VARIANTS=video to prefetch less.
     """
     if not env_flag("LANCE_PREFETCH_MODEL_ASSETS", running_on_space()):
         print("[startup] Model asset prefetch disabled.", flush=True)
         return
@@ -3952,7 +3913,7 @@ if __name__ == "__main__":
     QUEUE_MAX_SIZE = args.queue_size
     prefetch_model_assets_before_launch()
     print(
-        "[startup] Skipping GPU model preload. UI will launch first, and Lance weights will be loaded lazily inside ZeroGPU inference calls.",
         flush=True,
     )
     concurrency_limit = 1

             "\n".join(
                 [
                     f"RIFE failed with exit code {exc.returncode}.",
+                    f"command=CUDA_VISIBLE_DEVICES={device_id} " + " ".join(command),
                     exc.stdout.strip() if exc.stdout else "",
                     exc.stderr.strip() if exc.stderr else "",
                 ]
     log = "\n".join(
         [
             "[rife] Frame interpolation finished.",
+            f"command=CUDA_VISIBLE_DEVICES={device_id} " + " ".join(command),
             f"elapsed={elapsed:.2f}s",
             f"output={output_path}",
             completed.stdout.strip(),
                 return
             ensure_dirs()
             resolved_model_path = ensure_model_assets(self.model_variant)
             print(
                 f"[startup][gpu:{self.device}][{self.model_variant}] Using Lance model path: {resolved_model_path}",
     return MODEL_VARIANT_IMAGE if internal_task in IMAGE_TASKS else MODEL_VARIANT_VIDEO
+def get_env_int(name: str, default: int) -> int:
+    """Read an integer environment variable, falling back safely on invalid values."""
+    try:
+        return int(os.getenv(name, str(default)))
+    except (TypeError, ValueError):
+        return default
+def get_env_float(name: str, default: float) -> float:
+    """Read a float environment variable, falling back safely on invalid values."""
+    try:
+        return float(os.getenv(name, str(default)))
+    except (TypeError, ValueError):
+        return default
 def ensure_flash_attn_installed() -> None:
     try:
         from importlib.metadata import PackageNotFoundError, version as package_version
     print(f"[startup] flash-attn {DEFAULT_FLASH_ATTN_VERSION} installed successfully.", flush=True)
 def get_zerogpu_duration_cap() -> int:
+    """Fixed duration requested from ZeroGPU for each run.
     The duration value is a ZeroGPU reservation/timeout hint. Shorter values can
     improve queue priority and reduce wasted quota, but the value must still cover
     model warm-up plus inference. Override per deployment when needed:
         LANCE_ZEROGPU_MAX_DURATION_SECONDS=300
     """
+    return max(1, get_env_int("LANCE_ZEROGPU_MAX_DURATION_SECONDS", 300))
 def clamp_zerogpu_duration(seconds: int) -> int:
     return max(1, min(int(seconds), get_zerogpu_duration_cap()))
+ZERO_GPU_RUN_TASK_DURATION_SECONDS = get_zerogpu_duration_cap()
+def is_pipeline_pool_ready_for_task(task: str) -> bool:
+    """Retained for compatibility with earlier duration logic."""
+    return False
 def finalize_zerogpu_duration(estimated_seconds: float, task: str) -> int:
+    """Retained for compatibility with earlier duration logic."""
+    return clamp_zerogpu_duration(ZERO_GPU_RUN_TASK_DURATION_SECONDS)
 def get_run_task_gpu_duration(
     cfg_text_scale: float,
     enable_frame_interpolation: bool,
 ) -> int:
+    """Return a fixed ZeroGPU reservation duration for compatibility."""
+    return ZERO_GPU_RUN_TASK_DURATION_SECONDS
 def get_pipeline_pool(task: str) -> PipelinePool:
         return ACTIVE_PIPELINE_POOL
+@spaces.GPU(size="large", duration=ZERO_GPU_RUN_TASK_DURATION_SECONDS)
 def run_task(
     task: str,
     prompt: str,
     the visible UI unchanged. Set LANCE_PREFETCH_MODEL_ASSETS=0 to skip this at
     Space startup, or LANCE_PREFETCH_MODEL_VARIANTS=video to prefetch less.
     """
+    if running_on_space() or env_flag("LANCE_INSTALL_FLASH_ATTN_ON_STARTUP", False):
+        try:
+            ensure_flash_attn_installed()
+        except Exception as exc:
+            print(f"[startup] flash-attn startup install failed and will be retried lazily during inference: {exc}", flush=True)
     if not env_flag("LANCE_PREFETCH_MODEL_ASSETS", running_on_space()):
         print("[startup] Model asset prefetch disabled.", flush=True)
         return
     QUEUE_MAX_SIZE = args.queue_size
     prefetch_model_assets_before_launch()
     print(
+        "[startup] Skipping GPU model preload. UI will launch first, and Lance weights will be prefetched on CPU before ZeroGPU inference. If that prefetch fails, inference will fall back to lazy loading.",
         flush=True,
     )
     concurrency_limit = 1