ffy2000 committed on
Commit
fddaf5e
·
1 Parent(s): b7a0fa0

Prepare Lance for Hugging Face Space

Browse files
Files changed (2) hide show
  1. SPACE_DEPLOYMENT.md +11 -4
  2. app.py +36 -75
SPACE_DEPLOYMENT.md CHANGED
@@ -1,13 +1,15 @@
1
  # Hugging Face Space Deployment
2
 
3
- This repository is prepared for a Docker-based Hugging Face Space.
4
 
5
  ## Runtime
6
 
7
- - Space SDK: Docker
 
8
  - Public port: `7860`
9
  - Entrypoint: `python app.py`
10
- - Recommended hardware: GPU, preferably `l40s` or stronger
 
11
 
12
  ## Model Assets
13
 
@@ -17,7 +19,8 @@ Default behavior:
17
 
18
  - Local checkout with `downloads/`: use `./downloads`
19
  - Hugging Face Space without local assets: download from `bytedance-research/Lance` into `/data/lance_models`
20
- - Video tasks preload `Lance_3B_Video` at startup.
 
21
  - Image tasks unload the active video model first, then load `Lance_3B`.
22
  - Switching back to a video task unloads `Lance_3B`, then reloads `Lance_3B_Video`.
23
 
@@ -33,6 +36,10 @@ Useful environment variables:
33
  - `LANCE_GPUS`: comma-separated GPU IDs, for example `0` or `0,1`
34
  - `LANCE_QUEUE_SIZE`: Gradio queue size
35
  - `LANCE_GRADIO_TMP_ROOT`: output and temporary file directory
 
 
 
 
36
 
37
  Expected model layout:
38
 
 
1
  # Hugging Face Space Deployment
2
 
3
+ This repository is prepared for a Gradio-based Hugging Face Space with ZeroGPU.
4
 
5
  ## Runtime
6
 
7
+ - Space SDK: Gradio
8
+ - Space hardware: ZeroGPU
9
  - Public port: `7860`
10
  - Entrypoint: `python app.py`
11
+ - Recommended use: ZeroGPU for request-scoped GPU allocation
12
+ - In the Space settings UI, select `ZeroGPU` as the hardware target
13
 
14
  ## Model Assets
15
 
 
19
 
20
  - Local checkout with `downloads/`: use `./downloads`
21
  - Hugging Face Space without local assets: download from `bytedance-research/Lance` into `/data/lance_models`
22
+ - Video tasks use the pre-fetched `Lance_3B_Video` assets when available.
23
+ - Startup prefetch downloads the model snapshots on CPU so the first GPU request does not pay that cold-start cost.
24
  - Image tasks unload the active video model first, then load `Lance_3B`.
25
  - Switching back to a video task unloads `Lance_3B`, then reloads `Lance_3B_Video`.
26
 
 
36
  - `LANCE_GPUS`: comma-separated GPU IDs, for example `0` or `0,1`
37
  - `LANCE_QUEUE_SIZE`: Gradio queue size
38
  - `LANCE_GRADIO_TMP_ROOT`: output and temporary file directory
39
+ - `LANCE_ZEROGPU_MAX_DURATION_SECONDS`: fixed `@spaces.GPU` duration request in seconds for all tasks (default: 300)
40
+ - `LANCE_INSTALL_FLASH_ATTN_ON_STARTUP`: set to `1` to install flash-attn during startup (outside the GPU reservation) even when not running on a Space; on a Space the startup install is always attempted
41
+ - `LANCE_PREFETCH_MODEL_ASSETS`: set to `0` to skip CPU-side model prefetch at startup
42
+ - `LANCE_PREFETCH_MODEL_VARIANTS`: comma-separated model variants to prefetch, for example `video,image`
43
 
44
  Expected model layout:
45
 
app.py CHANGED
@@ -2355,7 +2355,7 @@ def run_rife_interpolation(video_path: Path, device_id: int, exp: int = 1) -> tu
2355
  "\n".join(
2356
  [
2357
  f"RIFE failed with exit code {exc.returncode}.",
2358
- f"command=CUDA_VISIBLE_DEVICES={device_id} {' '.join(command)}",
2359
  exc.stdout.strip() if exc.stdout else "",
2360
  exc.stderr.strip() if exc.stderr else "",
2361
  ]
@@ -2367,7 +2367,7 @@ def run_rife_interpolation(video_path: Path, device_id: int, exp: int = 1) -> tu
2367
  log = "\n".join(
2368
  [
2369
  "[rife] Frame interpolation finished.",
2370
- f"command=CUDA_VISIBLE_DEVICES={device_id} {' '.join(command)}",
2371
  f"elapsed={elapsed:.2f}s",
2372
  f"output={output_path}",
2373
  completed.stdout.strip(),
@@ -2497,7 +2497,6 @@ class LanceT2VV2TPipeline:
2497
  return
2498
 
2499
  ensure_dirs()
2500
- ensure_flash_attn_installed()
2501
  resolved_model_path = ensure_model_assets(self.model_variant)
2502
  print(
2503
  f"[startup][gpu:{self.device}][{self.model_variant}] Using Lance model path: {resolved_model_path}",
@@ -3159,6 +3158,22 @@ def get_task_model_variant(task: str) -> str:
3159
  return MODEL_VARIANT_IMAGE if internal_task in IMAGE_TASKS else MODEL_VARIANT_VIDEO
3160
 
3161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3162
  def ensure_flash_attn_installed() -> None:
3163
  try:
3164
  from importlib.metadata import PackageNotFoundError, version as package_version
@@ -3188,60 +3203,31 @@ def ensure_flash_attn_installed() -> None:
3188
  print(f"[startup] flash-attn {DEFAULT_FLASH_ATTN_VERSION} installed successfully.", flush=True)
3189
 
3190
 
3191
- def get_env_int(name: str, default: int) -> int:
3192
- """Read an integer environment variable, falling back safely on invalid values."""
3193
- try:
3194
- return int(os.getenv(name, str(default)))
3195
- except (TypeError, ValueError):
3196
- return default
3197
-
3198
-
3199
- def get_env_float(name: str, default: float) -> float:
3200
- """Read a float environment variable, falling back safely on invalid values."""
3201
- try:
3202
- return float(os.getenv(name, str(default)))
3203
- except (TypeError, ValueError):
3204
- return default
3205
-
3206
-
3207
  def get_zerogpu_duration_cap() -> int:
3208
- """Maximum duration requested from ZeroGPU.
3209
 
3210
  The duration value is a ZeroGPU reservation/timeout hint. Shorter values can
3211
  improve queue priority and reduce wasted quota, but the value must still cover
3212
  model warm-up plus inference. Override per deployment when needed:
3213
  LANCE_ZEROGPU_MAX_DURATION_SECONDS=300
3214
  """
3215
- return max(1, get_env_int("LANCE_ZEROGPU_MAX_DURATION_SECONDS", 240))
3216
 
3217
 
3218
  def clamp_zerogpu_duration(seconds: int) -> int:
3219
  return max(1, min(int(seconds), get_zerogpu_duration_cap()))
3220
 
3221
 
3222
- def is_pipeline_pool_ready_for_task(task: str) -> bool:
3223
- """Return True when the required model variant is already resident on GPU.
3224
 
3225
- ZeroGPU evaluates the dynamic duration before calling the decorated function.
3226
- If the model is already loaded, we can request a shorter warm-run duration;
3227
- otherwise we reserve extra time for the first request after startup or model
3228
- switching. This does not change the UI layout or user-facing controls.
3229
- """
3230
- try:
3231
- pool = ACTIVE_PIPELINE_POOL
3232
- if pool is None or pool.model_variant != get_task_model_variant(task):
3233
- return False
3234
- return all(getattr(pipeline, "initialized", False) for pipeline in pool.pipelines)
3235
- except Exception:
3236
- return False
3237
 
3238
 
3239
  def finalize_zerogpu_duration(estimated_seconds: float, task: str) -> int:
3240
- """Add configurable safety margin and clamp the requested ZeroGPU duration."""
3241
- margin = max(1.0, get_env_float("LANCE_ZEROGPU_DURATION_MARGIN", 1.10))
3242
- if not is_pipeline_pool_ready_for_task(task):
3243
- estimated_seconds += max(0, get_env_int("LANCE_ZEROGPU_COLD_START_BUFFER_SECONDS", 120))
3244
- return clamp_zerogpu_duration(int(estimated_seconds * margin + 0.999))
3245
 
3246
 
3247
  def get_run_task_gpu_duration(
@@ -3260,39 +3246,8 @@ def get_run_task_gpu_duration(
3260
  cfg_text_scale: float,
3261
  enable_frame_interpolation: bool,
3262
  ) -> int:
3263
- """Return a dynamic ZeroGPU reservation duration.
3264
-
3265
- The previous implementation used one conservative estimate for both cold and
3266
- warm runs. This version keeps the first request safe, then asks for shorter
3267
- durations once the matching Lance model is already loaded, which reduces
3268
- wasted ZeroGPU quota and improves queue priority without changing the UI.
3269
- """
3270
- internal_task = normalize_task(task)
3271
- timesteps = max(1, int(validation_num_timesteps or DEFAULT_TIMESTEPS))
3272
- backend_resolution = normalize_resolution_for_backend(str(resolution), internal_task)
3273
- resolution_multiplier = 1.28 if backend_resolution == "video_480p" else 1.0
3274
- timestep_extra = max(0, timesteps - 20)
3275
-
3276
- if internal_task == TASK_T2V:
3277
- requested_seconds = max(1, int(num_frames or DEFAULT_VIDEO_DURATION_SECONDS))
3278
- estimate = 35 + requested_seconds * 10 + timestep_extra * 1.5
3279
- if normalize_frame_interpolation(enable_frame_interpolation):
3280
- estimate += min(32, 8 + requested_seconds * 3)
3281
- return finalize_zerogpu_duration(estimate * resolution_multiplier, internal_task)
3282
-
3283
- if internal_task == TASK_VIDEO_EDIT:
3284
- estimate = 85 + timestep_extra * 1.5
3285
- if normalize_frame_interpolation(enable_frame_interpolation):
3286
- estimate += 22
3287
- return finalize_zerogpu_duration(estimate * resolution_multiplier, internal_task)
3288
-
3289
- if internal_task == TASK_X2T_VIDEO:
3290
- return finalize_zerogpu_duration(32, internal_task)
3291
- if internal_task == TASK_T2I:
3292
- return finalize_zerogpu_duration(58, internal_task)
3293
- if internal_task == TASK_IMAGE_EDIT:
3294
- return finalize_zerogpu_duration(70, internal_task)
3295
- return finalize_zerogpu_duration(28, internal_task)
3296
 
3297
 
3298
  def get_pipeline_pool(task: str) -> PipelinePool:
@@ -3322,7 +3277,7 @@ def get_pipeline_pool(task: str) -> PipelinePool:
3322
  return ACTIVE_PIPELINE_POOL
3323
 
3324
 
3325
- @spaces.GPU(duration=get_run_task_gpu_duration)
3326
  def run_task(
3327
  task: str,
3328
  prompt: str,
@@ -3915,6 +3870,12 @@ def prefetch_model_assets_before_launch() -> None:
3915
  the visible UI unchanged. Set LANCE_PREFETCH_MODEL_ASSETS=0 to skip this at
3916
  Space startup, or LANCE_PREFETCH_MODEL_VARIANTS=video to prefetch less.
3917
  """
 
 
 
 
 
 
3918
  if not env_flag("LANCE_PREFETCH_MODEL_ASSETS", running_on_space()):
3919
  print("[startup] Model asset prefetch disabled.", flush=True)
3920
  return
@@ -3952,7 +3913,7 @@ if __name__ == "__main__":
3952
  QUEUE_MAX_SIZE = args.queue_size
3953
  prefetch_model_assets_before_launch()
3954
  print(
3955
- "[startup] Skipping GPU model preload. UI will launch first, and Lance weights will be loaded lazily inside ZeroGPU inference calls.",
3956
  flush=True,
3957
  )
3958
  concurrency_limit = 1
 
2355
  "\n".join(
2356
  [
2357
  f"RIFE failed with exit code {exc.returncode}.",
2358
+ f"command=CUDA_VISIBLE_DEVICES={device_id} " + " ".join(command),
2359
  exc.stdout.strip() if exc.stdout else "",
2360
  exc.stderr.strip() if exc.stderr else "",
2361
  ]
 
2367
  log = "\n".join(
2368
  [
2369
  "[rife] Frame interpolation finished.",
2370
+ f"command=CUDA_VISIBLE_DEVICES={device_id} " + " ".join(command),
2371
  f"elapsed={elapsed:.2f}s",
2372
  f"output={output_path}",
2373
  completed.stdout.strip(),
 
2497
  return
2498
 
2499
  ensure_dirs()
 
2500
  resolved_model_path = ensure_model_assets(self.model_variant)
2501
  print(
2502
  f"[startup][gpu:{self.device}][{self.model_variant}] Using Lance model path: {resolved_model_path}",
 
3158
  return MODEL_VARIANT_IMAGE if internal_task in IMAGE_TASKS else MODEL_VARIANT_VIDEO
3159
 
3160
 
3161
+ def get_env_int(name: str, default: int) -> int:
3162
+ """Read an integer environment variable, falling back safely on invalid values."""
3163
+ try:
3164
+ return int(os.getenv(name, str(default)))
3165
+ except (TypeError, ValueError):
3166
+ return default
3167
+
3168
+
3169
+ def get_env_float(name: str, default: float) -> float:
3170
+ """Read a float environment variable, falling back safely on invalid values."""
3171
+ try:
3172
+ return float(os.getenv(name, str(default)))
3173
+ except (TypeError, ValueError):
3174
+ return default
3175
+
3176
+
3177
  def ensure_flash_attn_installed() -> None:
3178
  try:
3179
  from importlib.metadata import PackageNotFoundError, version as package_version
 
3203
  print(f"[startup] flash-attn {DEFAULT_FLASH_ATTN_VERSION} installed successfully.", flush=True)
3204
 
3205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3206
  def get_zerogpu_duration_cap() -> int:
3207
+ """Fixed duration requested from ZeroGPU for each run.
3208
 
3209
  The duration value is a ZeroGPU reservation/timeout hint. Shorter values can
3210
  improve queue priority and reduce wasted quota, but the value must still cover
3211
  model warm-up plus inference. Override per deployment when needed:
3212
  LANCE_ZEROGPU_MAX_DURATION_SECONDS=300
3213
  """
3214
+ return max(1, get_env_int("LANCE_ZEROGPU_MAX_DURATION_SECONDS", 300))
3215
 
3216
 
3217
  def clamp_zerogpu_duration(seconds: int) -> int:
3218
  return max(1, min(int(seconds), get_zerogpu_duration_cap()))
3219
 
3220
 
3221
+ ZERO_GPU_RUN_TASK_DURATION_SECONDS = get_zerogpu_duration_cap()
 
3222
 
3223
+ def is_pipeline_pool_ready_for_task(task: str) -> bool:
3224
+ """Retained for compatibility with earlier duration logic."""
3225
+ return False
 
 
 
 
 
 
 
 
 
3226
 
3227
 
3228
  def finalize_zerogpu_duration(estimated_seconds: float, task: str) -> int:
3229
+ """Retained for compatibility with earlier duration logic."""
3230
+ return clamp_zerogpu_duration(ZERO_GPU_RUN_TASK_DURATION_SECONDS)
 
 
 
3231
 
3232
 
3233
  def get_run_task_gpu_duration(
 
3246
  cfg_text_scale: float,
3247
  enable_frame_interpolation: bool,
3248
  ) -> int:
3249
+ """Return a fixed ZeroGPU reservation duration for compatibility."""
3250
+ return ZERO_GPU_RUN_TASK_DURATION_SECONDS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3251
 
3252
 
3253
  def get_pipeline_pool(task: str) -> PipelinePool:
 
3277
  return ACTIVE_PIPELINE_POOL
3278
 
3279
 
3280
+ @spaces.GPU(size="large", duration=ZERO_GPU_RUN_TASK_DURATION_SECONDS)
3281
  def run_task(
3282
  task: str,
3283
  prompt: str,
 
3870
  the visible UI unchanged. Set LANCE_PREFETCH_MODEL_ASSETS=0 to skip this at
3871
  Space startup, or LANCE_PREFETCH_MODEL_VARIANTS=video to prefetch less.
3872
  """
3873
+ if running_on_space() or env_flag("LANCE_INSTALL_FLASH_ATTN_ON_STARTUP", False):
3874
+ try:
3875
+ ensure_flash_attn_installed()
3876
+ except Exception as exc:
3877
+ print(f"[startup] flash-attn startup install failed and will be retried lazily during inference: {exc}", flush=True)
3878
+
3879
  if not env_flag("LANCE_PREFETCH_MODEL_ASSETS", running_on_space()):
3880
  print("[startup] Model asset prefetch disabled.", flush=True)
3881
  return
 
3913
  QUEUE_MAX_SIZE = args.queue_size
3914
  prefetch_model_assets_before_launch()
3915
  print(
3916
+ "[startup] Skipping GPU model preload. UI will launch first, and Lance weights will be prefetched on CPU before ZeroGPU inference. If that prefetch fails, inference will fall back to lazy loading.",
3917
  flush=True,
3918
  )
3919
  concurrency_limit = 1