Spaces:
Running on Zero
Running on Zero
Prepare Lance for Hugging Face Space
Browse files
- SPACE_DEPLOYMENT.md +11 -4
- app.py +36 -75
SPACE_DEPLOYMENT.md
CHANGED
|
@@ -1,13 +1,15 @@
|
|
| 1 |
# Hugging Face Space Deployment
|
| 2 |
|
| 3 |
-
This repository is prepared for a
|
| 4 |
|
| 5 |
## Runtime
|
| 6 |
|
| 7 |
-
- Space SDK:
|
|
|
|
| 8 |
- Public port: `7860`
|
| 9 |
- Entrypoint: `python app.py`
|
| 10 |
-
- Recommended
|
|
|
|
| 11 |
|
| 12 |
## Model Assets
|
| 13 |
|
|
@@ -17,7 +19,8 @@ Default behavior:
|
|
| 17 |
|
| 18 |
- Local checkout with `downloads/`: use `./downloads`
|
| 19 |
- Hugging Face Space without local assets: download from `bytedance-research/Lance` into `/data/lance_models`
|
| 20 |
-
- Video tasks
|
|
|
|
| 21 |
- Image tasks unload the active video model first, then load `Lance_3B`.
|
| 22 |
- Switching back to a video task unloads `Lance_3B`, then reloads `Lance_3B_Video`.
|
| 23 |
|
|
@@ -33,6 +36,10 @@ Useful environment variables:
|
|
| 33 |
- `LANCE_GPUS`: comma-separated GPU IDs, for example `0` or `0,1`
|
| 34 |
- `LANCE_QUEUE_SIZE`: Gradio queue size
|
| 35 |
- `LANCE_GRADIO_TMP_ROOT`: output and temporary file directory
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
Expected model layout:
|
| 38 |
|
|
|
|
| 1 |
# Hugging Face Space Deployment
|
| 2 |
|
| 3 |
+
This repository is prepared for a Gradio-based Hugging Face Space with ZeroGPU.
|
| 4 |
|
| 5 |
## Runtime
|
| 6 |
|
| 7 |
+
- Space SDK: Gradio
|
| 8 |
+
- Space hardware: ZeroGPU
|
| 9 |
- Public port: `7860`
|
| 10 |
- Entrypoint: `python app.py`
|
| 11 |
+
- Recommended use: ZeroGPU for request-scoped GPU allocation
|
| 12 |
+
- In the Space settings UI, select `ZeroGPU` as the hardware target
|
| 13 |
|
| 14 |
## Model Assets
|
| 15 |
|
|
|
|
| 19 |
|
| 20 |
- Local checkout with `downloads/`: use `./downloads`
|
| 21 |
- Hugging Face Space without local assets: download from `bytedance-research/Lance` into `/data/lance_models`
|
| 22 |
+
- Video tasks use the pre-fetched `Lance_3B_Video` assets when available.
|
| 23 |
+
- Startup prefetch downloads the model snapshots on CPU so the first GPU request does not pay that cold-start cost.
|
| 24 |
- Image tasks unload the active video model first, then load `Lance_3B`.
|
| 25 |
- Switching back to a video task unloads `Lance_3B`, then reloads `Lance_3B_Video`.
|
| 26 |
|
|
|
|
| 36 |
- `LANCE_GPUS`: comma-separated GPU IDs, for example `0` or `0,1`
|
| 37 |
- `LANCE_QUEUE_SIZE`: Gradio queue size
|
| 38 |
- `LANCE_GRADIO_TMP_ROOT`: output and temporary file directory
|
| 39 |
+
- `LANCE_ZEROGPU_MAX_DURATION_SECONDS`: fixed `@spaces.GPU` duration request in seconds for all tasks (default: 300; a non-integer value falls back to the default, and values below 1 are raised to 1)
|
| 40 |
+
- `LANCE_INSTALL_FLASH_ATTN_ON_STARTUP`: set to `1` to install flash-attn during startup instead of inside the GPU reservation; when running on a Hugging Face Space the startup install is attempted regardless of this variable
|
| 41 |
+
- `LANCE_PREFETCH_MODEL_ASSETS`: set to `0` to skip CPU-side model prefetch at startup
|
| 42 |
+
- `LANCE_PREFETCH_MODEL_VARIANTS`: comma-separated model variants to prefetch, for example `video,image`
|
| 43 |
|
| 44 |
Expected model layout:
|
| 45 |
|
app.py
CHANGED
|
@@ -2355,7 +2355,7 @@ def run_rife_interpolation(video_path: Path, device_id: int, exp: int = 1) -> tu
|
|
| 2355 |
"\n".join(
|
| 2356 |
[
|
| 2357 |
f"RIFE failed with exit code {exc.returncode}.",
|
| 2358 |
-
f"command=CUDA_VISIBLE_DEVICES={device_id}
|
| 2359 |
exc.stdout.strip() if exc.stdout else "",
|
| 2360 |
exc.stderr.strip() if exc.stderr else "",
|
| 2361 |
]
|
|
@@ -2367,7 +2367,7 @@ def run_rife_interpolation(video_path: Path, device_id: int, exp: int = 1) -> tu
|
|
| 2367 |
log = "\n".join(
|
| 2368 |
[
|
| 2369 |
"[rife] Frame interpolation finished.",
|
| 2370 |
-
f"command=CUDA_VISIBLE_DEVICES={device_id}
|
| 2371 |
f"elapsed={elapsed:.2f}s",
|
| 2372 |
f"output={output_path}",
|
| 2373 |
completed.stdout.strip(),
|
|
@@ -2497,7 +2497,6 @@ class LanceT2VV2TPipeline:
|
|
| 2497 |
return
|
| 2498 |
|
| 2499 |
ensure_dirs()
|
| 2500 |
-
ensure_flash_attn_installed()
|
| 2501 |
resolved_model_path = ensure_model_assets(self.model_variant)
|
| 2502 |
print(
|
| 2503 |
f"[startup][gpu:{self.device}][{self.model_variant}] Using Lance model path: {resolved_model_path}",
|
|
@@ -3159,6 +3158,22 @@ def get_task_model_variant(task: str) -> str:
|
|
| 3159 |
return MODEL_VARIANT_IMAGE if internal_task in IMAGE_TASKS else MODEL_VARIANT_VIDEO
|
| 3160 |
|
| 3161 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3162 |
def ensure_flash_attn_installed() -> None:
|
| 3163 |
try:
|
| 3164 |
from importlib.metadata import PackageNotFoundError, version as package_version
|
|
@@ -3188,60 +3203,31 @@ def ensure_flash_attn_installed() -> None:
|
|
| 3188 |
print(f"[startup] flash-attn {DEFAULT_FLASH_ATTN_VERSION} installed successfully.", flush=True)
|
| 3189 |
|
| 3190 |
|
| 3191 |
-
def get_env_int(name: str, default: int) -> int:
|
| 3192 |
-
"""Read an integer environment variable, falling back safely on invalid values."""
|
| 3193 |
-
try:
|
| 3194 |
-
return int(os.getenv(name, str(default)))
|
| 3195 |
-
except (TypeError, ValueError):
|
| 3196 |
-
return default
|
| 3197 |
-
|
| 3198 |
-
|
| 3199 |
-
def get_env_float(name: str, default: float) -> float:
|
| 3200 |
-
"""Read a float environment variable, falling back safely on invalid values."""
|
| 3201 |
-
try:
|
| 3202 |
-
return float(os.getenv(name, str(default)))
|
| 3203 |
-
except (TypeError, ValueError):
|
| 3204 |
-
return default
|
| 3205 |
-
|
| 3206 |
-
|
| 3207 |
def get_zerogpu_duration_cap() -> int:
|
| 3208 |
-
"""
|
| 3209 |
|
| 3210 |
The duration value is a ZeroGPU reservation/timeout hint. Shorter values can
|
| 3211 |
improve queue priority and reduce wasted quota, but the value must still cover
|
| 3212 |
model warm-up plus inference. Override per deployment when needed:
|
| 3213 |
LANCE_ZEROGPU_MAX_DURATION_SECONDS=300
|
| 3214 |
"""
|
| 3215 |
-
return max(1, get_env_int("LANCE_ZEROGPU_MAX_DURATION_SECONDS",
|
| 3216 |
|
| 3217 |
|
| 3218 |
def clamp_zerogpu_duration(seconds: int) -> int:
|
| 3219 |
return max(1, min(int(seconds), get_zerogpu_duration_cap()))
|
| 3220 |
|
| 3221 |
|
| 3222 |
-
|
| 3223 |
-
"""Return True when the required model variant is already resident on GPU.
|
| 3224 |
|
| 3225 |
-
|
| 3226 |
-
|
| 3227 |
-
|
| 3228 |
-
switching. This does not change the UI layout or user-facing controls.
|
| 3229 |
-
"""
|
| 3230 |
-
try:
|
| 3231 |
-
pool = ACTIVE_PIPELINE_POOL
|
| 3232 |
-
if pool is None or pool.model_variant != get_task_model_variant(task):
|
| 3233 |
-
return False
|
| 3234 |
-
return all(getattr(pipeline, "initialized", False) for pipeline in pool.pipelines)
|
| 3235 |
-
except Exception:
|
| 3236 |
-
return False
|
| 3237 |
|
| 3238 |
|
| 3239 |
def finalize_zerogpu_duration(estimated_seconds: float, task: str) -> int:
|
| 3240 |
-
"""
|
| 3241 |
-
|
| 3242 |
-
if not is_pipeline_pool_ready_for_task(task):
|
| 3243 |
-
estimated_seconds += max(0, get_env_int("LANCE_ZEROGPU_COLD_START_BUFFER_SECONDS", 120))
|
| 3244 |
-
return clamp_zerogpu_duration(int(estimated_seconds * margin + 0.999))
|
| 3245 |
|
| 3246 |
|
| 3247 |
def get_run_task_gpu_duration(
|
|
@@ -3260,39 +3246,8 @@ def get_run_task_gpu_duration(
|
|
| 3260 |
cfg_text_scale: float,
|
| 3261 |
enable_frame_interpolation: bool,
|
| 3262 |
) -> int:
|
| 3263 |
-
"""Return a
|
| 3264 |
-
|
| 3265 |
-
The previous implementation used one conservative estimate for both cold and
|
| 3266 |
-
warm runs. This version keeps the first request safe, then asks for shorter
|
| 3267 |
-
durations once the matching Lance model is already loaded, which reduces
|
| 3268 |
-
wasted ZeroGPU quota and improves queue priority without changing the UI.
|
| 3269 |
-
"""
|
| 3270 |
-
internal_task = normalize_task(task)
|
| 3271 |
-
timesteps = max(1, int(validation_num_timesteps or DEFAULT_TIMESTEPS))
|
| 3272 |
-
backend_resolution = normalize_resolution_for_backend(str(resolution), internal_task)
|
| 3273 |
-
resolution_multiplier = 1.28 if backend_resolution == "video_480p" else 1.0
|
| 3274 |
-
timestep_extra = max(0, timesteps - 20)
|
| 3275 |
-
|
| 3276 |
-
if internal_task == TASK_T2V:
|
| 3277 |
-
requested_seconds = max(1, int(num_frames or DEFAULT_VIDEO_DURATION_SECONDS))
|
| 3278 |
-
estimate = 35 + requested_seconds * 10 + timestep_extra * 1.5
|
| 3279 |
-
if normalize_frame_interpolation(enable_frame_interpolation):
|
| 3280 |
-
estimate += min(32, 8 + requested_seconds * 3)
|
| 3281 |
-
return finalize_zerogpu_duration(estimate * resolution_multiplier, internal_task)
|
| 3282 |
-
|
| 3283 |
-
if internal_task == TASK_VIDEO_EDIT:
|
| 3284 |
-
estimate = 85 + timestep_extra * 1.5
|
| 3285 |
-
if normalize_frame_interpolation(enable_frame_interpolation):
|
| 3286 |
-
estimate += 22
|
| 3287 |
-
return finalize_zerogpu_duration(estimate * resolution_multiplier, internal_task)
|
| 3288 |
-
|
| 3289 |
-
if internal_task == TASK_X2T_VIDEO:
|
| 3290 |
-
return finalize_zerogpu_duration(32, internal_task)
|
| 3291 |
-
if internal_task == TASK_T2I:
|
| 3292 |
-
return finalize_zerogpu_duration(58, internal_task)
|
| 3293 |
-
if internal_task == TASK_IMAGE_EDIT:
|
| 3294 |
-
return finalize_zerogpu_duration(70, internal_task)
|
| 3295 |
-
return finalize_zerogpu_duration(28, internal_task)
|
| 3296 |
|
| 3297 |
|
| 3298 |
def get_pipeline_pool(task: str) -> PipelinePool:
|
|
@@ -3322,7 +3277,7 @@ def get_pipeline_pool(task: str) -> PipelinePool:
|
|
| 3322 |
return ACTIVE_PIPELINE_POOL
|
| 3323 |
|
| 3324 |
|
| 3325 |
-
@spaces.GPU(duration=
|
| 3326 |
def run_task(
|
| 3327 |
task: str,
|
| 3328 |
prompt: str,
|
|
@@ -3915,6 +3870,12 @@ def prefetch_model_assets_before_launch() -> None:
|
|
| 3915 |
the visible UI unchanged. Set LANCE_PREFETCH_MODEL_ASSETS=0 to skip this at
|
| 3916 |
Space startup, or LANCE_PREFETCH_MODEL_VARIANTS=video to prefetch less.
|
| 3917 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3918 |
if not env_flag("LANCE_PREFETCH_MODEL_ASSETS", running_on_space()):
|
| 3919 |
print("[startup] Model asset prefetch disabled.", flush=True)
|
| 3920 |
return
|
|
@@ -3952,7 +3913,7 @@ if __name__ == "__main__":
|
|
| 3952 |
QUEUE_MAX_SIZE = args.queue_size
|
| 3953 |
prefetch_model_assets_before_launch()
|
| 3954 |
print(
|
| 3955 |
-
"[startup] Skipping GPU model preload. UI will launch first, and Lance weights will be
|
| 3956 |
flush=True,
|
| 3957 |
)
|
| 3958 |
concurrency_limit = 1
|
|
|
|
| 2355 |
"\n".join(
|
| 2356 |
[
|
| 2357 |
f"RIFE failed with exit code {exc.returncode}.",
|
| 2358 |
+
f"command=CUDA_VISIBLE_DEVICES={device_id} " + " ".join(command),
|
| 2359 |
exc.stdout.strip() if exc.stdout else "",
|
| 2360 |
exc.stderr.strip() if exc.stderr else "",
|
| 2361 |
]
|
|
|
|
| 2367 |
log = "\n".join(
|
| 2368 |
[
|
| 2369 |
"[rife] Frame interpolation finished.",
|
| 2370 |
+
f"command=CUDA_VISIBLE_DEVICES={device_id} " + " ".join(command),
|
| 2371 |
f"elapsed={elapsed:.2f}s",
|
| 2372 |
f"output={output_path}",
|
| 2373 |
completed.stdout.strip(),
|
|
|
|
| 2497 |
return
|
| 2498 |
|
| 2499 |
ensure_dirs()
|
|
|
|
| 2500 |
resolved_model_path = ensure_model_assets(self.model_variant)
|
| 2501 |
print(
|
| 2502 |
f"[startup][gpu:{self.device}][{self.model_variant}] Using Lance model path: {resolved_model_path}",
|
|
|
|
| 3158 |
return MODEL_VARIANT_IMAGE if internal_task in IMAGE_TASKS else MODEL_VARIANT_VIDEO
|
| 3159 |
|
| 3160 |
|
| 3161 |
+
def get_env_int(name: str, default: int) -> int:
|
| 3162 |
+
"""Read an integer environment variable, falling back safely on invalid values."""
|
| 3163 |
+
try:
|
| 3164 |
+
return int(os.getenv(name, str(default)))
|
| 3165 |
+
except (TypeError, ValueError):
|
| 3166 |
+
return default
|
| 3167 |
+
|
| 3168 |
+
|
| 3169 |
+
def get_env_float(name: str, default: float) -> float:
|
| 3170 |
+
"""Read a float environment variable, falling back safely on invalid values."""
|
| 3171 |
+
try:
|
| 3172 |
+
return float(os.getenv(name, str(default)))
|
| 3173 |
+
except (TypeError, ValueError):
|
| 3174 |
+
return default
|
| 3175 |
+
|
| 3176 |
+
|
| 3177 |
def ensure_flash_attn_installed() -> None:
|
| 3178 |
try:
|
| 3179 |
from importlib.metadata import PackageNotFoundError, version as package_version
|
|
|
|
| 3203 |
print(f"[startup] flash-attn {DEFAULT_FLASH_ATTN_VERSION} installed successfully.", flush=True)
|
| 3204 |
|
| 3205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3206 |
def get_zerogpu_duration_cap() -> int:
|
| 3207 |
+
"""Fixed duration requested from ZeroGPU for each run.
|
| 3208 |
|
| 3209 |
The duration value is a ZeroGPU reservation/timeout hint. Shorter values can
|
| 3210 |
improve queue priority and reduce wasted quota, but the value must still cover
|
| 3211 |
model warm-up plus inference. Override per deployment when needed:
|
| 3212 |
LANCE_ZEROGPU_MAX_DURATION_SECONDS=300
|
| 3213 |
"""
|
| 3214 |
+
return max(1, get_env_int("LANCE_ZEROGPU_MAX_DURATION_SECONDS", 300))
|
| 3215 |
|
| 3216 |
|
| 3217 |
def clamp_zerogpu_duration(seconds: int) -> int:
|
| 3218 |
return max(1, min(int(seconds), get_zerogpu_duration_cap()))
|
| 3219 |
|
| 3220 |
|
| 3221 |
+
ZERO_GPU_RUN_TASK_DURATION_SECONDS = get_zerogpu_duration_cap()
|
|
|
|
| 3222 |
|
| 3223 |
+
def is_pipeline_pool_ready_for_task(task: str) -> bool:
|
| 3224 |
+
"""Retained for compatibility with earlier duration logic."""
|
| 3225 |
+
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3226 |
|
| 3227 |
|
| 3228 |
def finalize_zerogpu_duration(estimated_seconds: float, task: str) -> int:
|
| 3229 |
+
"""Retained for compatibility with earlier duration logic."""
|
| 3230 |
+
return clamp_zerogpu_duration(ZERO_GPU_RUN_TASK_DURATION_SECONDS)
|
|
|
|
|
|
|
|
|
|
| 3231 |
|
| 3232 |
|
| 3233 |
def get_run_task_gpu_duration(
|
|
|
|
| 3246 |
cfg_text_scale: float,
|
| 3247 |
enable_frame_interpolation: bool,
|
| 3248 |
) -> int:
|
| 3249 |
+
"""Return a fixed ZeroGPU reservation duration for compatibility."""
|
| 3250 |
+
return ZERO_GPU_RUN_TASK_DURATION_SECONDS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3251 |
|
| 3252 |
|
| 3253 |
def get_pipeline_pool(task: str) -> PipelinePool:
|
|
|
|
| 3277 |
return ACTIVE_PIPELINE_POOL
|
| 3278 |
|
| 3279 |
|
| 3280 |
+
@spaces.GPU(size="large", duration=ZERO_GPU_RUN_TASK_DURATION_SECONDS)
|
| 3281 |
def run_task(
|
| 3282 |
task: str,
|
| 3283 |
prompt: str,
|
|
|
|
| 3870 |
the visible UI unchanged. Set LANCE_PREFETCH_MODEL_ASSETS=0 to skip this at
|
| 3871 |
Space startup, or LANCE_PREFETCH_MODEL_VARIANTS=video to prefetch less.
|
| 3872 |
"""
|
| 3873 |
+
if running_on_space() or env_flag("LANCE_INSTALL_FLASH_ATTN_ON_STARTUP", False):
|
| 3874 |
+
try:
|
| 3875 |
+
ensure_flash_attn_installed()
|
| 3876 |
+
except Exception as exc:
|
| 3877 |
+
print(f"[startup] flash-attn startup install failed and will be retried lazily during inference: {exc}", flush=True)
|
| 3878 |
+
|
| 3879 |
if not env_flag("LANCE_PREFETCH_MODEL_ASSETS", running_on_space()):
|
| 3880 |
print("[startup] Model asset prefetch disabled.", flush=True)
|
| 3881 |
return
|
|
|
|
| 3913 |
QUEUE_MAX_SIZE = args.queue_size
|
| 3914 |
prefetch_model_assets_before_launch()
|
| 3915 |
print(
|
| 3916 |
+
"[startup] Skipping GPU model preload. UI will launch first, and Lance weights will be prefetched on CPU before ZeroGPU inference. If that prefetch fails, inference will fall back to lazy loading.",
|
| 3917 |
flush=True,
|
| 3918 |
)
|
| 3919 |
concurrency_limit = 1
|