SpringWang08 committed on
Commit
0ee23ae
·
1 Parent(s): ad0cfad

Release models after each prediction for lower memory

Browse files
Files changed (1) hide show
  1. web/main.py +21 -0
web/main.py CHANGED
@@ -129,6 +129,8 @@ class VQAServerState:
129
  self.question_suggestions: list[dict[str, Any]] = []
130
  # Giữ mặc định là không preload để tránh ngốn RAM/VRAM khi Space khởi động.
131
  self.preload_models = os.getenv("WEB_PRELOAD_MODELS", "0") == "1"
 
 
132
 
133
  @property
134
  def phobert_model(self) -> str:
@@ -147,6 +149,20 @@ def _artifact_exists(path: Path) -> bool:
147
  return path.exists()
148
 
149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  def _download_hub_snapshot(repo_id: str, cache_subdir: str, allow_patterns: Optional[list[str]] = None) -> Path:
151
  target_dir = state.artifact_cache_dir / cache_subdir
152
  target_dir.mkdir(parents=True, exist_ok=True)
@@ -906,6 +922,9 @@ async def predict_variant(variant: str, question: str, image: Image.Image) -> di
906
  "checkpoint": "",
907
  "latency_ms": round((time.perf_counter() - start) * 1000, 2),
908
  }
 
 
 
909
 
910
 
911
  def _parse_model_selection(raw_model_name: Optional[str], raw_model_names: Optional[str]) -> list[str]:
@@ -993,6 +1012,8 @@ async def predict(
993
  results = []
994
  async with load_lock:
995
  for variant in selected_models:
 
 
996
  results.append(await predict_variant(variant, question, pil_img))
997
 
998
  predictions = {item["variant"]: item["prediction"] for item in results if item.get("status") == "ok"}
 
129
  self.question_suggestions: list[dict[str, Any]] = []
130
  # Giữ mặc định là không preload để tránh ngốn RAM/VRAM khi Space khởi động.
131
  self.preload_models = os.getenv("WEB_PRELOAD_MODELS", "0") == "1"
132
+ # Chạy lần lượt và giải phóng model sau mỗi lượt để giảm đỉnh RAM/VRAM.
133
+ self.release_after_predict = os.getenv("WEB_RELEASE_AFTER_PREDICT", "1") == "1"
134
 
135
  @property
136
  def phobert_model(self) -> str:
 
149
  return path.exists()
150
 
151
 
152
+ def _release_variant_cache(variant: str) -> None:
153
+ if variant in {"A1", "A2"}:
154
+ bundle = state.a_models.pop(variant, None)
155
+ if bundle is not None:
156
+ bundle["model"] = None
157
+ else:
158
+ if state.llava_bundle is not None:
159
+ state.llava_bundle["model"] = None
160
+ state.llava_bundle = None
161
+ gc.collect()
162
+ if torch.cuda.is_available():
163
+ torch.cuda.empty_cache()
164
+
165
+
166
  def _download_hub_snapshot(repo_id: str, cache_subdir: str, allow_patterns: Optional[list[str]] = None) -> Path:
167
  target_dir = state.artifact_cache_dir / cache_subdir
168
  target_dir.mkdir(parents=True, exist_ok=True)
 
922
  "checkpoint": "",
923
  "latency_ms": round((time.perf_counter() - start) * 1000, 2),
924
  }
925
+ finally:
926
+ if state.release_after_predict:
927
+ _release_variant_cache(variant)
928
 
929
 
930
  def _parse_model_selection(raw_model_name: Optional[str], raw_model_names: Optional[str]) -> list[str]:
 
1012
  results = []
1013
  async with load_lock:
1014
  for variant in selected_models:
1015
+ if state.release_after_predict:
1016
+ _release_variant_cache(variant)
1017
  results.append(await predict_variant(variant, question, pil_img))
1018
 
1019
  predictions = {item["variant"]: item["prediction"] for item in results if item.get("status") == "ok"}