feat: add global call limit & remove sim start limit
Files changed:
- app.py +16 -11
- rate_limiter.py +50 -25
app.py
CHANGED
@@ -609,14 +609,6 @@ def start_simulation(
         return _setup_error("Invalid model selection.")
 
     using_own_key = bool(user_api_key.strip())
-
-    # Only apply rate limiting when using the shared demo key
-    if not using_own_key:
-        client_key = get_client_key(request)
-        allowed, limit_msg = _rate_limiter.check_simulation_start(client_key)
-        if not allowed:
-            return _setup_error(limit_msg)
-
     is_openai = "gpt" in model.lower()
 
     if using_own_key:
@@ -654,7 +646,7 @@ def start_simulation(
             **patient,
         )
     except Exception as e:
-        _logger.error("Failed to initialize patient agent: %s", e)
+        _logger.error("Failed to initialize patient agent: %s", _sanitize_error(str(e)))
         return _setup_error(f"Failed to initialize patient agent: {_sanitize_error(str(e))}")
 
     recap = build_recap_html(hadm_id, model, cefr, personality, recall, confusion)
@@ -744,11 +736,16 @@ def chat(message: str, history: list, agent, sim_config: dict, request: gr.Request
         raise gr.Error("Invalid input detected. Please enter a valid clinical question.")
 
     using_own_key = bool(sim_config and sim_config.get("user_api_key"))
+    client_key = get_client_key(request)
     if not using_own_key:
-        client_key = get_client_key(request)
         allowed, limit_msg = _rate_limiter.check_chat_message(client_key)
         if not allowed:
             raise gr.Error(limit_msg)
+    else:
+        # Own-key users bypass per-IP quotas but still respect global capacity.
+        allowed, limit_msg = _rate_limiter.check_global_capacity()
+        if not allowed:
+            raise gr.Error(limit_msg)
 
     response = agent(user_prompt=message, using_multi_turn=True, verbose=False)
     history = history + [
@@ -812,6 +809,14 @@ def start_auto(agent, sim_config: dict, request: gr.Request = None):
             gr.Warning(limit_msg)
             yield _auto_fallback_outputs()
             return
+    else:
+        # Own-key users bypass per-IP quotas but still enforce the concurrent
+        # run cap and the hard global capacity limit.
+        allowed, limit_msg = _rate_limiter.check_own_key_auto_run(client_key)
+        if not allowed:
+            gr.Warning(limit_msg)
+            yield _auto_fallback_outputs()
+            return
 
     try:
         agent.reset_history(verbose=False)
@@ -847,7 +852,7 @@ def start_auto(agent, sim_config: dict, request: gr.Request = None):
         )
 
     except Exception as e:
-        _logger.error("Failed to initialize doctor agent: %s", e)
+        _logger.error("Failed to initialize doctor agent: %s", _sanitize_error(str(e)))
         gr.Error(f"Failed to initialize doctor agent: {_sanitize_error(str(e))}")
         yield _auto_fallback_outputs()
         return
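For reviewers, a minimal self-contained sketch of the branching these app.py hunks introduce: shared-key traffic stays on the per-IP chat quota, while own-key traffic only consumes global capacity. `_DemoLimiter` and `handle_chat` are illustrative stand-ins rather than code from this repo; only the two check-method names mirror the `RateLimiter` API in the rate_limiter.py diff below.

    from typing import Optional, Tuple

    class _DemoLimiter:
        """Illustrative stand-in exposing the two checks used by the chat handler."""

        def check_chat_message(self, key: Optional[str]) -> Tuple[bool, str]:
            # Real implementation enforces the per-IP chat quota.
            return True, ""

        def check_global_capacity(self) -> Tuple[bool, str]:
            # Real implementation enforces only the hard global call cap.
            return True, ""

    _rate_limiter = _DemoLimiter()

    def handle_chat(message: str, using_own_key: bool, client_key: str) -> str:
        # Shared-key users count against their per-IP quota;
        # own-key users only consume global capacity.
        if not using_own_key:
            allowed, limit_msg = _rate_limiter.check_chat_message(client_key)
        else:
            allowed, limit_msg = _rate_limiter.check_global_capacity()
        if not allowed:
            raise RuntimeError(limit_msg)
        return f"echo: {message}"  # placeholder for the actual agent call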
rate_limiter.py
CHANGED
@@ -7,7 +7,6 @@ until the process is restarted (or the SQLite DB is cleared).
 
 Limits are configurable via environment variables:
 
-    RATE_LIMIT_SIM_STARTS – max simulation setups total per IP (default: 5)
     RATE_LIMIT_CHAT_MSGS – max chat messages total per IP (default: 50)
     RATE_LIMIT_AUTO_RUNS – max auto simulation runs total per IP (default: 5)
     RATE_LIMIT_TOTAL_API_CALLS – max total LLM calls across all modes (default: 200)
@@ -38,7 +37,6 @@ import gradio as gr
 # ---------------------------------------------------------------------------
 # Configuration – overridable via environment variables
 # ---------------------------------------------------------------------------
-SIM_STARTS_LIMIT: int = int(os.environ.get("RATE_LIMIT_SIM_STARTS", "5"))
 CHAT_MSGS_LIMIT: int = int(os.environ.get("RATE_LIMIT_CHAT_MSGS", "50"))
 AUTO_RUNS_LIMIT: int = int(os.environ.get("RATE_LIMIT_AUTO_RUNS", "5"))
 TOTAL_API_CALLS_LIMIT: int = int(os.environ.get("RATE_LIMIT_TOTAL_API_CALLS", "200"))
@@ -137,7 +135,6 @@ class RateLimiter:
 
     Tracks four independent counters per key:
 
-    * **sim_starts** – calls to ``start_simulation()``
     * **chat_msgs** – individual chat messages (1 LLM call each)
     * **auto_runs** – auto simulation runs (each reserved as
       ``_AUTO_RUN_CALL_RESERVATION`` LLM calls in ``total_calls``)
@@ -151,7 +148,7 @@ class RateLimiter:
     Example
     -------
     >>> limiter = RateLimiter()
-    >>> allowed, msg = limiter.check_simulation_start("ip:1.2.3.4")
+    >>> allowed, msg = limiter.check_chat_message("ip:1.2.3.4")
    >>> if not allowed:
     ...     raise gr.Error(msg)
     """
@@ -164,7 +161,6 @@ class RateLimiter:
         # SQLite-backed persistent counters; fall back to in-memory on failure
         self._db: Optional[sqlite3.Connection] = None
         self._mem: Dict[str, Dict[str, int]] = {
-            "sim_starts": defaultdict(int),
             "chat_msgs": defaultdict(int),
             "auto_runs": defaultdict(int),
             "total_calls": defaultdict(int),
@@ -242,24 +238,6 @@ class RateLimiter:
     # Public check methods
     # ------------------------------------------------------------------
 
-    def check_simulation_start(self, key: Optional[str]) -> Tuple[bool, str]:
-        """
-        Check whether a new simulation setup is allowed.
-
-        Called once when the user clicks **Start Simulation**.
-        """
-        if not key:
-            return False, self._UNIDENTIFIED_MSG
-        with self._lock:
-            count = self._get("sim_starts", key) + 1
-            if count > SIM_STARTS_LIMIT:
-                return False, (
-                    f"Simulation setup limit reached "
-                    f"(maximum {SIM_STARTS_LIMIT} simulations per session)."
-                )
-            self._set("sim_starts", key, count)
-            return True, ""
-
     def check_chat_message(self, key: Optional[str]) -> Tuple[bool, str]:
         """
         Check whether sending a chat message is allowed (= 1 LLM API call).
@@ -331,6 +309,54 @@ class RateLimiter:
             self._active_auto_runs[key] += 1
             return True, ""
 
+    def check_global_capacity(self) -> Tuple[bool, str]:
+        """
+        Lightweight global-capacity check for users supplying their own API keys.
+
+        Per-IP quotas (sim_starts, chat_msgs, auto_runs, total_calls) are
+        intentionally skipped – own-key users are not billed against the shared
+        pool. However, the hard global cap still applies to prevent the server
+        from being overwhelmed regardless of who is calling.
+
+        Unlike the per-IP check methods, this method **does** increment
+        ``_global_calls`` so that the counter accurately reflects all LLM
+        calls, not just those made through the shared key.
+        """
+        with self._lock:
+            new_global = self._global_calls + 1
+            if new_global > GLOBAL_TOTAL_CALLS_LIMIT:
+                return False, "Service capacity reached. Please try again later."
+            self._global_calls = new_global
+            return True, ""
+
+    def check_own_key_auto_run(self, key: Optional[str]) -> Tuple[bool, str]:
+        """
+        Concurrent-run and global-capacity check for own-key auto simulations.
+
+        Per-IP auto-run quota and total-call quota are intentionally skipped.
+        The concurrent run cap (``_MAX_CONCURRENT_AUTO``) **is** enforced to
+        prevent a single client from spawning many parallel simulations and
+        exhausting server threads. The global hard cap is also applied and the
+        global counter is updated.
+
+        Must be paired with a ``release_auto_slot()`` call in a ``finally``
+        block, just like ``check_auto_run()``.
+        """
+        if not key:
+            return False, self._UNIDENTIFIED_MSG
+        with self._lock:
+            if self._active_auto_runs[key] >= _MAX_CONCURRENT_AUTO:
+                return False, "An auto simulation is already running. Please wait."
+
+            new_global = self._global_calls + _AUTO_RUN_CALL_RESERVATION
+            if new_global > GLOBAL_TOTAL_CALLS_LIMIT:
+                return False, "Service capacity reached. Please try again later."
+
+            # All checks passed – commit atomically
+            self._global_calls = new_global
+            self._active_auto_runs[key] += 1
+            return True, ""
+
     def release_auto_slot(self, key: Optional[str]) -> None:
         """
         Release one concurrent auto run slot for *key*.
@@ -355,12 +381,11 @@ class RateLimiter:
 
         Returns
         -------
-        dict with keys ``sim_starts``, ``chat_messages``, ``auto_runs``,
+        dict with keys ``chat_messages``, ``auto_runs``,
         ``total_api_calls``; each value is a dict with ``used`` and ``limit``.
         """
         with self._lock:
             return {
-                "sim_starts": {"used": self._get("sim_starts", key), "limit": SIM_STARTS_LIMIT},
                 "chat_messages": {"used": self._get("chat_msgs", key), "limit": CHAT_MSGS_LIMIT},
                 "auto_runs": {"used": self._get("auto_runs", key), "limit": AUTO_RUNS_LIMIT},
                 "total_api_calls": {"used": self._get("total_calls", key), "limit": TOTAL_API_CALLS_LIMIT},
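The `check_own_key_auto_run` docstring requires pairing each successful call with `release_auto_slot()` in a `finally` block. A minimal usage sketch under that contract (assumes rate_limiter.py is importable as the `rate_limiter` module; `run_simulation` and the key string are hypothetical placeholders):

    from rate_limiter import RateLimiter  # assumes the module layout shown above

    def run_simulation() -> None:
        """Placeholder for the actual auto-run loop."""

    limiter = RateLimiter()
    client_key = "ip:1.2.3.4"  # illustrative key, same format as the class docstring example

    allowed, msg = limiter.check_own_key_auto_run(client_key)
    if not allowed:
        raise RuntimeError(msg)  # app.py surfaces this via gr.Warning instead

    try:
        run_simulation()
    finally:
        # Always release the concurrent-run slot, even if the run fails.
        limiter.release_auto_slot(client_key)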