feat: add rpm_limit=40 hint for NVIDIA NIM local provider
NIM free tier is limited to ~40 requests/minute. We add an `rpm_limit`
param to the returned kwargs so downstream rate-limiting wrappers can
read it. The existing retry logic (30s, 60s delays on 429) already
handles transient rate-limit hits.
- agent/core/llm_params.py +9 -1
agent/core/llm_params.py
CHANGED
|
@@ -196,11 +196,19 @@ def _resolve_llm_params(
|
|
| 196 |
if model_name.startswith(prefix):
|
| 197 |
api_base = os.environ.get(env_override, default_base)
|
| 198 |
api_key = os.environ.get("LOCAL_API_KEY", "no-key")
|
| 199 |
-
|
| 200 |
"model": f"openai/{model_name.removeprefix(prefix)}",
|
| 201 |
"api_base": api_base,
|
| 202 |
"api_key": api_key,
|
| 203 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
# ─────────────────────────────────────────────────────────────────────────
|
| 205 |
|
| 206 |
if model_name.startswith("openai/"):
|
|
|
|
| 196 |
if model_name.startswith(prefix):
|
| 197 |
api_base = os.environ.get(env_override, default_base)
|
| 198 |
api_key = os.environ.get("LOCAL_API_KEY", "no-key")
|
| 199 |
+
params: dict = {
|
| 200 |
"model": f"openai/{model_name.removeprefix(prefix)}",
|
| 201 |
"api_base": api_base,
|
| 202 |
"api_key": api_key,
|
| 203 |
}
|
| 204 |
+
# NVIDIA NIM free tier is limited to ~40 req/min.
|
| 205 |
+
# LiteLLM does not natively rate-limit; we add a small
|
| 206 |
+
# ``rpm_limit`` hint that downstream wrappers (or a future
|
| 207 |
+
# local proxy) can read. For the agent loop we rely on
|
| 208 |
+
# the existing retry logic (30s, 60s) when 429s are hit.
|
| 209 |
+
if prefix == "nim/":
|
| 210 |
+
params["rpm_limit"] = 40
|
| 211 |
+
return params
|
| 212 |
# ─────────────────────────────────────────────────────────────────────────
|
| 213 |
|
| 214 |
if model_name.startswith("openai/"):
|