feat: add rpm_limit=40 hint for NVIDIA NIM local provider
NIM free tier is limited to ~40 requests/minute. We add an `rpm_limit`
param to the returned kwargs so downstream rate-limiting wrappers can
read it. The existing retry logic (30s, 60s delays on 429) already
handles transient rate-limit hits.
- agent/core/llm_params.py +9 -1
agent/core/llm_params.py
CHANGED
|
@@ -196,11 +196,19 @@ def _resolve_llm_params(
|
|
| 196 |
if model_name.startswith(prefix):
|
| 197 |
api_base = os.environ.get(env_override, default_base)
|
| 198 |
api_key = os.environ.get("LOCAL_API_KEY", "no-key")
|
| 199 |
-
|
| 200 |
"model": f"openai/{model_name.removeprefix(prefix)}",
|
| 201 |
"api_base": api_base,
|
| 202 |
"api_key": api_key,
|
| 203 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
# ─────────────────────────────────────────────────────────────────────────
|
| 205 |
|
| 206 |
if model_name.startswith("openai/"):
|
|
|
|
| 196 |
if model_name.startswith(prefix):
|
| 197 |
api_base = os.environ.get(env_override, default_base)
|
| 198 |
api_key = os.environ.get("LOCAL_API_KEY", "no-key")
|
| 199 |
+
params: dict = {
|
| 200 |
"model": f"openai/{model_name.removeprefix(prefix)}",
|
| 201 |
"api_base": api_base,
|
| 202 |
"api_key": api_key,
|
| 203 |
}
|
| 204 |
+
# NVIDIA NIM free tier is limited to ~40 req/min.
|
| 205 |
+
# LiteLLM does not natively rate-limit; we add a small
|
| 206 |
+
# ``rpm_limit`` hint that downstream wrappers (or a future
|
| 207 |
+
# local proxy) can read. For the agent loop we rely on
|
| 208 |
+
# the existing retry logic (30s, 60s) when 429s are hit.
|
| 209 |
+
if prefix == "nim/":
|
| 210 |
+
params["rpm_limit"] = 40
|
| 211 |
+
return params
|
| 212 |
# ─────────────────────────────────────────────────────────────────────────
|
| 213 |
|
| 214 |
if model_name.startswith("openai/"):
|