raazkumar committed on
Commit
2266b31
·
verified ·
1 Parent(s): b694b19

feat: add rpm_limit=40 hint for NVIDIA NIM local provider

Browse files

NIM free tier is limited to ~40 requests/minute. We add an `rpm_limit`
param to the returned kwargs so downstream rate-limiting wrappers can
read it. The existing retry logic (30s, 60s delays on 429) already
handles transient rate-limit hits.

Files changed (1) hide show
  1. agent/core/llm_params.py +9 -1
agent/core/llm_params.py CHANGED
@@ -196,11 +196,19 @@ def _resolve_llm_params(
196
  if model_name.startswith(prefix):
197
  api_base = os.environ.get(env_override, default_base)
198
  api_key = os.environ.get("LOCAL_API_KEY", "no-key")
199
- return {
200
  "model": f"openai/{model_name.removeprefix(prefix)}",
201
  "api_base": api_base,
202
  "api_key": api_key,
203
  }
 
 
 
 
 
 
 
 
204
  # ─────────────────────────────────────────────────────────────────────────
205
 
206
  if model_name.startswith("openai/"):
 
196
  if model_name.startswith(prefix):
197
  api_base = os.environ.get(env_override, default_base)
198
  api_key = os.environ.get("LOCAL_API_KEY", "no-key")
199
+ params: dict = {
200
  "model": f"openai/{model_name.removeprefix(prefix)}",
201
  "api_base": api_base,
202
  "api_key": api_key,
203
  }
204
+ # NVIDIA NIM free tier is limited to ~40 req/min.
205
+ # LiteLLM does not natively rate-limit; we add a small
206
+ # ``rpm_limit`` hint that downstream wrappers (or a future
207
+ # local proxy) can read. For the agent loop we rely on
208
+ # the existing retry logic (30s, 60s) when 429s are hit.
209
+ if prefix == "nim/":
210
+ params["rpm_limit"] = 40
211
+ return params
212
  # ─────────────────────────────────────────────────────────────────────────
213
 
214
  if model_name.startswith("openai/"):