mekosotto Claude Opus 4.7 (1M context) committed on
Commit 87845ef · 1 Parent(s): c519417

feat(llm): OpenRouter free-tier fallback chain (smartest → smallest)

Replaces single-model call with a configurable fallback chain. On 429
(rate-limit / quota), 402 (credits), 403/404 (retired id), or 5xx
(upstream), the explainer advances to the next free model; the policy
is sketched as a decision table after the diff. Network and timeout
errors still drop straight to the deterministic template, since
switching models can't fix an unreachable host.

Loads .env via python-dotenv at import time so OPENROUTER_API_KEY
flows through without manual exports (example .env below). .env added
to .gitignore.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
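
For reference, a minimal .env this commit expects at the project root. The
key value is elided; the OPENROUTER_FREE_MODELS line is optional and only
needed to override the default chain read by _free_model_chain():

# .env (kept out of git by this commit)
OPENROUTER_API_KEY=sk-or-...
# Optional: comma-separated override for the free-model fallback chain
OPENROUTER_FREE_MODELS=meta-llama/llama-3.2-3b-instruct:free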

Files changed (2)
  1. .gitignore +5 -0
  2. src/llm/explainer.py +94 -26
.gitignore CHANGED
@@ -11,6 +11,11 @@ __pycache__/
 venv*/
 env/
 
+# Secrets — never commit
+.env
+.env.local
+.env.*.local
+
 # Data — only keep folder structure, never raw payloads
 data/raw/*
 !data/raw/.gitkeep
src/llm/explainer.py CHANGED
@@ -17,6 +17,16 @@ from src.core.logger import get_logger
 
 logger = get_logger(__name__)
 
+# Load .env (project root) so OPENROUTER_API_KEY etc. are available without
+# the caller having to export them. Safe no-op if python-dotenv isn't
+# installed or .env is missing. Existing env vars are NOT overridden.
+try:
+    from dotenv import load_dotenv as _load_dotenv
+
+    _load_dotenv(override=False)
+except ImportError:
+    pass
+
 
 class FeatureRow(TypedDict):
     feature: str
@@ -39,11 +49,38 @@ class ExplainResult(TypedDict):
 
 
 _OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
-_DEFAULT_MODEL = "meta-llama/llama-3.2-3b-instruct:free"
 _LLM_TIMEOUT_SECONDS = 8.0
 _LLM_MAX_TOKENS = 256
 _LLM_TEMPERATURE = 0.3
 
+# Free-tier fallback chain, smartest → smallest. When a model returns 429
+# (rate-limit / daily-quota exhausted), 402 (credits), 403/404 (retired or
+# gated id) or 5xx (upstream), we advance to the next model. Network and
+# timeout errors fall straight to the deterministic template — switching
+# models won't help. Override via OPENROUTER_FREE_MODELS (comma-separated).
+# Model availability on OpenRouter churns; an ID that 404s is skipped.
+_DEFAULT_FREE_MODEL_CHAIN: tuple[str, ...] = (
+    "inclusionai/ling-2.6-1t:free",  # ~1T flagship
+    "nvidia/nemotron-3-super-120b-a12b:free",  # 120B reasoning MoE
+    "minimax/minimax-m2.5:free",
+    "tencent/hy3-preview:free",  # MoE + reasoning
+    "google/gemma-4-31b-it:free",
+    "google/gemma-4-26b-a4b-it:free",
+    "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free",
+    "poolside/laguna-m.1:free",
+    "poolside/laguna-xs.2:free",
+    "meta-llama/llama-3.2-3b-instruct:free",  # 3B last-resort
+)
+
+
+def _free_model_chain() -> tuple[str, ...]:
+    raw = os.environ.get("OPENROUTER_FREE_MODELS")
+    if raw:
+        ids = tuple(m.strip() for m in raw.split(",") if m.strip())
+        if ids:
+            return ids
+    return _DEFAULT_FREE_MODEL_CHAIN
+
 
 def _should_use_llm() -> bool:
     """Gate: env kill-switch off AND key present."""
@@ -222,10 +259,20 @@ def _build_llm_prompt(payload: ExplainPayload, modality: str = "bbb") -> str:
 
 
 def _llm_explain(payload: ExplainPayload, modality: str = "bbb") -> tuple[str, str] | None:
-    """Try the OpenRouter chat completion. Return (rationale, model) or None."""
+    """Try the OpenRouter chat completion across the free-tier fallback chain.
+
+    Returns (rationale, model_id) on first success, or None if every model
+    is exhausted / unreachable (caller falls back to the template).
+    """
     try:
-        # Local import — keeps this dep optional at module load time.
-        from openai import OpenAI
+        # Local imports — keeps this dep optional at module load time.
+        from openai import (
+            OpenAI,
+            APIConnectionError,
+            APIStatusError,
+            APITimeoutError,
+            RateLimitError,
+        )
     except ImportError as e:
         logger.warning("openai SDK not importable: %s", e)
         return None
@@ -240,28 +287,49 @@ def _llm_explain(payload: ExplainPayload, modality: str = "bbb") -> tuple[str, str] | None:
         timeout=_LLM_TIMEOUT_SECONDS,
     )
     prompt = _build_llm_prompt(payload, modality)
-    try:
-        completion = client.chat.completions.create(
-            model=_DEFAULT_MODEL,
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=_LLM_MAX_TOKENS,
-            temperature=_LLM_TEMPERATURE,
-        )
-    except Exception as e:  # broad: APITimeoutError, APIConnectionError, RateLimitError, ...
-        logger.warning("LLM call failed (%s); falling back to template.", type(e).__name__)
-        return None
-
-    try:
-        text = completion.choices[0].message.content
-    except (AttributeError, IndexError, TypeError) as e:
-        logger.warning("LLM response malformed (%s); falling back to template.", e)
-        return None
-
-    if not text or not text.strip():
-        logger.warning("LLM returned empty rationale; falling back to template.")
-        return None
-
-    return text.strip(), _DEFAULT_MODEL
+    chain = _free_model_chain()
+
+    for model in chain:
+        try:
+            completion = client.chat.completions.create(
+                model=model,
+                messages=[{"role": "user", "content": prompt}],
+                max_tokens=_LLM_MAX_TOKENS,
+                temperature=_LLM_TEMPERATURE,
+            )
+        except RateLimitError:
+            logger.info("OpenRouter 429 on %s; advancing to next free model.", model)
+            continue
+        except APIStatusError as e:
+            status = getattr(e, "status_code", None)
+            # 402 credits / 403 access / 404 retired-id / 5xx upstream → next.
+            if status in (402, 403, 404) or (status is not None and 500 <= status < 600):
+                logger.info("OpenRouter %s on %s; advancing to next free model.", status, model)
+                continue
+            logger.warning("LLM call failed on %s (%s); falling back to template.", model, e)
+            return None
+        except (APIConnectionError, APITimeoutError) as e:
+            # Network is global — switching models won't help.
+            logger.warning("LLM connection error (%s); falling back to template.", type(e).__name__)
+            return None
+        except Exception as e:
+            logger.warning("LLM unexpected error on %s (%s); falling back to template.", model, type(e).__name__)
+            return None
+
+        try:
+            text = completion.choices[0].message.content
+        except (AttributeError, IndexError, TypeError) as e:
+            logger.info("LLM response malformed on %s (%s); advancing to next model.", model, e)
+            continue
+
+        if not text or not text.strip():
+            logger.info("LLM returned empty rationale on %s; advancing to next model.", model)
+            continue
+
+        return text.strip(), model
+
+    logger.warning("All free models exhausted; falling back to template.")
+    return None
 
 
 def explain(
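
For review convenience, the retry policy from the commit message distilled
into a standalone sketch. This is not code from the diff; advances_chain is
an illustrative name for the per-status decision _llm_explain makes:

# Sketch only: the real logic lives in _llm_explain above.
# True means "advance to the next free model";
# False means "fall back to the deterministic template".
def advances_chain(status: int | None) -> bool:
    if status is None:  # network / timeout: no status, new model can't help
        return False
    return status in (402, 403, 404, 429) or 500 <= status < 600

assert advances_chain(429)       # rate-limit / daily quota
assert advances_chain(404)       # model id retired
assert advances_chain(503)       # upstream outage
assert not advances_chain(401)   # bad key: no model in the chain fixes auth
assert not advances_chain(None)  # unreachable host

In the diff itself, 429 arrives as RateLimitError and 402/403/404/5xx via
APIStatusError, but the decision table is the same.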