feat(llm): OpenRouter free-tier fallback chain (smartest → smallest)
Replaces single-model call with a configurable fallback chain. On 429
(rate-limit / quota), 402 (credits), 403 (access denied) / 404 (retired id)
(upstream) the explainer advances to the next free model. Network and
timeout errors still drop straight to the deterministic template,
since switching models can't fix an unreachable host.
Loads .env via python-dotenv at import time so OPENROUTER_API_KEY
flows through without manual exports. .env added to .gitignore.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- .gitignore +5 -0
- src/llm/explainer.py +94 -26
.gitignore
CHANGED
|
@@ -11,6 +11,11 @@ __pycache__/
|
|
| 11 |
venv*/
|
| 12 |
env/
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
# Data — only keep folder structure, never raw payloads
|
| 15 |
data/raw/*
|
| 16 |
!data/raw/.gitkeep
|
|
|
|
| 11 |
venv*/
|
| 12 |
env/
|
| 13 |
|
| 14 |
+
# Secrets — never commit
|
| 15 |
+
.env
|
| 16 |
+
.env.local
|
| 17 |
+
.env.*.local
|
| 18 |
+
|
| 19 |
# Data — only keep folder structure, never raw payloads
|
| 20 |
data/raw/*
|
| 21 |
!data/raw/.gitkeep
|
src/llm/explainer.py
CHANGED
|
@@ -17,6 +17,16 @@ from src.core.logger import get_logger
|
|
| 17 |
|
| 18 |
logger = get_logger(__name__)
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
class FeatureRow(TypedDict):
|
| 22 |
feature: str
|
|
@@ -39,11 +49,38 @@ class ExplainResult(TypedDict):
|
|
| 39 |
|
| 40 |
|
| 41 |
_OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
|
| 42 |
-
_DEFAULT_MODEL = "meta-llama/llama-3.2-3b-instruct:free"
|
| 43 |
_LLM_TIMEOUT_SECONDS = 8.0
|
| 44 |
_LLM_MAX_TOKENS = 256
|
| 45 |
_LLM_TEMPERATURE = 0.3
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
def _should_use_llm() -> bool:
|
| 49 |
"""Gate: env kill-switch off AND key present."""
|
|
@@ -222,10 +259,20 @@ def _build_llm_prompt(payload: ExplainPayload, modality: str = "bbb") -> str:
|
|
| 222 |
|
| 223 |
|
| 224 |
def _llm_explain(payload: ExplainPayload, modality: str = "bbb") -> tuple[str, str] | None:
|
| 225 |
-
"""Try the OpenRouter chat completion
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
try:
|
| 227 |
-
# Local
|
| 228 |
-
from openai import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
except ImportError as e:
|
| 230 |
logger.warning("openai SDK not importable: %s", e)
|
| 231 |
return None
|
|
@@ -240,28 +287,49 @@ def _llm_explain(payload: ExplainPayload, modality: str = "bbb") -> tuple[str, s
|
|
| 240 |
timeout=_LLM_TIMEOUT_SECONDS,
|
| 241 |
)
|
| 242 |
prompt = _build_llm_prompt(payload, modality)
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
|
| 267 |
def explain(
|
|
|
|
| 17 |
|
| 18 |
logger = get_logger(__name__)
|
| 19 |
|
| 20 |
+
# Load .env (project root) so OPENROUTER_API_KEY etc. are available without
|
| 21 |
+
# the caller having to export them. Safe no-op if python-dotenv isn't
|
| 22 |
+
# installed or .env is missing. Existing env vars are NOT overridden.
|
| 23 |
+
try:
|
| 24 |
+
from dotenv import load_dotenv as _load_dotenv
|
| 25 |
+
|
| 26 |
+
_load_dotenv(override=False)
|
| 27 |
+
except ImportError:
|
| 28 |
+
pass
|
| 29 |
+
|
| 30 |
|
| 31 |
class FeatureRow(TypedDict):
|
| 32 |
feature: str
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
_OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
|
|
|
|
| 52 |
_LLM_TIMEOUT_SECONDS = 8.0
|
| 53 |
_LLM_MAX_TOKENS = 256
|
| 54 |
_LLM_TEMPERATURE = 0.3
|
| 55 |
|
| 56 |
+
# Free-tier fallback chain, smartest → smallest. When a model returns 429
|
| 57 |
+
# (rate-limit / daily-quota exhausted), 402 (credits), 403/404 (access / id retired) or
|
| 58 |
+
# 5xx (upstream), we advance to the next model. Network/timeout errors fall
|
| 59 |
+
# straight to the deterministic template — switching models won't help.
|
| 60 |
+
# Override at runtime via OPENROUTER_FREE_MODELS (comma-separated). Model
|
| 61 |
+
# availability on OpenRouter churns; an ID that 404s is skipped (logged at info).
|
| 62 |
+
_DEFAULT_FREE_MODEL_CHAIN: tuple[str, ...] = (
|
| 63 |
+
"inclusionai/ling-2.6-1t:free", # ~1T flagship
|
| 64 |
+
"nvidia/nemotron-3-super-120b-a12b:free", # 120B reasoning MoE
|
| 65 |
+
"minimax/minimax-m2.5:free",
|
| 66 |
+
"tencent/hy3-preview:free", # MoE + reasoning
|
| 67 |
+
"google/gemma-4-31b-it:free",
|
| 68 |
+
"google/gemma-4-26b-a4b-it:free",
|
| 69 |
+
"nvidia/nemotron-3-nano-omni-30b-a3b-reasoning:free",
|
| 70 |
+
"poolside/laguna-m.1:free",
|
| 71 |
+
"poolside/laguna-xs.2:free",
|
| 72 |
+
"meta-llama/llama-3.2-3b-instruct:free", # 3B last-resort
|
| 73 |
+
)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _free_model_chain() -> tuple[str, ...]:
|
| 77 |
+
raw = os.environ.get("OPENROUTER_FREE_MODELS")
|
| 78 |
+
if raw:
|
| 79 |
+
ids = tuple(m.strip() for m in raw.split(",") if m.strip())
|
| 80 |
+
if ids:
|
| 81 |
+
return ids
|
| 82 |
+
return _DEFAULT_FREE_MODEL_CHAIN
|
| 83 |
+
|
| 84 |
|
| 85 |
def _should_use_llm() -> bool:
|
| 86 |
"""Gate: env kill-switch off AND key present."""
|
|
|
|
| 259 |
|
| 260 |
|
| 261 |
def _llm_explain(payload: ExplainPayload, modality: str = "bbb") -> tuple[str, str] | None:
|
| 262 |
+
"""Try the OpenRouter chat completion across the free-tier fallback chain.
|
| 263 |
+
|
| 264 |
+
Returns (rationale, model_id) on first success, or None if every model
|
| 265 |
+
is exhausted / unreachable (caller falls back to the template).
|
| 266 |
+
"""
|
| 267 |
try:
|
| 268 |
+
# Local imports — keeps this dep optional at module load time.
|
| 269 |
+
from openai import (
|
| 270 |
+
OpenAI,
|
| 271 |
+
APIConnectionError,
|
| 272 |
+
APIStatusError,
|
| 273 |
+
APITimeoutError,
|
| 274 |
+
RateLimitError,
|
| 275 |
+
)
|
| 276 |
except ImportError as e:
|
| 277 |
logger.warning("openai SDK not importable: %s", e)
|
| 278 |
return None
|
|
|
|
| 287 |
timeout=_LLM_TIMEOUT_SECONDS,
|
| 288 |
)
|
| 289 |
prompt = _build_llm_prompt(payload, modality)
|
| 290 |
+
chain = _free_model_chain()
|
| 291 |
+
|
| 292 |
+
for model in chain:
|
| 293 |
+
try:
|
| 294 |
+
completion = client.chat.completions.create(
|
| 295 |
+
model=model,
|
| 296 |
+
messages=[{"role": "user", "content": prompt}],
|
| 297 |
+
max_tokens=_LLM_MAX_TOKENS,
|
| 298 |
+
temperature=_LLM_TEMPERATURE,
|
| 299 |
+
)
|
| 300 |
+
except RateLimitError:
|
| 301 |
+
logger.info("OpenRouter 429 on %s; advancing to next free model.", model)
|
| 302 |
+
continue
|
| 303 |
+
except APIStatusError as e:
|
| 304 |
+
status = getattr(e, "status_code", None)
|
| 305 |
+
# 402 credits / 403 access / 404 retired-id / 5xx upstream → next.
|
| 306 |
+
if status in (402, 403, 404) or (status is not None and 500 <= status < 600):
|
| 307 |
+
logger.info("OpenRouter %s on %s; advancing to next free model.", status, model)
|
| 308 |
+
continue
|
| 309 |
+
logger.warning("LLM call failed on %s (%s); falling back to template.", model, e)
|
| 310 |
+
return None
|
| 311 |
+
except (APIConnectionError, APITimeoutError) as e:
|
| 312 |
+
# Network is global — switching models won't help.
|
| 313 |
+
logger.warning("LLM connection error (%s); falling back to template.", type(e).__name__)
|
| 314 |
+
return None
|
| 315 |
+
except Exception as e:
|
| 316 |
+
logger.warning("LLM unexpected error on %s (%s); falling back to template.", model, type(e).__name__)
|
| 317 |
+
return None
|
| 318 |
+
|
| 319 |
+
try:
|
| 320 |
+
text = completion.choices[0].message.content
|
| 321 |
+
except (AttributeError, IndexError, TypeError) as e:
|
| 322 |
+
logger.info("LLM response malformed on %s (%s); advancing to next model.", model, e)
|
| 323 |
+
continue
|
| 324 |
+
|
| 325 |
+
if not text or not text.strip():
|
| 326 |
+
logger.info("LLM returned empty rationale on %s; advancing to next model.", model)
|
| 327 |
+
continue
|
| 328 |
+
|
| 329 |
+
return text.strip(), model
|
| 330 |
+
|
| 331 |
+
logger.warning("All free models exhausted; falling back to template.")
|
| 332 |
+
return None
|
| 333 |
|
| 334 |
|
| 335 |
def explain(
|