fix: cap max_tokens per provider to avoid 400 errors on fallback
SambaNova (Llama 3.1 405B) has a 4096 output-token ceiling. Passing
max_tokens=8192 (needed for verbose tour JSON) would throw a 400 that
_is_exhausted doesn't catch. The _MAX_OUTPUT dict silently caps the
request to each provider's actual limit so fallback providers degrade
gracefully rather than erroring out.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
backend/services/generation.py
CHANGED
@@ -570,12 +570,27 @@ class GenerationService:
     # ── Groq / Gemini / OpenRouter implementation ─────────────────────────────
     # All three use the OpenAI SDK interface, so one implementation covers all.
 
+    # Hard output-token ceilings per provider. Each provider enforces its own
+    # limit server-side – exceeding it returns a 400 that _is_exhausted doesn't
+    # catch. We cap here so fallback providers silently use their actual limit
+    # rather than erroring out. Callers can request more; we deliver as much as
+    # the current provider allows.
+    _MAX_OUTPUT: dict[str, int] = {
+        "gemini": 65536,
+        "cerebras": 16384,
+        "sambanova": 4096,    # Llama 3.1 405B free tier
+        "openrouter": 8192,   # conservative; varies by routed model
+        "mistral": 32768,
+        "groq": 32768,
+    }
+
     def _groq_complete(self, system: str, messages: list[dict], params: dict) -> str:
+        max_out = min(params["max_tokens"], self._MAX_OUTPUT.get(self.provider, params["max_tokens"]))
         kwargs: dict = dict(
             model=self._model,
             messages=[{"role": "system", "content": system}] + messages,
             temperature=params["temperature"],
-            max_tokens=params["max_tokens"],
+            max_tokens=max_out,
         )
         # Structured JSON output: instructs the model to emit ONLY a valid JSON
         # object. No markdown fences, no explanatory text – just the JSON.
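For illustration, a minimal standalone sketch of the capping behaviour. The capped_max_tokens helper is hypothetical (the real code inlines the min() call inside _groq_complete), and the dict below reuses a subset of the ceilings from the diff:

    # Hypothetical standalone version of the cap applied in _groq_complete.
    _MAX_OUTPUT = {"gemini": 65536, "sambanova": 4096, "groq": 32768}

    def capped_max_tokens(provider: str, requested: int) -> int:
        # Providers missing from the dict fall through to the caller's request.
        return min(requested, _MAX_OUTPUT.get(provider, requested))

    # A verbose tour request of 8192 tokens degrades gracefully on fallback:
    assert capped_max_tokens("groq", 8192) == 8192       # under the ceiling
    assert capped_max_tokens("sambanova", 8192) == 4096  # silently capped, no 400
    assert capped_max_tokens("unknown", 8192) == 8192    # unmapped provider passes through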