Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """Probe-and-cascade for reasoning effort on /model switch. | |
| We don't maintain a per-model capability table. Instead, the first time a | |
| user picks a model we fire a 1-token ping with the same params we'd use | |
| for real and walk down a cascade (``max`` → ``xhigh`` → ``high`` → …) | |
| until the provider stops rejecting us. The result is cached per-model on | |
| the session, so real messages don't pay the probe cost again. | |
| Three outcomes, classified from the 400 error text: | |
| * success → cache the effort that worked | |
| * ``"thinking ... not supported"`` → model doesn't do thinking at all; | |
| cache ``None`` so we stop sending thinking params | |
| * ``"effort ... invalid"`` / synonyms → cascade walks down and retries | |
| Transient errors (5xx, timeout, connection reset) bubble out as | |
| ``ProbeInconclusive`` so the caller can complete the switch with a | |
| warning instead of blocking on a flaky provider. | |
| """ | |
| from __future__ import annotations | |
| import asyncio | |
| import logging | |
| from dataclasses import dataclass | |
| from litellm import acompletion | |
| from agent.core.llm_params import UnsupportedEffortError, _resolve_llm_params | |
| logger = logging.getLogger(__name__) | |
# Cascade: for each user-stated preference, the ordered list of levels to
# try. First success wins. ``max`` is Anthropic-only; ``xhigh`` is also
# supported on current OpenAI GPT-5 models. Providers that don't accept a
# requested level raise ``UnsupportedEffortError`` synchronously (no wasted
# network round-trip) and we advance to the next level.
# NOTE: keys with no entry fall back to ``[preference]`` in probe_effort,
# so unknown levels get exactly one probe attempt.
_EFFORT_CASCADE: dict[str, list[str]] = {
    "max": ["max", "xhigh", "high", "medium", "low"],
    "xhigh": ["xhigh", "high", "medium", "low"],
    "high": ["high", "medium", "low"],
    "medium": ["medium", "low"],
    # "minimal" is OpenAI-flavored; Anthropic-style providers map it to "low".
    "minimal": ["minimal", "low"],
    "low": ["low"],
}

# Per-attempt wall-clock cap (seconds) enforced via asyncio.wait_for; a
# slower response is treated as transient (ProbeInconclusive), not a verdict.
_PROBE_TIMEOUT = 15.0
# Keep the probe cheap, but high enough that frontier reasoning models can
# finish a trivial reply instead of tripping a false "output limit reached"
# error during capability detection.
_PROBE_MAX_TOKENS = 64
class ProbeInconclusive(Exception):
    """Raised when the probe cannot reach a verdict.

    Covers transient network / provider-side failures (timeouts, 5xx,
    connection resets). The caller should finish the model switch with a
    warning rather than block; if the problem is persistent, the next real
    completion call will surface it again.
    """
| class ProbeOutcome: | |
| """What the probe learned. ``effective_effort`` semantics match the cache: | |
| * str → send this level | |
| * None → model doesn't support thinking; strip it | |
| """ | |
| effective_effort: str | None | |
| attempts: int | |
| elapsed_ms: int | |
| note: str | None = None # e.g. "max not supported, falling back" | |
| def _is_thinking_unsupported(e: Exception) -> bool: | |
| """Model rejected any thinking config. | |
| Matches Anthropic's 'thinking.type.enabled is not supported for this | |
| model' as well as the adaptive variant. Substring-match because the | |
| exact wording shifts across API versions. | |
| """ | |
| s = str(e).lower() | |
| return "thinking" in s and "not supported" in s | |
def _is_invalid_effort(e: Exception) -> bool:
    """The requested effort level isn't accepted for this model.

    Covers both API responses (Anthropic/OpenAI 400 with "invalid", "must
    be one of", etc.) and LiteLLM's local validation that fires *before*
    the request (e.g. "effort='max' is only supported by Claude Opus 4.6"
    — LiteLLM knows max is Opus-4.6-only and raises synchronously). The
    cascade walks down on either.

    Explicitly returns False when the message is really about thinking
    itself (e.g. Anthropic's 4.7 error mentions ``output_config.effort``
    in its fix hint, but the actual failure is ``thinking.type.enabled``
    being unsupported). That case is caught by ``_is_thinking_unsupported``.
    """
    # Thinking-level rejection takes precedence over effort-level wording.
    if _is_thinking_unsupported(e):
        return False
    text = str(e).lower()
    # Must mention the effort knob at all before we look for rejection words.
    if "effort" not in text and "output_config" not in text:
        return False
    rejection_phrases = (
        "invalid", "not supported", "must be one of", "not a valid",
        "unrecognized", "unknown",
        # LiteLLM's own pre-flight validation phrasing.
        "only supported by", "is only supported",
    )
    for phrase in rejection_phrases:
        if phrase in text:
            return True
    return False
| def _is_transient(e: Exception) -> bool: | |
| """Network / provider-side flake. Keep in sync with agent_loop's list. | |
| Also matches by type for ``asyncio.TimeoutError`` — its ``str(e)`` is | |
| empty, so substring matching alone misses it. | |
| """ | |
| if isinstance(e, (asyncio.TimeoutError, TimeoutError)): | |
| return True | |
| s = str(e).lower() | |
| return any( | |
| p in s | |
| for p in ( | |
| "timeout", "timed out", "429", "rate limit", | |
| "503", "service unavailable", "502", "bad gateway", | |
| "500", "internal server error", "overloaded", "capacity", | |
| "connection reset", "connection refused", "connection error", | |
| "eof", "broken pipe", | |
| ) | |
| ) | |
async def probe_effort(
    model_name: str,
    preference: str | None,
    hf_token: str | None,
) -> ProbeOutcome:
    """Walk the cascade for ``preference`` on ``model_name``.

    Returns the first effort the provider accepts, or a ``None`` effort if
    it rejects thinking altogether. Raises ``ProbeInconclusive`` only for
    transient errors (5xx, timeout) — persistent 4xx that aren't thinking/
    effort related bubble as the original exception so callers can surface
    them (auth, model-not-found, quota, etc.).

    Args:
        model_name: Provider-qualified model id to probe.
        preference: User-requested effort level; falsy means effort is off.
        hf_token: Credential forwarded to ``_resolve_llm_params``.
    """
    # FIX: asyncio.get_event_loop() is deprecated inside a coroutine since
    # Python 3.10; get_running_loop() is the correct (and always-valid) call
    # here since we're in an async def.
    loop = asyncio.get_running_loop()
    start = loop.time()

    def _elapsed_ms() -> int:
        # Monotonic probe duration; hoisted so every exit path agrees.
        return int((loop.time() - start) * 1000)

    attempts = 0
    if not preference:
        # User explicitly turned effort off — nothing to probe. A bare
        # ping with no thinking params is pointless; just report "off".
        return ProbeOutcome(effective_effort=None, attempts=0, elapsed_ms=0)
    # Unknown preference → single-entry cascade: one probe, no fallback.
    cascade = _EFFORT_CASCADE.get(preference, [preference])
    skipped: list[str] = []  # levels the provider rejected synchronously
    last_error: Exception | None = None
    for effort in cascade:
        try:
            params = _resolve_llm_params(
                model_name, hf_token, reasoning_effort=effort, strict=True,
            )
        except UnsupportedEffortError:
            # Provider can't even accept this effort name (e.g. "max" on
            # HF router). Skip without a network call.
            skipped.append(effort)
            continue
        attempts += 1
        try:
            await asyncio.wait_for(
                acompletion(
                    messages=[{"role": "user", "content": "ping"}],
                    max_tokens=_PROBE_MAX_TOKENS,
                    stream=False,
                    **params,
                ),
                timeout=_PROBE_TIMEOUT,
            )
        except Exception as e:
            last_error = e
            if _is_thinking_unsupported(e):
                # Definitive verdict: model does no thinking at all.
                return ProbeOutcome(
                    effective_effort=None,
                    attempts=attempts,
                    elapsed_ms=_elapsed_ms(),
                    note="model doesn't support reasoning, dropped",
                )
            if _is_invalid_effort(e):
                # This level is rejected; the cascade tries the next one.
                logger.debug("probe: %s rejected effort=%s, trying next", model_name, effort)
                continue
            if _is_transient(e):
                # Flaky network/provider — no verdict; caller warns and moves on.
                raise ProbeInconclusive(str(e)) from e
            # Persistent non-thinking 4xx (auth, quota, model-not-found) —
            # let the caller classify & surface.
            raise
        else:
            # Success: first accepted level wins.
            note = None
            if effort != preference:
                note = f"{preference} not supported, using {effort}"
            return ProbeOutcome(
                effective_effort=effort,
                attempts=attempts,
                elapsed_ms=_elapsed_ms(),
                note=note,
            )
    # Cascade exhausted without a success. This only happens when every
    # level was either rejected synchronously (``UnsupportedEffortError``,
    # e.g. preference=max on HF and we also somehow filtered all others)
    # or the provider 400'd ``invalid effort`` on every level.
    if last_error is not None and not _is_invalid_effort(last_error):
        raise last_error
    note = (
        "no effort level accepted — proceeding without thinking"
        if not skipped
        else f"provider rejected all efforts ({', '.join(skipped)})"
    )
    return ProbeOutcome(
        effective_effort=None,
        attempts=attempts,
        elapsed_ms=_elapsed_ms(),
        note=note,
    )