| """Probe-and-cascade for reasoning effort on /model switch. |
| |
| We don't maintain a per-model capability table. Instead, the first time a |
| user picks a model we fire a 1-token ping with the same params we'd use |
| for real and walk down a cascade (``max`` β ``xhigh`` β ``high`` β β¦) |
| until the provider stops rejecting us. The result is cached per-model on |
| the session, so real messages don't pay the probe cost again. |
| |
| Three outcomes, classified from the 400 error text: |
| |
| * success β cache the effort that worked |
| * ``"thinking ... not supported"`` β model doesn't do thinking at all; |
| cache ``None`` so we stop sending thinking params |
| * ``"effort ... invalid"`` / synonyms β cascade walks down and retries |
| |
| Transient errors (5xx, timeout, connection reset) bubble out as |
| ``ProbeInconclusive`` so the caller can complete the switch with a |
| warning instead of blocking on a flaky provider. |
| """ |

from __future__ import annotations

import asyncio
import logging
import time
from dataclasses import dataclass
from typing import Any

from litellm import acompletion

from agent.core.llm_params import UnsupportedEffortError, _resolve_llm_params

logger = logging.getLogger(__name__)


_EFFORT_CASCADE: dict[str, list[str]] = {
    "max": ["max", "xhigh", "high", "medium", "low"],
    "xhigh": ["xhigh", "high", "medium", "low"],
    "high": ["high", "medium", "low"],
    "medium": ["medium", "low"],
    "minimal": ["minimal", "low"],
    "low": ["low"],
}
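
# For example, a preference of "xhigh" walks four levels; a preference with no
# entry here is tried as-is (see the ``.get(preference, [preference])`` lookup
# in ``probe_effort``):
#
#     _EFFORT_CASCADE.get("xhigh", ["xhigh"])    # ["xhigh", "high", "medium", "low"]
#     _EFFORT_CASCADE.get("custom", ["custom"])  # ["custom"]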

_PROBE_TIMEOUT = 15.0  # per-attempt ceiling on the probe call, in seconds
_PROBE_MAX_TOKENS = 64  # max_tokens sent on the probe request


class ProbeInconclusive(Exception):
    """The probe couldn't reach a verdict (transient network / provider error).

    Caller should complete the switch with a warning; the next real call
    will re-surface the error if it's persistent.
    """


@dataclass
class ProbeOutcome:
    """What the probe learned. ``effective_effort`` semantics match the cache:

    * str → send this level
    * None → model doesn't support thinking; strip it
    """

    effective_effort: str | None
    attempts: int
    elapsed_ms: int
    note: str | None = None
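
# Illustrative outcomes (timings invented; note strings match those produced
# by ``probe_effort`` below):
#
#     ProbeOutcome(effective_effort="high", attempts=2, elapsed_ms=840,
#                  note="xhigh not supported, using high")
#     ProbeOutcome(effective_effort=None, attempts=1, elapsed_ms=310,
#                  note="model doesn't support reasoning, dropped")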


def _is_thinking_unsupported(e: Exception) -> bool:
    """Model rejected any thinking config.

    Matches Anthropic's 'thinking.type.enabled is not supported for this
    model' as well as the adaptive variant. Substring-match because the
    exact wording shifts across API versions.
    """
    s = str(e).lower()
    return "thinking" in s and "not supported" in s


def _is_invalid_effort(e: Exception) -> bool:
    """The requested effort level isn't accepted for this model.

    Covers both API responses (Anthropic/OpenAI 400 with "invalid", "must
    be one of", etc.) and LiteLLM's local validation that fires *before*
    the request (e.g. "effort='max' is only supported by Claude Opus 4.6":
    LiteLLM knows max is Opus-4.6-only and raises synchronously). The
    cascade walks down on either.

    Explicitly returns False when the message is really about thinking
    itself (e.g. Anthropic's 4.7 error mentions ``output_config.effort``
    in its fix hint, but the actual failure is ``thinking.type.enabled``
    being unsupported). That case is caught by ``_is_thinking_unsupported``.
    """
    if _is_thinking_unsupported(e):
        return False
    s = str(e).lower()
    if "effort" not in s and "output_config" not in s:
        return False
    return any(
        phrase in s
        for phrase in (
            "invalid",
            "not supported",
            "must be one of",
            "not a valid",
            "unrecognized",
            "unknown",
            # wording used by LiteLLM's local, pre-request validation
            "only supported by",
            "is only supported",
        )
    )
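
# Example classifications (wording approximate):
#
#     _is_invalid_effort(Exception("400: reasoning effort 'max' is invalid"))   # True
#     _is_invalid_effort(Exception("effort='max' is only supported by ..."))    # True
#     _is_invalid_effort(
#         Exception("thinking.type.enabled is not supported for this model"))   # False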


def _is_transient(e: Exception) -> bool:
    """Network / provider-side flake. Keep in sync with agent_loop's list.

    Also matches by type for ``asyncio.TimeoutError``; its ``str(e)`` is
    empty, so substring matching alone misses it.
    """
    if isinstance(e, (asyncio.TimeoutError, TimeoutError)):
        return True
    s = str(e).lower()
    return any(
        p in s
        for p in (
            "timeout",
            "timed out",
            "429",
            "rate limit",
            "503",
            "service unavailable",
            "502",
            "bad gateway",
            "500",
            "internal server error",
            "overloaded",
            "capacity",
            "connection reset",
            "connection refused",
            "connection error",
            "eof",
            "broken pipe",
        )
    )
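
# Example classifications:
#
#     _is_transient(asyncio.TimeoutError())                 # True (by type; str() is empty)
#     _is_transient(Exception("503 Service Unavailable"))   # True
#     _is_transient(Exception("effort 'max' is invalid"))   # False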


async def probe_effort(
    model_name: str,
    preference: str | None,
    hf_token: str | None,
    session: Any = None,
) -> ProbeOutcome:
    """Walk the cascade for ``preference`` on ``model_name``.

    Returns the first effort the provider accepts, or ``None`` if it
    rejects thinking altogether. Raises ``ProbeInconclusive`` only for
    transient errors (5xx, timeout); persistent 4xx that aren't thinking/
    effort related bubble as the original exception so callers can surface
    them (auth, model-not-found, quota, etc.).

    ``session`` is optional; when provided, each successful probe attempt
    is recorded via ``telemetry.record_llm_call(kind="effort_probe")`` so
    the cost shows up in the session's ``total_cost_usd``. Failed probes
    (rejected by the provider) typically aren't billed, so we only record
    on success.
    """
    loop = asyncio.get_running_loop()
    start = loop.time()
    attempts = 0

    if not preference:
        # Nothing requested: skip the probe and don't send thinking params.
        return ProbeOutcome(effective_effort=None, attempts=0, elapsed_ms=0)

    # Local/self-hosted runtimes don't get provider-style effort params,
    # so there's nothing to probe.
    _LOCAL_PREFIXES = {
        "llamacpp", "lmstudio", "mlx", "nim", "local",
        "ollama", "vllm", "tgi",
    }
    if model_name.split("/", 1)[0] in _LOCAL_PREFIXES:
        return ProbeOutcome(
            effective_effort=None,
            attempts=0,
            elapsed_ms=0,
            note="local provider: reasoning effort skipped",
        )

    cascade = _EFFORT_CASCADE.get(preference, [preference])
    skipped: list[str] = []

    last_error: Exception | None = None
    for effort in cascade:
        try:
            params = _resolve_llm_params(
                model_name,
                hf_token,
                reasoning_effort=effort,
                strict=True,
            )
        except UnsupportedEffortError:
            # Local validation already rules this level out; don't spend a
            # network attempt on it.
            skipped.append(effort)
            continue

        attempts += 1
        try:
            _t0 = time.monotonic()
            response = await asyncio.wait_for(
                acompletion(
                    messages=[{"role": "user", "content": "ping"}],
                    max_tokens=_PROBE_MAX_TOKENS,
                    stream=False,
                    **params,
                ),
                timeout=_PROBE_TIMEOUT,
            )
            if session is not None:
                # Best-effort cost accounting; a telemetry failure must never
                # break the probe itself.
                try:
                    from agent.core import telemetry

                    await telemetry.record_llm_call(
                        session,
                        model=model_name,
                        response=response,
                        latency_ms=int((time.monotonic() - _t0) * 1000),
                        finish_reason=response.choices[0].finish_reason
                        if response.choices
                        else None,
                        kind="effort_probe",
                    )
                except Exception as _telem_err:
                    logger.debug("effort_probe telemetry failed: %s", _telem_err)
        except Exception as e:
            last_error = e
            if _is_thinking_unsupported(e):
                elapsed = int((loop.time() - start) * 1000)
                return ProbeOutcome(
                    effective_effort=None,
                    attempts=attempts,
                    elapsed_ms=elapsed,
                    note="model doesn't support reasoning, dropped",
                )
            if _is_invalid_effort(e):
                logger.debug(
                    "probe: %s rejected effort=%s, trying next", model_name, effort
                )
                continue
            if _is_transient(e):
                raise ProbeInconclusive(str(e)) from e
            # Persistent, unrelated failure (auth, model-not-found, quota):
            # surface the original exception to the caller.
            raise
        else:
            elapsed = int((loop.time() - start) * 1000)
            note = None
            if effort != preference:
                note = f"{preference} not supported, using {effort}"
            return ProbeOutcome(
                effective_effort=effort,
                attempts=attempts,
                elapsed_ms=elapsed,
                note=note,
            )

    # Cascade exhausted: every level was either rejected by the provider or
    # skipped by local validation.
    elapsed = int((loop.time() - start) * 1000)
    if last_error is not None and not _is_invalid_effort(last_error):
        # Defensive: anything other than an invalid-effort rejection should
        # already have been raised inside the loop.
        raise last_error
    note = (
        "no effort level accepted; proceeding without thinking"
        if not skipped
        else f"no effort level accepted (skipped locally: {', '.join(skipped)})"
    )
    return ProbeOutcome(
        effective_effort=None,
        attempts=attempts,
        elapsed_ms=elapsed,
        note=note,
    )