"""Anthropic prompt caching breakpoints for outgoing LLM requests.
Caching is GA on Anthropic's API and natively supported by litellm >=1.83
via ``cache_control`` blocks. We apply two breakpoints (out of 4 allowed):
1. The tool block — caches all tool definitions as a single prefix.
2. The system message — caches the rendered system prompt.
Together these cover the ~4-5K static tokens that were being re-billed on
every turn. Subsequent turns within the 5-minute TTL hit cache_read pricing
(~10% of input cost) instead of full input.
Non-Anthropic models (HF router, OpenAI) are passed through unchanged.
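
Example of an annotated system message as produced below (shape only; this
is the ``cache_control`` block format that litellm forwards to Anthropic)::

    {"role": "system",
     "content": [{"type": "text",
                  "text": "<rendered system prompt>",
                  "cache_control": {"type": "ephemeral"}}]}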
"""
from typing import Any


def with_prompt_caching(
    messages: list[Any],
    tools: list[dict] | None,
    model_name: str | None,
) -> tuple[list[Any], list[dict] | None]:
"""Return (messages, tools) with cache_control breakpoints for Anthropic.
No-op for non-Anthropic models. Original objects are not mutated; a fresh
list with replaced first message and last tool is returned, so callers
that share the underlying ``ContextManager.items`` list don't see their
persisted history rewritten.
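
    Example (doctest-style; the model id is illustrative, any id containing
    ``anthropic`` gets the same treatment)::

        >>> msgs = [{"role": "system", "content": "You are helpful."}]
        >>> new_msgs, _ = with_prompt_caching(msgs, None, "anthropic/claude-3-5-sonnet")
        >>> msgs[0]["content"]  # original message object is untouched
        'You are helpful.'
        >>> new_msgs[0]["content"][0]["cache_control"]
        {'type': 'ephemeral'}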
"""
    if not model_name or "anthropic" not in model_name:
        return messages, tools

    if tools:
        # Breakpoint 1: mark the last tool definition. Anthropic caches the
        # prefix up to and including the marked block, so all tool schemas
        # land in one cache entry. Copy the list and dict so the caller's
        # tools are not mutated.
        new_tools = list(tools)
        last = dict(new_tools[-1])
        last["cache_control"] = {"type": "ephemeral"}
        new_tools[-1] = last
        tools = new_tools
    if messages:
        first = messages[0]
        role = first.get("role") if isinstance(first, dict) else getattr(first, "role", None)
        if role == "system":
            content = (
                first.get("content")
                if isinstance(first, dict)
                else getattr(first, "content", None)
            )
            if isinstance(content, str) and content:
                # Breakpoint 2: rewrap the plain-string system prompt as a
                # content-block list so it can carry its own cache_control
                # marker; the original message object is left untouched.
                cached_block = [{
                    "type": "text",
                    "text": content,
                    "cache_control": {"type": "ephemeral"},
                }]
                new_first = {"role": "system", "content": cached_block}
                messages = [new_first] + list(messages[1:])

    return messages, tools
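

# Usage sketch (illustrative, not called anywhere in this module): a caller
# that already builds ``messages``/``tools`` for litellm would apply the
# breakpoints right before the request. ``litellm.completion`` is litellm's
# standard entry point; ``model_name`` is whatever id the caller routes to,
# and only ids containing "anthropic" get the cache_control treatment.
#
#     import litellm
#
#     messages, tools = with_prompt_caching(messages, tools, model_name)
#     response = litellm.completion(
#         model=model_name,
#         messages=messages,
#         tools=tools,
#     )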