"""Anthropic prompt caching breakpoints for outgoing LLM requests.
Caching is GA on Anthropic's API and natively supported by litellm >=1.83
via ``cache_control`` blocks. We apply two breakpoints (out of 4 allowed):
1. The tool block — caches all tool definitions as a single prefix.
2. The system message — caches the rendered system prompt.
Together these cover the ~4-5K static tokens that were being re-billed on
every turn. Subsequent turns within the 5-minute TTL hit cache_read pricing
(~10% of input cost) instead of full input.
Non-Anthropic models (HF router, OpenAI) are passed through unchanged.
"""
from typing import Any
def with_prompt_caching(
messages: list[Any],
tools: list[dict] | None,
model_name: str | None,
) -> tuple[list[Any], list[dict] | None]:
"""Return (messages, tools) with cache_control breakpoints for Anthropic.
No-op for non-Anthropic models. Original objects are not mutated; a fresh
list with replaced first message and last tool is returned, so callers
that share the underlying ``ContextManager.items`` list don't see their
persisted history rewritten.
"""
if not model_name or "anthropic" not in model_name:
return messages, tools
if tools:
new_tools = list(tools)
last = dict(new_tools[-1])
last["cache_control"] = {"type": "ephemeral"}
new_tools[-1] = last
tools = new_tools
if messages:
first = messages[0]
role = first.get("role") if isinstance(first, dict) else getattr(first, "role", None)
if role == "system":
content = (
first.get("content")
if isinstance(first, dict)
else getattr(first, "content", None)
)
if isinstance(content, str) and content:
cached_block = [{
"type": "text",
"text": content,
"cache_control": {"type": "ephemeral"},
}]
new_first = {"role": "system", "content": cached_block}
messages = [new_first] + list(messages[1:])
return messages, tools