File size: 2,166 Bytes
5d357ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fac9ff
5d357ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
"""Anthropic prompt caching breakpoints for outgoing LLM requests.

Caching is GA on Anthropic's API and natively supported by litellm >=1.83
via ``cache_control`` blocks. We apply two breakpoints (out of 4 allowed):

  1. The tool block — caches all tool definitions as a single prefix.
  2. The system message — caches the rendered system prompt.

Together these cover the ~4-5K static tokens that were being re-billed on
every turn. Subsequent turns within the 5-minute TTL hit cache_read pricing
(~10% of input cost) instead of full input.

Non-Anthropic models (HF router, OpenAI) are passed through unchanged.
"""

from typing import Any


def with_prompt_caching(
    messages: list[Any],
    tools: list[dict] | None,
    model_name: str | None,
) -> tuple[list[Any], list[dict] | None]:
    """Return (messages, tools) with cache_control breakpoints for Anthropic.

    No-op for non-Anthropic models. Original objects are not mutated; a fresh
    list with replaced first message and last tool is returned, so callers
    that share the underlying ``ContextManager.items`` list don't see their
    persisted history rewritten.
    """
    if not model_name or "anthropic" not in model_name:
        return messages, tools

    if tools:
        new_tools = list(tools)
        last = dict(new_tools[-1])
        last["cache_control"] = {"type": "ephemeral"}
        new_tools[-1] = last
        tools = new_tools

    if messages:
        first = messages[0]
        role = first.get("role") if isinstance(first, dict) else getattr(first, "role", None)
        if role == "system":
            content = (
                first.get("content")
                if isinstance(first, dict)
                else getattr(first, "content", None)
            )
            if isinstance(content, str) and content:
                cached_block = [{
                    "type": "text",
                    "text": content,
                    "cache_control": {"type": "ephemeral"},
                }]
                new_first = {"role": "system", "content": cached_block}
                messages = [new_first] + list(messages[1:])

    return messages, tools