feat: enable Anthropic prompt caching on system prompt and tools (#69)
* feat: enable Anthropic prompt caching on system prompt and tools
Mark the rendered system prompt and the tool block with cache_control
breakpoints when calling Anthropic models. The static prefix (~4-5K
tokens of system prompt + 15+ tool definitions) was being re-billed at
full input rate on every turn, every retry, and every research
sub-agent iteration (up to 60 per task).
With ephemeral cache breakpoints, subsequent turns within the 5-minute
TTL are billed at cache-read pricing (~10% of input cost). Expected
savings: 40-50% input tokens on multi-turn conversations, 60-80% on
research sub-agent loops.
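
For a rough sense of where those percentages could come from, here is an illustrative back-of-envelope calculation. All token counts are assumptions rather than measurements from this Space, and the multipliers are Anthropic's published rates (5-minute cache writes bill at ~1.25x the base input rate, cache reads at ~0.1x):

```python
# Illustrative arithmetic only -- assumed sizes, not measured ones.
PREFIX = 4_500        # static system prompt + tool definitions (assumed)
NEW_PER_TURN = 500    # fresh conversation tokens added each turn (assumed)
TURNS = 10

# Without caching: every turn re-bills the prefix plus the growing history.
uncached = sum(PREFIX + NEW_PER_TURN * t for t in range(1, TURNS + 1))

# With caching: one cache write (~1.25x), then cache reads (~0.1x) for the
# prefix; the growing conversation history is still billed at the full rate.
cached = (
    PREFIX * 1.25
    + PREFIX * 0.10 * (TURNS - 1)
    + sum(NEW_PER_TURN * t for t in range(1, TURNS + 1))
)
print(f"billed input with caching vs. without: {cached / uncached:.0%}")  # ~51%
```

With these particular assumptions the billed input drops to roughly half, i.e. inside the 40-50% savings range quoted above; the real figure depends on how large the per-turn additions are relative to the cached prefix.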
Caching is GA in the Anthropic API and natively supported by litellm
1.83+ via cache_control blocks (no beta header required). Non-Anthropic
models (HF router, OpenAI) are passed through unchanged.
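
For reference, a minimal sketch of what such a request looks like when sent through litellm; the model id, prompt text, and tool are placeholders, and the cache_control placement mirrors the helper introduced below (a text block on the system message, a top-level key on the last tool):

```python
# Sketch only -- placeholder model id, prompt, and tool definition.
import asyncio

from litellm import acompletion


async def main() -> None:
    response = await acompletion(
        model="anthropic/claude-sonnet-4-20250514",  # placeholder model id
        messages=[
            {
                "role": "system",
                "content": [{
                    "type": "text",
                    "text": "<rendered system prompt>",     # placeholder text
                    "cache_control": {"type": "ephemeral"},  # breakpoint 1
                }],
            },
            {"role": "user", "content": "First user turn"},
        ],
        tools=[{
            "type": "function",
            "function": {
                "name": "example_tool",  # placeholder tool definition
                "description": "Illustrative tool.",
                "parameters": {"type": "object", "properties": {}},
            },
            "cache_control": {"type": "ephemeral"},  # breakpoint 2: last tool
        }],
    )
    print(response.choices[0].message.content)


asyncio.run(main())
```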
The helper does not mutate the caller's message list or tool list, so
the persisted ContextManager.items history stays in its original
string-content form.
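
A small illustrative check of that guarantee (placeholder message and tool; the model string only needs the `anthropic/` prefix for the helper to activate):

```python
# Illustrative usage of the new helper; inputs are placeholders.
from agent.core.prompt_caching import with_prompt_caching

messages = [{"role": "system", "content": "You are the agent."}]
tools = [{"type": "function", "function": {"name": "demo", "parameters": {}}}]

new_messages, new_tools = with_prompt_caching(
    messages, tools, "anthropic/claude-sonnet-4-20250514"  # placeholder model id
)

assert messages[0]["content"] == "You are the agent."          # original untouched
assert "cache_control" not in tools[0]                          # original untouched
assert isinstance(new_messages[0]["content"], list)             # copy carries the text block
assert new_tools[-1]["cache_control"] == {"type": "ephemeral"}  # copy carries the breakpoint
```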
* refactor: hoist prompt_caching imports to module level, drop cached_ prefix
- agent/context_manager/manager.py +5 -0
- agent/core/agent_loop.py +3 -0
- agent/core/prompt_caching.py +59 -0
- agent/tools/research_tool.py +10 -4
```diff
--- a/agent/context_manager/manager.py
+++ b/agent/context_manager/manager.py
@@ -13,6 +13,8 @@ import yaml
 from jinja2 import Template
 from litellm import Message, acompletion
 
+from agent.core.prompt_caching import with_prompt_caching
+
 logger = logging.getLogger(__name__)
 
 _HF_WHOAMI_URL = "https://huggingface.co/api/whoami-v2"
@@ -114,6 +116,9 @@ async def summarize_messages(
 
     prompt_messages = list(messages) + [Message(role="user", content=prompt)]
     llm_params = _resolve_llm_params(model_name, hf_token, reasoning_effort="high")
+    prompt_messages, tool_specs = with_prompt_caching(
+        prompt_messages, tool_specs, llm_params.get("model")
+    )
     response = await acompletion(
         messages=prompt_messages,
         max_completion_tokens=max_tokens,
```
```diff
--- a/agent/core/agent_loop.py
+++ b/agent/core/agent_loop.py
@@ -14,6 +14,7 @@ from litellm.exceptions import ContextWindowExceededError
 from agent.config import Config
 from agent.core.doom_loop import check_for_doom_loop
 from agent.core.llm_params import _resolve_llm_params
+from agent.core.prompt_caching import with_prompt_caching
 from agent.core.session import Event, OpType, Session
 from agent.core.tools import ToolRouter
 from agent.tools.jobs_tool import CPU_FLAVORS
@@ -296,6 +297,7 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
     """Call the LLM with streaming, emitting assistant_chunk events."""
     response = None
     _healed_effort = False  # one-shot safety net per call
+    messages, tools = with_prompt_caching(messages, tools, llm_params.get("model"))
     for _llm_attempt in range(_MAX_LLM_RETRIES):
         try:
             response = await acompletion(
@@ -390,6 +392,7 @@ async def _call_llm_non_streaming(session: Session, messages, tools, llm_params)
     """Call the LLM without streaming, emit assistant_message at the end."""
     response = None
     _healed_effort = False
+    messages, tools = with_prompt_caching(messages, tools, llm_params.get("model"))
     for _llm_attempt in range(_MAX_LLM_RETRIES):
         try:
             response = await acompletion(
```
```diff
--- /dev/null
+++ b/agent/core/prompt_caching.py
@@ -0,0 +1,59 @@
+"""Anthropic prompt caching breakpoints for outgoing LLM requests.
+
+Caching is GA on Anthropic's API and natively supported by litellm >=1.83
+via ``cache_control`` blocks. We apply two breakpoints (out of 4 allowed):
+
+1. The tool block — caches all tool definitions as a single prefix.
+2. The system message — caches the rendered system prompt.
+
+Together these cover the ~4-5K static tokens that were being re-billed on
+every turn. Subsequent turns within the 5-minute TTL hit cache_read pricing
+(~10% of input cost) instead of full input.
+
+Non-Anthropic models (HF router, OpenAI) are passed through unchanged.
+"""
+
+from typing import Any
+
+
+def with_prompt_caching(
+    messages: list[Any],
+    tools: list[dict] | None,
+    model_name: str | None,
+) -> tuple[list[Any], list[dict] | None]:
+    """Return (messages, tools) with cache_control breakpoints for Anthropic.
+
+    No-op for non-Anthropic models. Original objects are not mutated; a fresh
+    list with replaced first message and last tool is returned, so callers
+    that share the underlying ``ContextManager.items`` list don't see their
+    persisted history rewritten.
+    """
+    if not model_name or not model_name.startswith("anthropic/"):
+        return messages, tools
+
+    if tools:
+        new_tools = list(tools)
+        last = dict(new_tools[-1])
+        last["cache_control"] = {"type": "ephemeral"}
+        new_tools[-1] = last
+        tools = new_tools
+
+    if messages:
+        first = messages[0]
+        role = first.get("role") if isinstance(first, dict) else getattr(first, "role", None)
+        if role == "system":
+            content = (
+                first.get("content")
+                if isinstance(first, dict)
+                else getattr(first, "content", None)
+            )
+            if isinstance(content, str) and content:
+                cached_block = [{
+                    "type": "text",
+                    "text": content,
+                    "cache_control": {"type": "ephemeral"},
+                }]
+                new_first = {"role": "system", "content": cached_block}
+                messages = [new_first] + list(messages[1:])
+
+    return messages, tools
```
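To confirm the breakpoints are actually being hit, Anthropic reports cache activity in the response usage as `cache_creation_input_tokens` (writes) and `cache_read_input_tokens` (reads). A hedged sketch of a logging helper, assuming litellm surfaces those provider fields on `response.usage` (availability varies by litellm version, hence the getattr fallbacks):

```python
# Sketch of a cache-usage logger; not part of this commit.
def log_cache_usage(response) -> None:
    """Print Anthropic prompt-cache counters from a litellm response, if present."""
    usage = getattr(response, "usage", None)
    created = getattr(usage, "cache_creation_input_tokens", None)
    read = getattr(usage, "cache_read_input_tokens", None)
    print(f"cache write tokens: {created}, cache read tokens: {read}")
```

On the first Anthropic call you would expect a non-zero write count roughly the size of the system prompt plus tool definitions; on later calls within the 5-minute TTL, a non-zero read count of about the same size.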
```diff
--- a/agent/tools/research_tool.py
+++ b/agent/tools/research_tool.py
@@ -15,6 +15,7 @@ from litellm import Message, acompletion
 
 from agent.core.doom_loop import check_for_doom_loop
 from agent.core.llm_params import _resolve_llm_params
+from agent.core.prompt_caching import with_prompt_caching
 from agent.core.session import Event
 
 logger = logging.getLogger(__name__)
@@ -323,8 +324,9 @@ async def research_handler(
         ),
     ))
     try:
+        _msgs, _ = with_prompt_caching(messages, None, llm_params.get("model"))
         response = await acompletion(
-            messages=messages,
+            messages=_msgs,
             tools=None,  # no tools — force text response
             stream=False,
             timeout=120,
@@ -348,9 +350,12 @@ async def research_handler(
     ))
 
     try:
+        _msgs, _tools = with_prompt_caching(
+            messages, tool_specs if tool_specs else None, llm_params.get("model")
+        )
         response = await acompletion(
-            messages=messages,
-            tools=tool_specs if tool_specs else None,
+            messages=_msgs,
+            tools=_tools,
             tool_choice="auto",
             stream=False,
             timeout=120,
@@ -446,8 +451,9 @@ async def research_handler(
         ),
     ))
     try:
+        _msgs, _ = with_prompt_caching(messages, None, llm_params.get("model"))
         response = await acompletion(
-            messages=messages,
+            messages=_msgs,
             tools=None,
             stream=False,
             timeout=120,
```
|