Read max context from litellm + compact at 90% (#54)
* Read max context from litellm.get_model_info + compact at 90%
The local _MAX_TOKENS_MAP had Claude Opus 4.6 pinned at 200k, triggering
compaction at ~20% of its real 1M window. Swap the hand-maintained table
for litellm.get_model_info()['max_input_tokens'], which LiteLLM keeps
in sync upstream (Opus 4.6=1M, GPT-5=272k, Sonnet 4.5=200k, etc.). HF
router-only ids (MiniMax, Kimi, GLM) aren't in litellm's catalog and
fall through to the 200k default — close enough to their advertised
ranges and safe if the model lies.
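Roughly, the new lookup reduces to the sketch below. This is illustrative only: the PR's actual helper is `_get_max_tokens_safe` in session.py (shown in the diff further down); the function name and model handling here are stand-ins.

```python
from litellm import get_model_info

def max_input_tokens(model_id: str, default: int = 200_000) -> int:
    """Illustrative lookup: LiteLLM's catalog value, else a safe default."""
    try:
        info = get_model_info(model_id)  # raises if the model isn't mapped
        return info.get("max_input_tokens") or default
    except Exception:
        return default  # HF-router-only ids land here
```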
Also shifts compaction to fire at 90% of max_context instead of >100%.
The old condition waited until context had already overshot, risking
the next LLM call hitting ContextWindowExceededError before the compact
finished. 90% gives headroom for the summary call + one more turn.
* Drop the legacy -10k buffer — the 90% ratio is the headroom now
ContextManager was subtracting a fixed 10k tokens from max_context on top
of the new 90% compaction threshold, so a 1M-window Opus was triggering
at 891k instead of the intended 900k. Keep max_context == the real model
ceiling; _COMPACT_THRESHOLD_RATIO is the single source of headroom.
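Worked out for a 1M-token Opus window: old trigger `int(0.9 * (1_000_000 - 10_000)) = 891_000`; new trigger `int(0.9 * 1_000_000) = 900_000`.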
* Rename max_context -> model_max_tokens
'max_context' read ambiguously — some reviewers assumed it meant the
compaction threshold. Rename to 'model_max_tokens' so it's unmistakably
the model's real input-token ceiling (what litellm.get_model_info
reports), distinct from the internally-computed compaction threshold.
Touches the ContextManager attribute + ctor param, Session's constructor
kwarg and update_model setter, and the agent loop's debug/warning logs.
No behavior change.
* Simplify compaction gate + rename context_length -> running_context_usage
- ContextManager gains compaction_threshold and needs_compaction
properties so callers stop recomputing "90% of model_max_tokens" by
hand; compact() becomes `if not self.needs_compaction: return` (see the
toy sketch after this list).
- Rename self.context_length to self.running_context_usage. The old
name read like a second ceiling value; the new name says what it is —
the last-reported total_tokens from usage. add_message(), the
ContextWindowExceededError handler, and the compact-finish recompute
all updated accordingly.
- Collapsed _compact_and_notify's local-alias dance (old_length /
max_ctx / threshold) into a single `cm = session.context_manager`.
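
A self-contained toy of the new gate (pattern only; the names mirror the diff below, but this class is not the PR's ContextManager):

```python
class ToyContextManager:
    """Toy version of the compaction gate; not the PR's class."""
    _COMPACT_THRESHOLD_RATIO = 0.9

    def __init__(self, model_max_tokens: int) -> None:
        self.model_max_tokens = model_max_tokens   # real model ceiling, no buffer
        self.running_context_usage = 0             # last reported usage.total_tokens

    @property
    def compaction_threshold(self) -> int:
        return int(self.model_max_tokens * self._COMPACT_THRESHOLD_RATIO)

    @property
    def needs_compaction(self) -> bool:
        return self.running_context_usage > self.compaction_threshold


cm = ToyContextManager(model_max_tokens=1_000_000)
cm.running_context_usage = 905_000
assert cm.needs_compaction  # 905k > the 900k threshold
```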
- agent/context_manager/manager.py +26 -7
- agent/core/agent_loop.py +17 -23
- agent/core/session.py +26 -42
agent/context_manager/manager.py:

```diff
@@ -73,7 +73,7 @@ class ContextManager:
 
     def __init__(
         self,
-        max_context: int = 180_000,
+        model_max_tokens: int = 180_000,
        compact_size: float = 0.1,
        untouched_messages: int = 5,
        tool_specs: list[dict[str, Any]] | None = None,
@@ -87,9 +87,15 @@ class ContextManager:
            hf_token=hf_token,
            local_mode=local_mode,
        )
-        self.max_context = max_context - 10_000
-        self.compact_size = int(max_context * compact_size)
-        self.context_length = 0
+        # The model's real input-token ceiling (from litellm.get_model_info).
+        # Compaction triggers at _COMPACT_THRESHOLD_RATIO below it — see
+        # the compaction_threshold property.
+        self.model_max_tokens = model_max_tokens
+        self.compact_size = int(model_max_tokens * compact_size)
+        # Running count of tokens the last LLM call reported. Drives the
+        # compaction gate; updated in add_message() with each response's
+        # usage.total_tokens.
+        self.running_context_usage = 0
        self.untouched_messages = untouched_messages
        self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
 
@@ -149,7 +155,7 @@ class ContextManager:
    def add_message(self, message: Message, token_count: int = None) -> None:
        """Add a message to the history"""
        if token_count:
-            self.context_length = token_count
+            self.running_context_usage = token_count
        self.items.append(message)
 
    def get_messages(self) -> list[Message]:
@@ -262,6 +268,19 @@ class ContextManager:
            count += 1
        return False
 
+    # Compaction fires at 90% of model_max_tokens so there's headroom for
+    # the next turn's prompt + response before we actually hit the ceiling.
+    _COMPACT_THRESHOLD_RATIO = 0.9
+
+    @property
+    def compaction_threshold(self) -> int:
+        """Token count at which `compact()` kicks in."""
+        return int(self.model_max_tokens * self._COMPACT_THRESHOLD_RATIO)
+
+    @property
+    def needs_compaction(self) -> bool:
+        return self.running_context_usage > self.compaction_threshold and bool(self.items)
+
    async def compact(
        self,
        model_name: str,
@@ -269,7 +288,7 @@ class ContextManager:
        hf_token: str | None = None,
    ) -> None:
        """Remove old messages to keep history under target size"""
-        if self.context_length <= self.max_context:
+        if not self.needs_compaction:
            return
 
        system_msg = (
@@ -325,6 +344,6 @@ class ContextManager:
            head.append(first_user_msg)
        self.items = head + [summarized_message] + recent_messages
 
-        self.context_length = (
+        self.running_context_usage = (
            len(self.system_prompt) // 4 + response.usage.completion_tokens
        )
```
agent/core/agent_loop.py:

```diff
@@ -180,29 +180,27 @@ def _friendly_error_message(error: Exception) -> str | None:
 
 async def _compact_and_notify(session: Session) -> None:
    """Run compaction and send event if context was reduced."""
-    old_length = session.context_manager.context_length
-    max_ctx = session.context_manager.max_context
+    cm = session.context_manager
+    old_usage = cm.running_context_usage
    logger.debug(
-        "Compaction check: …",
-        …
+        "Compaction check: usage=%d, max=%d, threshold=%d, needs_compact=%s",
+        old_usage, cm.model_max_tokens, cm.compaction_threshold, cm.needs_compaction,
    )
-
-    await session.context_manager.compact(
+    await cm.compact(
        model_name=session.config.model_name,
-        tool_specs=…,
+        tool_specs=session.tool_router.get_tool_specs_for_llm(),
        hf_token=session.hf_token,
    )
-
-    if …:
+    new_usage = cm.running_context_usage
+    if new_usage != old_usage:
        logger.warning(
            "Context compacted: %d -> %d tokens (max=%d, %d messages)",
-            …
-            len(session.context_manager.items),
+            old_usage, new_usage, cm.model_max_tokens, len(cm.items),
        )
        await session.send_event(
            Event(
                event_type="compacted",
-                data={"old_tokens": …},
+                data={"old_tokens": old_usage, "new_tokens": new_usage},
            )
        )
 
@@ -576,13 +574,13 @@ class Handlers:
        logger.debug(
            "Agent loop ending: no tool calls. "
            "finish_reason=%s, token_count=%d, "
-            "context_length=%d, max_context=%d, "
+            "usage=%d, model_max_tokens=%d, "
            "iteration=%d/%d, "
            "response_text=%s",
            finish_reason,
            token_count,
-            session.context_manager.context_length,
-            session.context_manager.max_context,
+            session.context_manager.running_context_usage,
+            session.context_manager.model_max_tokens,
            iteration,
            max_iterations,
            (content or "")[:500],
@@ -785,17 +783,13 @@ class Handlers:
 
        except ContextWindowExceededError:
            # Force compact and retry this iteration
+            cm = session.context_manager
            logger.warning(
                "ContextWindowExceededError at iteration %d — forcing compaction "
-                "(context_length=%d, max_context=%d, messages=%d)",
-                iteration,
-                session.context_manager.context_length,
-                session.context_manager.max_context,
-                len(session.context_manager.items),
-            )
-            session.context_manager.context_length = (
-                session.context_manager.max_context + 1
+                "(usage=%d, model_max_tokens=%d, messages=%d)",
+                iteration, cm.running_context_usage, cm.model_max_tokens, len(cm.items),
            )
+            cm.running_context_usage = cm.model_max_tokens + 1
            await _compact_and_notify(session)
            continue
 
```
agent/core/session.py:

```diff
@@ -15,53 +15,37 @@ from agent.context_manager.manager import ContextManager
 
 logger = logging.getLogger(__name__)
 
-# Local max-token lookup — avoids litellm.get_max_tokens() which can hang
-# on network calls for certain providers (known litellm issue).
-_MAX_TOKENS_MAP: dict[str, int] = {
-    "anthropic/claude-opus-4-6": 200_000,
-    "anthropic/claude-opus-4-5-20251101": 200_000,
-    "anthropic/claude-sonnet-4-5-20250929": 200_000,
-    "anthropic/claude-sonnet-4-20250514": 200_000,
-    "anthropic/claude-haiku-3-5-20241022": 200_000,
-    "anthropic/claude-3-5-sonnet-20241022": 200_000,
-    "anthropic/claude-3-opus-20240229": 200_000,
-}
 _DEFAULT_MAX_TOKENS = 200_000
 
 
 def _get_max_tokens_safe(model_name: str) -> int:
-    """Return the max context …"""
-    tokens = _MAX_TOKENS_MAP.get(model_name)
-    if tokens:
-        return tokens
+    """Return the max input-context tokens for a model.
+
+    Primary source: ``litellm.get_model_info(model)['max_input_tokens']`` —
+    LiteLLM maintains an upstream catalog that knows Claude Opus 4.6 is
+    1M, GPT-5 is 272k, Sonnet 4.5 is 200k, and so on. Strips any HF routing
+    suffix / huggingface/ prefix so tagged ids ('moonshotai/Kimi-K2.6:cheapest')
+    look up the bare model. Falls back to a conservative 200k default for
+    models not in the catalog (typically HF-router-only models).
+    """
+    from litellm import get_model_info
 
-    …
+    candidates = [model_name]
+    stripped = model_name.removeprefix("huggingface/").split(":", 1)[0]
+    if stripped != model_name:
+        candidates.append(stripped)
+    for candidate in candidates:
        try:
-            …
-            from litellm import get_max_tokens
-
-            result = get_max_tokens(model_name)
-            if result and isinstance(result, int):
-                return result
-            logger.warning(
-                f"get_max_tokens returned {result} for {model_name}, using default"
-            )
-        except Exception as e:
-            logger.warning(f"get_max_tokens failed for {model_name}, using default: {e}")
+            info = get_model_info(candidate)
+            max_input = info.get("max_input_tokens") if info else None
+            if isinstance(max_input, int) and max_input > 0:
+                return max_input
+        except Exception:
+            continue
+    logger.info(
+        "No litellm.get_model_info entry for %s, falling back to %d",
+        model_name, _DEFAULT_MAX_TOKENS,
+    )
    return _DEFAULT_MAX_TOKENS
 
 
@@ -101,7 +85,7 @@ class Session:
        self.stream = stream
        tool_specs = tool_router.get_tool_specs_for_llm() if tool_router else []
        self.context_manager = context_manager or ContextManager(
-            max_context=_get_max_tokens_safe(config.model_name),
+            model_max_tokens=_get_max_tokens_safe(config.model_name),
            compact_size=0.1,
            untouched_messages=5,
            tool_specs=tool_specs,
@@ -153,7 +137,7 @@ class Session:
    def update_model(self, model_name: str) -> None:
        """Switch the active model and update the context window limit."""
        self.config.model_name = model_name
-        self.context_manager.max_context = _get_max_tokens_safe(model_name)
+        self.context_manager.model_max_tokens = _get_max_tokens_safe(model_name)
 
    def increment_turn(self) -> None:
        """Increment turn counter (called after each user interaction)"""
```
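Worked example of the id-stripping above: for the tagged id from the docstring, `"huggingface/moonshotai/Kimi-K2.6:cheapest"` yields `candidates == ["huggingface/moonshotai/Kimi-K2.6:cheapest", "moonshotai/Kimi-K2.6"]`; the first lookup raises, the second hits litellm's catalog if the bare model is mapped, and otherwise the 200k default applies.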