Aksel Joonas Reedi committed
Commit 28b8f2b · unverified · 1 Parent(s): 5ab7c4e

Read max context from litellm + compact at 90% (#54)

* Read max context from litellm.get_model_info + compact at 90%

The local _MAX_TOKENS_MAP had Claude Opus 4.6 pinned at 200k, triggering
compaction at ~20% of its real 1M window. Swap the hand-maintained table
for litellm.get_model_info()['max_input_tokens'], which LiteLLM keeps
in sync upstream (Opus 4.6=1M, GPT-5=272k, Sonnet 4.5=200k, etc.). HF
router-only ids (MiniMax, Kimi, GLM) aren't in litellm's catalog and
fall through to the 200k default — close enough to their advertised
ranges and safe if the model lies.
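
A minimal sketch of the lookup this commit moves to (model id and figure are
taken from the list above; exact values depend on the installed litellm
version):

    from litellm import get_model_info

    info = get_model_info("anthropic/claude-sonnet-4-5-20250929")
    print(info["max_input_tokens"])  # 200_000, per litellm's upstream catalog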

Also shifts compaction to fire at 90% of max_context instead of >100%.
The old condition waited until context had already overshot, risking
the next LLM call hitting ContextWindowExceededError before the compact
finished. 90% gives headroom for the summary call + one more turn.

* Drop the legacy -10k buffer — the 90% ratio is the headroom now

ContextManager was subtracting a fixed 10k tokens from max_context on top
of the new 90% compaction threshold, so a 1M-window Opus was triggering
at 891k instead of the intended 900k. Keep max_context == the real model
ceiling; _COMPACT_THRESHOLD_RATIO is the single source of headroom.
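
Back-of-envelope with the numbers above (a sketch, assuming litellm reports
Opus 4.6's window as 1M):

    ceiling = 1_000_000
    old_trigger = int((ceiling - 10_000) * 0.9)  # 891_000: the 10k buffer stacked on the ratio
    new_trigger = int(ceiling * 0.9)             # 900_000: the ratio is the only headroom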

* Rename max_context -> model_max_tokens

'max_context' read ambiguously — some reviewers assumed it meant the
compaction threshold. Rename to 'model_max_tokens' so it's unmistakably
the model's real input-token ceiling (what litellm.get_model_info
reports), distinct from the internally-computed compaction threshold.

Touches the ContextManager attribute + ctor param, Session's constructor
kwarg and update_model setter, and the agent loop's debug/warning logs.
No behavior change.

* Simplify compaction gate + rename context_length -> running_context_usage

- ContextManager gains compaction_threshold and needs_compaction
properties so callers stop recomputing "90% of model_max_tokens" by
hand. compact() becomes `if not self.needs_compaction: return`.
- Rename self.context_length to self.running_context_usage. The old
name read like a second ceiling value; the new name says what it is —
the last-reported total_tokens from usage. add_message(), the
ContextWindowExceededError handler, and the compact-finish recompute
all updated accordingly.
- Collapsed _compact_and_notify's local-alias dance (old_length /
max_ctx / threshold) into a single `cm = session.context_manager`.
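
Caller-side, the gate now reads roughly like this (a sketch using the names
from the diff below; the compact() call is illustrative, not a drop-in):

    cm = session.context_manager
    threshold = cm.compaction_threshold      # int(cm.model_max_tokens * 0.9)
    if cm.needs_compaction:                  # running_context_usage > threshold
        await cm.compact(
            model_name=session.config.model_name,
            tool_specs=session.tool_router.get_tool_specs_for_llm(),
            hf_token=session.hf_token,
        )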

agent/context_manager/manager.py CHANGED
@@ -73,7 +73,7 @@ class ContextManager:
 
     def __init__(
         self,
-        max_context: int = 180_000,
+        model_max_tokens: int = 180_000,
         compact_size: float = 0.1,
         untouched_messages: int = 5,
         tool_specs: list[dict[str, Any]] | None = None,
@@ -87,9 +87,15 @@ class ContextManager:
             hf_token=hf_token,
             local_mode=local_mode,
         )
-        self.max_context = max_context - 10000
-        self.compact_size = int(max_context * compact_size)
-        self.context_length = 0  # Updated after each LLM call with actual usage
+        # The model's real input-token ceiling (from litellm.get_model_info).
+        # Compaction triggers at _COMPACT_THRESHOLD_RATIO below it — see
+        # the compaction_threshold property.
+        self.model_max_tokens = model_max_tokens
+        self.compact_size = int(model_max_tokens * compact_size)
+        # Running count of tokens the last LLM call reported. Drives the
+        # compaction gate; updated in add_message() with each response's
+        # usage.total_tokens.
+        self.running_context_usage = 0
         self.untouched_messages = untouched_messages
         self.items: list[Message] = [Message(role="system", content=self.system_prompt)]
 
@@ -149,7 +155,7 @@ class ContextManager:
     def add_message(self, message: Message, token_count: int = None) -> None:
         """Add a message to the history"""
         if token_count:
-            self.context_length = token_count
+            self.running_context_usage = token_count
         self.items.append(message)
 
     def get_messages(self) -> list[Message]:
@@ -262,6 +268,19 @@ class ContextManager:
             count += 1
         return False
 
+    # Compaction fires at 90% of model_max_tokens so there's headroom for
+    # the next turn's prompt + response before we actually hit the ceiling.
+    _COMPACT_THRESHOLD_RATIO = 0.9
+
+    @property
+    def compaction_threshold(self) -> int:
+        """Token count at which `compact()` kicks in."""
+        return int(self.model_max_tokens * self._COMPACT_THRESHOLD_RATIO)
+
+    @property
+    def needs_compaction(self) -> bool:
+        return self.running_context_usage > self.compaction_threshold and bool(self.items)
+
     async def compact(
         self,
         model_name: str,
@@ -269,7 +288,7 @@ class ContextManager:
         hf_token: str | None = None,
     ) -> None:
         """Remove old messages to keep history under target size"""
-        if (self.context_length <= self.max_context) or not self.items:
+        if not self.needs_compaction:
             return
 
         system_msg = (
@@ -325,6 +344,6 @@ class ContextManager:
         head.append(first_user_msg)
         self.items = head + [summarized_message] + recent_messages
 
-        self.context_length = (
+        self.running_context_usage = (
             len(self.system_prompt) // 4 + response.usage.completion_tokens
         )

agent/core/agent_loop.py CHANGED
@@ -180,29 +180,27 @@ def _friendly_error_message(error: Exception) -> str | None:
 
 async def _compact_and_notify(session: Session) -> None:
     """Run compaction and send event if context was reduced."""
-    old_length = session.context_manager.context_length
-    max_ctx = session.context_manager.max_context
+    cm = session.context_manager
+    old_usage = cm.running_context_usage
     logger.debug(
-        "Compaction check: context_length=%d, max_context=%d, needs_compact=%s",
-        old_length, max_ctx, old_length > max_ctx,
+        "Compaction check: usage=%d, max=%d, threshold=%d, needs_compact=%s",
+        old_usage, cm.model_max_tokens, cm.compaction_threshold, cm.needs_compaction,
     )
-    tool_specs = session.tool_router.get_tool_specs_for_llm()
-    await session.context_manager.compact(
+    await cm.compact(
         model_name=session.config.model_name,
-        tool_specs=tool_specs,
+        tool_specs=session.tool_router.get_tool_specs_for_llm(),
         hf_token=session.hf_token,
     )
-    new_length = session.context_manager.context_length
-    if new_length != old_length:
+    new_usage = cm.running_context_usage
+    if new_usage != old_usage:
        logger.warning(
            "Context compacted: %d -> %d tokens (max=%d, %d messages)",
-            old_length, new_length, max_ctx,
-            len(session.context_manager.items),
+            old_usage, new_usage, cm.model_max_tokens, len(cm.items),
        )
        await session.send_event(
            Event(
                event_type="compacted",
-                data={"old_tokens": old_length, "new_tokens": new_length},
+                data={"old_tokens": old_usage, "new_tokens": new_usage},
            )
        )
 
@@ -576,13 +574,13 @@ class Handlers:
         logger.debug(
             "Agent loop ending: no tool calls. "
             "finish_reason=%s, token_count=%d, "
-            "context_length=%d, max_context=%d, "
+            "usage=%d, model_max_tokens=%d, "
             "iteration=%d/%d, "
             "response_text=%s",
             finish_reason,
             token_count,
-            session.context_manager.context_length,
-            session.context_manager.max_context,
+            session.context_manager.running_context_usage,
+            session.context_manager.model_max_tokens,
             iteration,
             max_iterations,
             (content or "")[:500],
@@ -785,17 +783,13 @@ class Handlers:
 
         except ContextWindowExceededError:
             # Force compact and retry this iteration
+            cm = session.context_manager
             logger.warning(
                 "ContextWindowExceededError at iteration %d — forcing compaction "
-                "(context_length=%d, max_context=%d, messages=%d)",
-                iteration,
-                session.context_manager.context_length,
-                session.context_manager.max_context,
-                len(session.context_manager.items),
-            )
-            session.context_manager.context_length = (
-                session.context_manager.max_context + 1
+                "(usage=%d, model_max_tokens=%d, messages=%d)",
+                iteration, cm.running_context_usage, cm.model_max_tokens, len(cm.items),
             )
+            cm.running_context_usage = cm.model_max_tokens + 1
             await _compact_and_notify(session)
             continue
 
agent/core/session.py CHANGED
@@ -15,53 +15,37 @@ from agent.context_manager.manager import ContextManager
 
 logger = logging.getLogger(__name__)
 
-# Local max-token lookup — avoids litellm.get_max_tokens() which can hang
-# on network calls for certain providers (known litellm issue).
-_MAX_TOKENS_MAP: dict[str, int] = {
-    "anthropic/claude-opus-4-6": 200_000,
-    "anthropic/claude-opus-4-5-20251101": 200_000,
-    "anthropic/claude-sonnet-4-5-20250929": 200_000,
-    "anthropic/claude-sonnet-4-20250514": 200_000,
-    "anthropic/claude-haiku-3-5-20241022": 200_000,
-    "anthropic/claude-3-5-sonnet-20241022": 200_000,
-    "anthropic/claude-3-opus-20240229": 200_000,
-}
 _DEFAULT_MAX_TOKENS = 200_000
 
 
 def _get_max_tokens_safe(model_name: str) -> int:
-    """Return the max context window for a model.
-
-    Anthropic/OpenAI ids hit the local table; HF router ids ask the catalog
-    (cached) for the max ``context_length`` across live providers. Falls back
-    to ``_DEFAULT_MAX_TOKENS`` if nothing is available.
+    """Return the max input-context tokens for a model.
+
+    Primary source: ``litellm.get_model_info(model)['max_input_tokens']``
+    LiteLLM maintains an upstream catalog that knows Claude Opus 4.6 is
+    1M, GPT-5 is 272k, Sonnet 4.5 is 200k, and so on. Strips any HF routing
+    suffix / huggingface/ prefix so tagged ids ('moonshotai/Kimi-K2.6:cheapest')
+    look up the bare model. Falls back to a conservative 200k default for
+    models not in the catalog (typically HF-router-only models).
     """
-    tokens = _MAX_TOKENS_MAP.get(model_name)
-    if tokens:
-        return tokens
+    from litellm import get_model_info
 
-    if not model_name.startswith(("anthropic/", "openai/")):
+    candidates = [model_name]
+    stripped = model_name.removeprefix("huggingface/").split(":", 1)[0]
+    if stripped != model_name:
+        candidates.append(stripped)
+    for candidate in candidates:
         try:
-            from agent.core import hf_router_catalog as cat
-
-            bare = model_name.removeprefix("huggingface/").split(":", 1)[0]
-            info = cat.lookup(bare)
-            if info and info.max_context_length:
-                return info.max_context_length
-        except Exception as e:
-            logger.warning("HF catalog lookup failed for %s: %s", model_name, e)
-
-    try:
-        from litellm import get_max_tokens
-
-        result = get_max_tokens(model_name)
-        if result and isinstance(result, int):
-            return result
-        logger.warning(
-            f"get_max_tokens returned {result} for {model_name}, using default"
-        )
-    except Exception as e:
-        logger.warning(f"get_max_tokens failed for {model_name}, using default: {e}")
+            info = get_model_info(candidate)
+            max_input = info.get("max_input_tokens") if info else None
+            if isinstance(max_input, int) and max_input > 0:
+                return max_input
+        except Exception:
+            continue
+    logger.info(
+        "No litellm.get_model_info entry for %s, falling back to %d",
+        model_name, _DEFAULT_MAX_TOKENS,
+    )
     return _DEFAULT_MAX_TOKENS
 
 
@@ -101,7 +85,7 @@ class Session:
         self.stream = stream
         tool_specs = tool_router.get_tool_specs_for_llm() if tool_router else []
         self.context_manager = context_manager or ContextManager(
-            max_context=_get_max_tokens_safe(config.model_name),
+            model_max_tokens=_get_max_tokens_safe(config.model_name),
             compact_size=0.1,
             untouched_messages=5,
             tool_specs=tool_specs,
@@ -153,7 +137,7 @@ class Session:
     def update_model(self, model_name: str) -> None:
         """Switch the active model and update the context window limit."""
         self.config.model_name = model_name
-        self.context_manager.max_context = _get_max_tokens_safe(model_name)
+        self.context_manager.model_max_tokens = _get_max_tokens_safe(model_name)
 
     def increment_turn(self) -> None:
         """Increment turn counter (called after each user interaction)"""