Upload aco/cache_layout.py with huggingface_hub
aco/cache_layout.py  +68 -172
CHANGED
@@ -1,186 +1,82 @@
-"""Cache-Aware Prompt Layout
-
-Optimizes prompt/context structure for prefix-cache reuse.
-
-Strategy:
-- Keep stable rules in the prefix (system rules, tool descriptions, user preferences)
-- Keep tool descriptions stable
-- Move dynamic content to the suffix (user message, retrieved docs, recent trace, artifacts)
-- Avoid injecting timestamps/random metadata above cache boundary
-- Preserve sticky provider/session routing where useful
-
-Metrics:
-- cache hit rate
-- warm-cache cost
-- cold-cache cost
-- latency
-- context staleness failures
-"""
-
-from typing import Dict, List, Tuple, Optional, Any
 from dataclasses import dataclass

-from .config import ACOConfig
-
-
 @dataclass
 class PromptLayout:
-
-
     prefix_tokens: int
     suffix_tokens: int
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    ]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        "staleness_failures": 0,
-    }
-
-    def layout(
-        self,
-        content_pieces: Dict[str, str],
-        cost_per_1k_input: float = 0.01,
-        cache_discount_rate: float = 0.5,
-    ) -> PromptLayout:
-        """Partition content into prefix (cacheable) and suffix (dynamic)."""
-
-        prefix_pieces = []
-        suffix_pieces = []
-
-        for key, text in content_pieces.items():
-            if self._is_prefix_content(key):
-                prefix_pieces.append(text)
             else:
-
-
-
-        prefix = "\n\n".join(
-        suffix = "\n\n".join(
-
-
-
-
-
-
-
-
-
-
         return PromptLayout(
-
-
             prefix_tokens=prefix_tokens,
             suffix_tokens=suffix_tokens,
-
-
-
-            cache_discount=cache_discount,
-        )
-
-    def _is_prefix_content(self, key: str) -> bool:
-        """Determine if a content key belongs in the prefix."""
-        # Direct matches
-        if key in self.PREFIX_CONTENT_TYPES:
-            return True
-        if key in self.SUFFIX_CONTENT_TYPES:
-            return False
-
-        # Pattern matching
-        if any(kw in key.lower() for kw in ["system", "static", "rule", "persona", "schema", "format"]):
-            return True
-        if any(kw in key.lower() for kw in ["user_", "dynamic", "current", "live", "now", "timestamp"]):
-            return False
-
-        # Default: prefix if name suggests stability
-        return True
-
-    def optimize_for_provider(
-        self,
-        layout: PromptLayout,
-        provider: str,
-    ) -> PromptLayout:
-        """Provider-specific cache layout optimizations."""
-
-        provider = provider.lower()
-
-        if "anthropic" in provider:
-            # Claude has system prompts that are automatically cached
-            # Keep system content separate
-            return layout
-
-        elif "openai" in provider:
-            # OpenAI has prefix caching on system + first user message
-            # Ensure system content is at the very top
-            return layout
-
-        elif "gemini" in provider:
-            # Gemini has context caching for repeated contexts
-            return layout
-
-        elif "deepseek" in provider:
-            # DeepSeek has cache hit discounts
-            return layout
-
-        return layout
-
-    def measure_hit_rate(self, prefix_tokens: int, cache_hit: bool) -> None:
-        """Update cache statistics."""
-        if cache_hit:
-            self.cache_stats["warm_runs"] += 1
-        else:
-            self.cache_stats["cold_runs"] += 1
-
-        total = self.cache_stats["warm_runs"] + self.cache_stats["cold_runs"]
-        self.cache_stats["cache_hit_rate"] = self.cache_stats["warm_runs"] / total if total > 0 else 0.0
-
-        # Running average of prefix tokens
-        n = total
-        self.cache_stats["prefix_tokens_avg"] = (
-            (self.cache_stats["prefix_tokens_avg"] * (n - 1) + prefix_tokens) / n
         )

-    def
-
-        total = self.cache_stats["warm_runs"] + self.cache_stats["cold_runs"]
         return {
-            "
-            "
-            "
-            "
-            "
-            "staleness_failures": self.cache_stats["staleness_failures"],
-            "estimated_cost_saved": self.cache_stats["warm_runs"] * self.cache_stats.get("avg_cache_discount", 0.0),
         }
+"""Cache-Aware Prompt Layout: Optimize prompt structure for prefix-cache reuse."""
+from typing import Dict, List, Tuple
 from dataclasses import dataclass

 @dataclass
 class PromptLayout:
+    prefix_content: str
+    suffix_content: str
     prefix_tokens: int
     suffix_tokens: int
+    cache_boundary: int  # token position of cache boundary
+    stable_sources: List[str]
+    dynamic_sources: List[str]
+
+CACHE_STABLE_SOURCES = {"system_rules", "tool_descriptions", "user_preferences"}
+CACHE_DYNAMIC_SOURCES = {"recent_messages", "task_plan", "retrieved_docs", "artifacts"}
+
+class CacheAwareLayout:
+    def __init__(self, session_id: str = None):
+        self.session_id = session_id
+        self._prev_prefix_hash = None
+        self.cache_hits = 0
+        self.cache_misses = 0
+        self.total_prefix_tokens = 0
+        self.total_suffix_tokens = 0
+
+    def layout(self, sources: Dict[str, str], context_budget) -> PromptLayout:
+        prefix_parts = []
+        suffix_parts = []
+        stable = []
+        dynamic = []
+        prefix_tokens = 0
+        suffix_tokens = 0
+        for source_name, content in sources.items():
+            token_est = len(content) // 4  # rough estimate
+            if source_name in context_budget.cache_prefix:
+                prefix_parts.append(f"# {source_name}\n{content}")
+                prefix_tokens += token_est
+                stable.append(source_name)
+            elif source_name in context_budget.dynamic_suffix:
+                suffix_parts.append(f"# {source_name}\n{content}")
+                suffix_tokens += token_est
+                dynamic.append(source_name)
+            elif source_name in context_budget.keep_exact:
+                prefix_parts.append(f"# {source_name}\n{content}")
+                prefix_tokens += token_est
+                stable.append(source_name)
             else:
+                suffix_parts.append(f"# {source_name}\n{content}")
+                suffix_tokens += token_est
+                dynamic.append(source_name)
+        prefix = "\n\n".join(prefix_parts)
+        suffix = "\n\n".join(suffix_parts)
+        # Check cache hit
+        import hashlib
+        prefix_hash = hashlib.md5(prefix.encode()).hexdigest()
+        if self._prev_prefix_hash == prefix_hash:
+            self.cache_hits += 1
+        else:
+            self.cache_misses += 1
+        self._prev_prefix_hash = prefix_hash
+        self.total_prefix_tokens += prefix_tokens
+        self.total_suffix_tokens += suffix_tokens
         return PromptLayout(
+            prefix_content=prefix,
+            suffix_content=suffix,
             prefix_tokens=prefix_tokens,
             suffix_tokens=suffix_tokens,
+            cache_boundary=prefix_tokens,
+            stable_sources=stable,
+            dynamic_sources=dynamic,
         )

+    def stats(self) -> Dict:
+        total = self.cache_hits + self.cache_misses
         return {
+            "cache_hit_rate": self.cache_hits / max(total, 1),
+            "total_cache_hits": self.cache_hits,
+            "total_cache_misses": self.cache_misses,
+            "avg_prefix_tokens": self.total_prefix_tokens / max(total, 1),
+            "avg_suffix_tokens": self.total_suffix_tokens / max(total, 1),
         }
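
Usage sketch for the new CacheAwareLayout (assumptions: the uploaded module is importable as aco.cache_layout, and context_budget may be any object exposing cache_prefix, dynamic_suffix, and keep_exact collections; the real budget type is not part of this diff, so DemoBudget below is a hypothetical stand-in):

# Minimal sketch, not the project's own example. Assumes the uploaded file is
# importable as aco.cache_layout; DemoBudget is a hypothetical stand-in for the
# real context_budget object, which only needs cache_prefix / dynamic_suffix /
# keep_exact collections for membership tests.
from dataclasses import dataclass, field
from typing import Set

from aco.cache_layout import CacheAwareLayout


@dataclass
class DemoBudget:
    cache_prefix: Set[str] = field(default_factory=lambda: {"system_rules", "tool_descriptions"})
    dynamic_suffix: Set[str] = field(default_factory=lambda: {"recent_messages", "retrieved_docs"})
    keep_exact: Set[str] = field(default_factory=set)


layouter = CacheAwareLayout(session_id="demo")
budget = DemoBudget()

stable = {
    "system_rules": "Answer concisely. Cite every tool you call.",
    "tool_descriptions": "search(query): run a web search",
}

# First turn: no previous prefix hash exists, so this call is counted as a cache miss.
first = layouter.layout({**stable, "recent_messages": "user: hello"}, budget)

# Second turn: the stable sources are byte-identical, so the prefix hash matches
# and the call is counted as a cache hit even though the suffix changed.
second = layouter.layout({**stable, "recent_messages": "user: and a follow-up"}, budget)

print(first.stable_sources)    # ['system_rules', 'tool_descriptions']
print(second.dynamic_sources)  # ['recent_messages']
print(layouter.stats())        # cache_hit_rate == 0.5 after one miss and one hit

Because layout() hashes the exact prefix bytes to count hits and misses, injecting timestamps or other volatile metadata into any cache_prefix source (something the removed docstring warned against) would turn every call into a miss.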