narcolepticchicken
/

agent-cost-optimizer

Safetensors

Model card Files Files and versions

xet

Community

narcolepticchicken committed about 16 hours ago

Commit

a7e3035

verified ·

1 Parent(s): eaafe86

Upload aco/tool_gate.py with huggingface_hub

Browse files

Files changed (1) hide show

aco/tool_gate.py +77 -253

aco/tool_gate.py CHANGED Viewed

@@ -1,260 +1,84 @@
-"""Tool-Use Cost Gate - Module 6.
-Predicts whether a tool call is worth the cost.
-Decisions:
-- use tool
-- skip tool
-- batch tool calls
-- run in parallel
-- use cheaper tool
-- use cached result
-- ask user instead
-- escalate
-Tracks:
-- unnecessary tool calls
-- missed tool calls
-- failed tool calls
-- tool result ignored
-- repeated tool calls
-- tool cost
-- tool latency
-"""
-from typing import Dict, List, Tuple, Optional
 from dataclasses import dataclass
-from enum import Enum
-from .trace_schema import TaskType, ToolCall
-from .config import ACOConfig, ToolConfig
-class ToolDecision(Enum):
-    USE = "use"
-    SKIP = "skip"
-    BATCH = "batch"
-    PARALLEL = "parallel"
-    USE_CHEAPER = "use_cheaper"
-    USE_CACHE = "use_cache"
-    ASK_USER = "ask_user"
-    ESCALATE = "escalate"
 @dataclass
-class ToolGateDecision:
-    decision: ToolDecision
     tool_name: str
     reasoning: str
     estimated_cost: float
-    estimated_benefit: float  # 0-1 probability of useful result
-    confidence: float
-    alternative_tool: Optional[str] = None
-    batched_with: Optional[List[str]] = None
-class ToolUseCostGate:
-    """Gates tool calls based on predicted value vs. cost."""
-    # Tool necessity by task type (probability that task needs this tool class)
-    TOOL_NECESSITY = {
-        TaskType.QUICK_ANSWER: {
-            "search": 0.3,
-            "retrieve": 0.1,
-            "calculator": 0.2,
-        },
-        TaskType.CODING: {
-            "code_execution": 0.8,
-            "linter": 0.6,
-            "test_runner": 0.7,
-            "file_read": 0.9,
-            "file_write": 0.5,
-            "search": 0.4,
-        },
-        TaskType.RESEARCH: {
-            "search": 0.95,
-            "retrieve": 0.9,
-            "fetch": 0.7,
-            "summarize": 0.8,
-        },
-        TaskType.LEGAL_REGULATED: {
-            "document_retrieval": 0.95,
-            "compliance_check": 0.9,
-            "search": 0.6,
-        },
-        TaskType.TOOL_HEAVY: {
-            "search": 0.7,
-            "fetch": 0.6,
-            "api_call": 0.8,
-            "database_query": 0.7,
-        },
-        TaskType.RETRIEVAL_HEAVY: {
-            "retrieve": 0.95,
-            "search": 0.8,
-            "fetch": 0.7,
-        },
-        TaskType.LONG_HORIZON: {
-            "task_planner": 0.7,
-            "progress_tracker": 0.5,
-            "file_read": 0.6,
-            "search": 0.4,
-        },
-    }
-    # Cost-benefit threshold
-    MIN_BENEFIT_COST_RATIO = 2.0  # benefit must be > 2x cost to call
-    def __init__(self, config: Optional[ACOConfig] = None):
-        self.config = config or ACOConfig()
-        self.tool_stats: Dict[str, Dict] = {}  # tool_name -> stats
-        self.decision_history: List[Dict] = []
-    def decide(
-        self,
-        tool_name: str,
-        task_type: TaskType,
-        tool_input: Dict,
-        previous_tool_calls: Optional[List[ToolCall]] = None,
-        current_cost_so_far: float = 0.0,
-        predicted_task_cost: float = 0.1,
-    ) -> ToolGateDecision:
-        """Decide whether to make a tool call."""
-        tool_cfg = self.config.tools.get(tool_name, ToolConfig(tool_name=tool_name))
-        tool_cost = tool_cfg.cost_per_call
-        tool_latency = tool_cfg.latency_ms_estimate
-        previous = previous_tool_calls or []
-        # Check for repeated identical calls
-        if self._is_repeated(tool_name, tool_input, previous):
-            return ToolGateDecision(
-                decision=ToolDecision.USE_CACHE,
-                tool_name=tool_name,
-                reasoning="Repeated identical tool call detected — use cached result",
-                estimated_cost=0.0,
-                estimated_benefit=0.9,
-                confidence=0.95,
-            )
-        # Check if tool result was ignored in previous steps
-        if previous and self._was_ignored(tool_name, previous):
-            # If tool results are consistently ignored, skip
-            return ToolGateDecision(
-                decision=ToolDecision.SKIP,
-                tool_name=tool_name,
-                reasoning="Previous results from this tool were ignored by the agent",
-                estimated_cost=0.0,
-                estimated_benefit=0.1,
-                confidence=0.8,
-            )
-        # Estimate necessity
-        necessity = self.TOOL_NECESSITY.get(task_type, {}).get(tool_name, 0.5)
-        # Adjust by historical success rate
-        stats = self.tool_stats.get(tool_name, {"calls": 0, "useful": 0})
-        if stats["calls"] > 5:
-            historical_useful_rate = stats["useful"] / stats["calls"]
-            necessity = (necessity + historical_useful_rate) / 2
-        # Cost escalation check: if we're already over predicted cost, be more selective
-        cost_ratio = current_cost_so_far / max(predicted_task_cost, 0.001)
-        if cost_ratio > 1.5:
-            necessity *= 0.7
-        if cost_ratio > 2.5:
-            necessity *= 0.5
-        # Normalize cost to benefit scale (assume $0.01 = 1.0 benefit unit)
-        normalized_cost = tool_cost / 0.01
-        benefit_cost_ratio = necessity / max(normalized_cost, 0.001)
-        if benefit_cost_ratio < self.MIN_BENEFIT_COST_RATIO and necessity < 0.5:
-            return ToolGateDecision(
-                decision=ToolDecision.SKIP,
-                tool_name=tool_name,
-                reasoning=f"Low benefit/cost ratio ({benefit_cost_ratio:.2f}) and low necessity ({necessity:.2f})",
-                estimated_cost=0.0,
-                estimated_benefit=necessity,
-                confidence=0.75,
-            )
-        # Check if we can batch with other pending tool calls
-        # (simplified: if multiple reads, batch them)
-        if tool_name in ("file_read", "search", "retrieve"):
-            return ToolGateDecision(
-                decision=ToolDecision.USE,
-                tool_name=tool_name,
-                reasoning=f"Tool is necessary (necessity={necessity:.2f}) and cacheable — proceed",
-                estimated_cost=tool_cost,
-                estimated_benefit=necessity,
-                confidence=min(necessity + 0.2, 1.0),
-            )
-        return ToolGateDecision(
-            decision=ToolDecision.USE,
-            tool_name=tool_name,
-            reasoning=f"Tool justified: necessity={necessity:.2f}, cost={tool_cost:.4f}",
-            estimated_cost=tool_cost,
-            estimated_benefit=necessity,
-            confidence=min(necessity + 0.1, 1.0),
-        )
-    def decide_batch(
-        self,
-        tool_requests: List[Tuple[str, Dict]],
-        task_type: TaskType,
-        previous_tool_calls: Optional[List[ToolCall]] = None,
-    ) -> List[ToolGateDecision]:
-        """Decide on a batch of tool calls, potentially grouping parallel ones."""
-        decisions = []
-        for tool_name, tool_input in tool_requests:
-            d = self.decide(tool_name, task_type, tool_input, previous_tool_calls)
-            decisions.append(d)
-        # Group independent tool calls for parallel execution
-        # (reads can be parallel, writes should be sequential)
-        read_tools = ["file_read", "search", "retrieve", "fetch", "database_query"]
-        parallel_group = []
-        sequential = []
-        for d in decisions:
-            if d.tool_name in read_tools and d.decision == ToolDecision.USE:
-                parallel_group.append(d)
-            else:
-                if parallel_group:
-                    # Mark the first as parallel with the rest
-                    parallel_group[0].batched_with = [p.tool_name for p in parallel_group[1:]]
-                    sequential.append(parallel_group[0])
-                    parallel_group = []
-                sequential.append(d)
-        if parallel_group:
-            parallel_group[0].batched_with = [p.tool_name for p in parallel_group[1:]]
-            sequential.append(parallel_group[0])
-        return sequential
-    def _is_repeated(self, tool_name: str, tool_input: Dict, previous: List[ToolCall]) -> bool:
-        """Check if this exact tool call was made before in this trace."""
-        for call in reversed(previous[-5:]):  # check last 5
-            if call.tool_name == tool_name and call.tool_input == tool_input:
-                return True
-        return False
-    def _was_ignored(self, tool_name: str, previous: List[ToolCall]) -> bool:
-        """Check if previous calls to this tool had their results ignored."""
-        relevant = [c for c in previous if c.tool_name == tool_name]
-        if len(relevant) < 2:
-            return False
-        ignored_count = sum(1 for c in relevant if c.ignored_result)
-        return ignored_count / len(relevant) > 0.5
-    def record_outcome(self, tool_name: str, was_useful: bool, cost: float) -> None:
-        """Record whether a tool call was actually useful."""
-        stats = self.tool_stats.setdefault(tool_name, {"calls": 0, "useful": 0, "total_cost": 0.0})
-        stats["calls"] += 1
         if was_useful:
-            stats["useful"] += 1
-        stats["total_cost"] += cost
-        stats["useful_rate"] = stats["useful"] / stats["calls"]

+"""Tool-Use Cost Gate: Predict whether a tool call is worth the cost."""
+from typing import Dict, List, Optional, Tuple
 from dataclasses import dataclass
 @dataclass
+class ToolDecision:
+    action: str  # "use", "skip", "batch", "cache", "escalate"
     tool_name: str
+    confidence: float
     reasoning: str
     estimated_cost: float
+    estimated_value: float
+    alternative: Optional[str] = None
# Per-tool default estimates keyed by tool name. Each entry carries an
# estimated cost per call ("cost"), an estimated latency in milliseconds
# ("latency_ms") and a baseline value score ("value_base").
TOOL_COST_ESTIMATES = {
    tool: {"cost": cost, "latency_ms": latency_ms, "value_base": value_base}
    for tool, cost, latency_ms, value_base in (
        ("web_search", 0.01, 2000, 0.6),
        ("code_search", 0.005, 1000, 0.7),
        ("file_read", 0.001, 100, 0.8),
        ("file_write", 0.001, 100, 0.9),
        ("api_call", 0.05, 3000, 0.5),
        ("database_query", 0.02, 500, 0.6),
        ("code_execute", 0.01, 5000, 0.7),
        ("web_scrape", 0.02, 5000, 0.4),
        ("summarize", 0.005, 2000, 0.5),
        ("verify", 0.02, 3000, 0.6),
    )
}
class ToolCostGate:
    """Decide whether a proposed tool call is worth making.

    The gate looks up a per-tool cost/value estimate, adjusts the value with
    a few context heuristics, and returns a ``ToolDecision`` whose action is
    "cache", "skip" or "use". Useful results reported through
    ``record_result`` are kept in an in-memory cache so an identical later
    call can be answered with a "cache" decision.
    """

    def __init__(self, cost_threshold: float = 0.05, value_threshold: float = 0.3,
                 batch_window_ms: int = 5000):
        """Configure thresholds and reset counters.

        Args:
            cost_threshold: Calls costing more than this are skipped unless
                their adjusted value is at least 0.6.
            value_threshold: Calls whose adjusted value is below this are
                skipped.
            batch_window_ms: Stored on the instance; not read by any method
                of this class.
        """
        self.cost_threshold = cost_threshold
        self.value_threshold = value_threshold
        self.batch_window_ms = batch_window_ms
        # Stored for callers; no method of this class reads or fills it.
        self.pending_calls: List[Dict] = []
        # Only "used", "skipped" and "cached" are incremented by this class.
        self.call_stats = {"used": 0, "skipped": 0, "batched": 0, "cached": 0, "escalated": 0}
        # NOTE(review): the cache is unbounded and never evicted.
        self._result_cache: Dict[str, str] = {}

    @staticmethod
    def _cache_key(tool_name: str, args: Dict) -> str:
        """Return a deterministic cache key for a (tool, args) pair.

        The arguments are serialized canonically (sorted keys) so two equal
        dicts built in a different key order map to the same entry. The
        serialized text itself is the key, not its ``hash()``, so distinct
        arguments cannot collide. Arguments JSON cannot encode fall back to
        ``repr``.
        """
        import json  # local import: keeps this block self-contained

        try:
            canonical = json.dumps(args, sort_keys=True, default=repr)
        except (TypeError, ValueError):
            # e.g. non-scalar dict keys or circular references
            canonical = repr(args)
        return f"{tool_name}:{canonical}"

    def gate(self, tool_name: str, args: Dict, task_type: str,
             step_num: int, total_steps: int, confidence: float,
             prior_results: Optional[List[str]] = None) -> "ToolDecision":
        """Return the gating decision for one proposed tool call.

        Args:
            tool_name: Tool to call; unknown tools get a generic estimate.
            args: Call arguments, used only to build the cache key.
            task_type: "coding" and "research" boost matching tools.
            step_num: Step index; searches on step 1 get a small boost.
            total_steps: Accepted for interface compatibility; unused.
            confidence: Caller's current confidence; above 0.8 the tool is
                considered less valuable.
            prior_results: Results gathered so far; more than three lowers
                the value of another call.

        Returns:
            A ``ToolDecision``. Its ``confidence`` and ``estimated_value``
            are clamped to [0, 1]; the skip/use choice itself is made on the
            unclamped score so thresholds behave as before.
        """
        est = TOOL_COST_ESTIMATES.get(
            tool_name, {"cost": 0.02, "latency_ms": 2000, "value_base": 0.5})

        # A previously recorded useful result makes the call free.
        cache_key = self._cache_key(tool_name, args)
        if cache_key in self._result_cache:
            self.call_stats["cached"] += 1
            return ToolDecision("cache", tool_name, 1.0, "cached result available",
                                0.0, est["value_base"], "use_cached_result")

        # Adjust the baseline value for the current context.
        value = est["value_base"]
        if task_type == "coding" and tool_name in ("code_search", "file_read", "code_execute"):
            value += 0.2
        if task_type == "research" and tool_name in ("web_search", "web_scrape"):
            value += 0.2
        if step_num == 1 and tool_name in ("web_search", "code_search"):
            value += 0.1  # First search is usually valuable
        if prior_results and len(prior_results) > 3:
            value -= 0.2  # Already have enough info
        if confidence > 0.8:
            value -= 0.3  # Already confident, tool less valuable

        cost = est["cost"]
        # The stacked adjustments can push the raw score below 0 or above 1;
        # report a bounded score.
        score = min(1.0, max(0.0, value))

        if value < self.value_threshold:
            self.call_stats["skipped"] += 1
            return ToolDecision("skip", tool_name, score, "low value, not worth cost",
                                cost, score, "proceed_without_tool")
        if cost > self.cost_threshold and value < 0.6:
            self.call_stats["skipped"] += 1
            return ToolDecision("skip", tool_name, score, "cost exceeds threshold",
                                cost, score, "proceed_without_tool")
        self.call_stats["used"] += 1
        return ToolDecision("use", tool_name, score, "tool value justifies cost",
                            cost, score, None)

    def should_batch(self, pending_calls: List[Dict]) -> bool:
        """Report whether the pending calls are worth batching together.

        Batching is suggested when there are at least two calls, none is
        marked ``"independent": False``, and they span at most two distinct
        ``"tool"`` values. A call without a ``"tool"`` key counts as one
        extra (unknown) tool instead of raising ``KeyError``; ``None`` or an
        empty list yields ``False``.
        """
        if not pending_calls or len(pending_calls) < 2:
            return False
        if not all(call.get("independent", True) for call in pending_calls):
            return False
        distinct_tools = {call.get("tool") for call in pending_calls}
        return len(distinct_tools) <= 2

    def record_result(self, tool_name: str, args: Dict, result: str,
                      was_useful: bool = True):
        """Cache ``result`` for this (tool, args) pair when it was useful.

        Results flagged as not useful are not stored; any entry already
        cached for the same key is left untouched.
        """
        if was_useful:
            self._result_cache[self._cache_key(tool_name, args)] = result