Agent Cost Optimizer — Deployment Guide
Quick Start
Installation
pip install git+https://huggingface.co/narcolepticchicken/agent-cost-optimizer
Or clone and install locally:
git clone https://huggingface.co/narcolepticchicken/agent-cost-optimizer
cd agent-cost-optimizer
pip install -e .
Basic Usage
from aco import AgentCostOptimizer
# Load default configuration
optimizer = AgentCostOptimizer()
# Optimize a single agent request
result = optimizer.optimize(
"Write a Python function to reverse a linked list",
run_state={
"trace_id": "run-001",
"planned_tools": [("code_execution", {"code": "test"})],
}
)
print(f"Model: {result.routing_decision.model_id}")
print(f"Tier: {result.routing_decision.tier}")
print(f"Estimated Cost: ${result.estimated_cost:.4f}")
print(f"Tool Decisions: {[d.decision.value for d in result.tool_decisions]}")
Configuration
Config File
Create a config.yaml:
project_name: "my-agent-optimizer"
trace_storage_path: "./traces"
models:
gpt-4o-mini:
model_id: "gpt-4o-mini"
provider: "openai"
cost_per_1k_input: 0.00015
cost_per_1k_output: 0.0006
strength_tier: 2
max_context: 128000
cache_discount_rate: 0.5
gpt-4o:
model_id: "gpt-4o"
provider: "openai"
cost_per_1k_input: 0.0025
cost_per_1k_output: 0.01
strength_tier: 4
max_context: 128000
cache_discount_rate: 0.5
tools:
search:
tool_name: "search"
cost_per_call: 0.002
latency_ms_estimate: 500
code_execution:
tool_name: "code_execution"
cost_per_call: 0.005
latency_ms_estimate: 1000
requires_verification: true
verifiers:
verifier_medium:
verifier_model_id: "gpt-4o-mini"
cost_per_call: 0.005
confidence_threshold: 0.8
# Enable/disable modules
enable_router: true
enable_context_budgeter: true
enable_cache_layout: true
enable_tool_gate: true
enable_verifier_budgeter: true
enable_retry_optimizer: true
enable_meta_tool_miner: true
enable_early_termination: true
Load it:
optimizer = AgentCostOptimizer.from_config("config.yaml")
Integration with Agent Harness
Generic Integration Pattern
class MyAgentHarness:
def __init__(self):
self.optimizer = AgentCostOptimizer.from_config("config.yaml")
def execute(self, user_request: str, context: dict):
# 1. Build run state
run_state = {
"trace_id": f"run-{uuid.uuid4()}",
"planned_tools": self.plan_tools(user_request),
"context_pieces": context,
"current_cost": 0.0,
"step_number": 1,
"total_steps": self.estimate_steps(user_request),
"is_irreversible": False,
}
# 2. Call optimizer BEFORE execution
decision = self.optimizer.optimize(user_request, run_state)
# 3. Apply optimizer decisions
selected_model = decision.routing_decision.model_id
# Apply tool gate
approved_tools = [
td for td in decision.tool_decisions
if td.decision.value in ("use", "batch", "parallel")
]
# Apply context budget
if decision.context_budget:
context = self._apply_context_budget(context, decision.context_budget)
# Apply cache layout
if decision.prompt_layout:
prompt = self._apply_cache_layout(decision.prompt_layout)
# Check doom assessment
if decision.doom_assessment and decision.doom_assessment.action.value == "mark_blocked":
return {"status": "BLOCKED", "reason": decision.doom_assessment.reasoning}
# 4. Execute with optimized parameters
result = self.llm_call(
model=selected_model,
prompt=prompt,
tools=approved_tools,
max_tokens=decision.routing_decision.max_tokens,
)
# 5. Record step
self.optimizer.record_step(
trace_id=decision.trace_id,
model_call=ModelCall(
model_id=selected_model,
provider="openai",
input_tokens=result.input_tokens,
output_tokens=result.output_tokens,
cost_per_1k_input=0.0025,
cost_per_1k_output=0.01,
),
tool_calls=[...],
context_size_tokens=len(prompt) // 4,
step_outcome=Outcome.SUCCESS if result.success else Outcome.FAILURE,
)
# 6. Finalize trace
self.optimizer.finalize_trace(
trace_id=decision.trace_id,
outcome=Outcome.SUCCESS if result.success else Outcome.FAILURE,
user_satisfaction=1.0 if result.success else 0.0,
)
return result
LangChain Integration
from aco import AgentCostOptimizer
from langchain.agents import AgentExecutor
class ACOWrapper:
def __init__(self, agent_executor, optimizer):
self.agent = agent_executor
self.optimizer = optimizer
def invoke(self, input_data):
# Pre-optimize
decision = self.optimizer.optimize(
input_data["input"],
run_state={
"planned_tools": [(t.name, {}) for t in self.agent.tools],
"trace_id": input_data.get("run_id", str(uuid.uuid4())),
}
)
# Override agent LLM based on routing decision
self.agent.llm = self.get_llm(decision.routing_decision.model_id)
# Filter tools based on tool gate
self.agent.tools = [
t for t in self.agent.tools
if any(d.tool_name == t.name and d.decision.value == "use"
for d in decision.tool_decisions)
]
# Execute
result = self.agent.invoke(input_data)
# Record and finalize
# ... (see generic pattern above)
return result
OpenAI Assistants Integration
from aco import AgentCostOptimizer
class ACOAssistantWrapper:
def __init__(self, assistant_id, optimizer):
self.assistant_id = assistant_id
self.optimizer = optimizer
def create_run(self, thread_id, instructions):
# Optimize instructions (context budgeter)
decision = self.optimizer.optimize(
instructions,
run_state={
"trace_id": f"assistant-run-{thread_id}",
"context_pieces": {"system_rules": instructions},
}
)
# Use cache-aware prompt layout
if decision.prompt_layout:
optimized_instructions = decision.prompt_layout.prefix + "\n\n" + decision.prompt_layout.suffix
else:
optimized_instructions = instructions
# Create run with optimized parameters
return openai.beta.threads.runs.create(
thread_id=thread_id,
assistant_id=self.assistant_id,
instructions=optimized_instructions,
model=decision.routing_decision.model_id,
)
Multi-Provider Support
ACO supports any provider with cost metadata:
models:
claude-3-haiku:
model_id: "claude-3-haiku-20240307"
provider: "anthropic"
cost_per_1k_input: 0.00025
cost_per_1k_output: 0.00125
strength_tier: 2
claude-3-opus:
model_id: "claude-3-opus-20240229"
provider: "anthropic"
cost_per_1k_input: 0.015
cost_per_1k_output: 0.075
strength_tier: 4
gemini-pro:
model_id: "gemini-1.5-pro"
provider: "google"
cost_per_1k_input: 0.0035
cost_per_1k_output: 0.0105
strength_tier: 3
deepseek-chat:
model_id: "deepseek-chat"
provider: "deepseek"
cost_per_1k_input: 0.00014
cost_per_1k_output: 0.00028
strength_tier: 2
cache_discount_rate: 0.5
Local Model Support
For self-hosted models:
models:
llama-3.2-1b:
model_id: "meta-llama/Llama-3.2-1B-Instruct"
provider: "local"
cost_per_1k_input: 0.0
cost_per_1k_output: 0.0
strength_tier: 1
max_context: 128000
qwen2.5-7b:
model_id: "Qwen/Qwen2.5-7B-Instruct"
provider: "local"
cost_per_1k_input: 0.0
cost_per_1k_output: 0.0
strength_tier: 3
max_context: 131072
Use cost_per_1k_input: 0.0 for local models. ACO will still optimize latency and context size.
Benchmarking
Run the benchmark suite:
python eval_runner.py --tasks 1000 --output ./eval_results
With ablations:
python eval_runner.py --tasks 1000 --ablations --output ./eval_results
Generate report:
python -m aco.cli report --input ./eval_results/baseline_results.json
Telemetry and Monitoring
Traces are stored as JSON in trace_storage_path:
# List all traces
traces = optimizer.telemetry.list_traces()
# Get statistics
stats = optimizer.telemetry.get_stats()
print(f"Total traces: {stats['count']}")
print(f"Avg cost: ${stats['avg_cost']:.4f}")
print(f"Success rate: {stats['success_rate']:.1%}")
# Full optimizer stats
all_stats = optimizer.get_stats()
print(json.dumps(all_stats, indent=2))
Advanced: Training a Custom Router
To train a model-specific router using your trace data:
from aco.optimizer import AgentCostOptimizer
from aco.config import ACOConfig, ModelConfig
# 1. Collect traces
optimizer = AgentCostOptimizer()
# ... run agent tasks ...
# 2. Extract features and labels from traces
traces = [optimizer.telemetry.load_trace(tid) for tid in optimizer.telemetry.list_traces()]
# 3. Train a simple classifier (example with sklearn)
from sklearn.ensemble import RandomForestClassifier
import numpy as np
X = []
y = []
for trace in traces:
# Features: task_type, request_length, predicted_cost, prior_success_rate
features = [
hash(trace["task_type"]) % 1000,
len(trace["user_request"]),
trace.get("total_cost", 0.01),
]
# Label: optimal model tier (from oracle comparison)
optimal_tier = trace.get("metadata", {}).get("optimal_tier", 3)
X.append(features)
y.append(optimal_tier)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)
# 4. Deploy: override router decisions
# In production, integrate the classifier into ModelCascadeRouter._route_learned()
For RL-based routing (GRPO/DPO), see the literature review for BAAR and xRouter approaches.
Production Checklist
- Configure all models with accurate cost metadata
- Configure all tools with cost/latency estimates
- Set appropriate tier mappings for your use case
- Enable telemetry to collect traces for learning
- Set doom thresholds appropriate for your SLA
- Configure verifier thresholds for safety-critical tasks
- Test with small synthetic benchmark before deployment
- Monitor regression rate and false-DONE rate
- Review and adjust routing policy monthly
- Mine meta-tools after collecting 100+ successful traces
Troubleshooting
High regression rate
- Check if model tier mappings match your actual model capabilities
- Increase `unsafe_cheap_model_penalty` in config
- Enable verifier on more task types
Low cost savings
- Verify cache layout is enabled (check cache hit rate)
- Ensure tool gate is catching repeated/unnecessary calls
- Check if meta-tool miner is enabled and has enough traces
High false-DONE rate
- Increase verifier threshold for final-step verification
- Enable doom detector with stricter `doom_no_progress_steps`
- Add more failure patterns to retry optimizer
Slow routing decisions
- Use prompt-only or static routing instead of learned
- Cache classification results for repeated request patterns
- Pre-compute meta-tools during off-peak hours
Support
- Repository: https://huggingface.co/narcolepticchicken/agent-cost-optimizer
- Issues: Open a discussion on the Hugging Face Hub
- Literature Review: See `docs/literature_review.md`