# Agent Cost Optimizer — Deployment Guide ## Quick Start ### Installation ```bash pip install git+https://huggingface.co/narcolepticchicken/agent-cost-optimizer ``` Or clone and install locally: ```bash git clone https://huggingface.co/narcolepticchicken/agent-cost-optimizer cd agent-cost-optimizer pip install -e . ``` ### Basic Usage ```python from aco import AgentCostOptimizer # Load default configuration optimizer = AgentCostOptimizer() # Optimize a single agent request result = optimizer.optimize( "Write a Python function to reverse a linked list", run_state={ "trace_id": "run-001", "planned_tools": [("code_execution", {"code": "test"})], } ) print(f"Model: {result.routing_decision.model_id}") print(f"Tier: {result.routing_decision.tier}") print(f"Estimated Cost: ${result.estimated_cost:.4f}") print(f"Tool Decisions: {[d.decision.value for d in result.tool_decisions]}") ``` ## Configuration ### Config File Create a `config.yaml`: ```yaml project_name: "my-agent-optimizer" trace_storage_path: "./traces" models: gpt-4o-mini: model_id: "gpt-4o-mini" provider: "openai" cost_per_1k_input: 0.00015 cost_per_1k_output: 0.0006 strength_tier: 2 max_context: 128000 cache_discount_rate: 0.5 gpt-4o: model_id: "gpt-4o" provider: "openai" cost_per_1k_input: 0.0025 cost_per_1k_output: 0.01 strength_tier: 4 max_context: 128000 cache_discount_rate: 0.5 tools: search: tool_name: "search" cost_per_call: 0.002 latency_ms_estimate: 500 code_execution: tool_name: "code_execution" cost_per_call: 0.005 latency_ms_estimate: 1000 requires_verification: true verifiers: verifier_medium: verifier_model_id: "gpt-4o-mini" cost_per_call: 0.005 confidence_threshold: 0.8 # Enable/disable modules enable_router: true enable_context_budgeter: true enable_cache_layout: true enable_tool_gate: true enable_verifier_budgeter: true enable_retry_optimizer: true enable_meta_tool_miner: true enable_early_termination: true ``` Load it: ```python optimizer = AgentCostOptimizer.from_config("config.yaml") ``` 
## Integration with Agent Harness ### Generic Integration Pattern ```python class MyAgentHarness: def __init__(self): self.optimizer = AgentCostOptimizer.from_config("config.yaml") def execute(self, user_request: str, context: dict): # 1. Build run state run_state = { "trace_id": f"run-{uuid.uuid4()}", "planned_tools": self.plan_tools(user_request), "context_pieces": context, "current_cost": 0.0, "step_number": 1, "total_steps": self.estimate_steps(user_request), "is_irreversible": False, } # 2. Call optimizer BEFORE execution decision = self.optimizer.optimize(user_request, run_state) # 3. Apply optimizer decisions selected_model = decision.routing_decision.model_id # Apply tool gate approved_tools = [ td for td in decision.tool_decisions if td.decision.value in ("use", "batch", "parallel") ] # Apply context budget if decision.context_budget: context = self._apply_context_budget(context, decision.context_budget) # Apply cache layout if decision.prompt_layout: prompt = self._apply_cache_layout(decision.prompt_layout) # Check doom assessment if decision.doom_assessment and decision.doom_assessment.action.value == "mark_blocked": return {"status": "BLOCKED", "reason": decision.doom_assessment.reasoning} # 4. Execute with optimized parameters result = self.llm_call( model=selected_model, prompt=prompt, tools=approved_tools, max_tokens=decision.routing_decision.max_tokens, ) # 5. Record step self.optimizer.record_step( trace_id=decision.trace_id, model_call=ModelCall( model_id=selected_model, provider="openai", input_tokens=result.input_tokens, output_tokens=result.output_tokens, cost_per_1k_input=0.0025, cost_per_1k_output=0.01, ), tool_calls=[...], context_size_tokens=len(prompt) // 4, step_outcome=Outcome.SUCCESS if result.success else Outcome.FAILURE, ) # 6. 
Finalize trace self.optimizer.finalize_trace( trace_id=decision.trace_id, outcome=Outcome.SUCCESS if result.success else Outcome.FAILURE, user_satisfaction=1.0 if result.success else 0.0, ) return result ``` ### LangChain Integration ```python from aco import AgentCostOptimizer from langchain.agents import AgentExecutor class ACOWrapper: def __init__(self, agent_executor, optimizer): self.agent = agent_executor self.optimizer = optimizer def invoke(self, input_data): # Pre-optimize decision = self.optimizer.optimize( input_data["input"], run_state={ "planned_tools": [(t.name, {}) for t in self.agent.tools], "trace_id": input_data.get("run_id", str(uuid.uuid4())), } ) # Override agent LLM based on routing decision self.agent.llm = self.get_llm(decision.routing_decision.model_id) # Filter tools based on tool gate self.agent.tools = [ t for t in self.agent.tools if any(d.tool_name == t.name and d.decision.value == "use" for d in decision.tool_decisions) ] # Execute result = self.agent.invoke(input_data) # Record and finalize # ... 
(see generic pattern above) return result ``` ### OpenAI Assistants Integration ```python from aco import AgentCostOptimizer class ACOAssistantWrapper: def __init__(self, assistant_id, optimizer): self.assistant_id = assistant_id self.optimizer = optimizer def create_run(self, thread_id, instructions): # Optimize instructions (context budgeter) decision = self.optimizer.optimize( instructions, run_state={ "trace_id": f"assistant-run-{thread_id}", "context_pieces": {"system_rules": instructions}, } ) # Use cache-aware prompt layout if decision.prompt_layout: optimized_instructions = decision.prompt_layout.prefix + "\n\n" + decision.prompt_layout.suffix else: optimized_instructions = instructions # Create run with optimized parameters return openai.beta.threads.runs.create( thread_id=thread_id, assistant_id=self.assistant_id, instructions=optimized_instructions, model=decision.routing_decision.model_id, ) ``` ## Multi-Provider Support ACO supports any provider with cost metadata: ```yaml models: claude-3-haiku: model_id: "claude-3-haiku-20240307" provider: "anthropic" cost_per_1k_input: 0.00025 cost_per_1k_output: 0.00125 strength_tier: 2 claude-3-opus: model_id: "claude-3-opus-20240229" provider: "anthropic" cost_per_1k_input: 0.015 cost_per_1k_output: 0.075 strength_tier: 4 gemini-pro: model_id: "gemini-1.5-pro" provider: "google" cost_per_1k_input: 0.0035 cost_per_1k_output: 0.0105 strength_tier: 3 deepseek-chat: model_id: "deepseek-chat" provider: "deepseek" cost_per_1k_input: 0.00014 cost_per_1k_output: 0.00028 strength_tier: 2 cache_discount_rate: 0.5 ``` ## Local Model Support For self-hosted models: ```yaml models: llama-3.2-1b: model_id: "meta-llama/Llama-3.2-1B-Instruct" provider: "local" cost_per_1k_input: 0.0 cost_per_1k_output: 0.0 strength_tier: 1 max_context: 128000 qwen2.5-7b: model_id: "Qwen/Qwen2.5-7B-Instruct" provider: "local" cost_per_1k_input: 0.0 cost_per_1k_output: 0.0 strength_tier: 3 max_context: 131072 ``` Use `cost_per_1k_input: 0.0` for 
local models. ACO will still optimize latency and context size. ## Benchmarking Run the benchmark suite: ```bash python eval_runner.py --tasks 1000 --output ./eval_results ``` With ablations: ```bash python eval_runner.py --tasks 1000 --ablations --output ./eval_results ``` Generate report: ```bash python -m aco.cli report --input ./eval_results/baseline_results.json ``` ## Telemetry and Monitoring Traces are stored as JSON in `trace_storage_path`: ```python # List all traces traces = optimizer.telemetry.list_traces() # Get statistics stats = optimizer.telemetry.get_stats() print(f"Total traces: {stats['count']}") print(f"Avg cost: ${stats['avg_cost']:.4f}") print(f"Success rate: {stats['success_rate']:.1%}") # Full optimizer stats all_stats = optimizer.get_stats() print(json.dumps(all_stats, indent=2)) ``` ## Advanced: Training a Custom Router To train a model-specific router using your trace data: ```python from aco.optimizer import AgentCostOptimizer from aco.config import ACOConfig, ModelConfig # 1. Collect traces optimizer = AgentCostOptimizer() # ... run agent tasks ... # 2. Extract features and labels from traces traces = [optimizer.telemetry.load_trace(tid) for tid in optimizer.telemetry.list_traces()] # 3. Train a simple classifier (example with sklearn) from sklearn.ensemble import RandomForestClassifier import numpy as np X = [] y = [] for trace in traces: # Features: task_type hash, request length, total cost features = [ hash(trace["task_type"]) % 1000, len(trace["user_request"]), trace.get("total_cost", 0.01), ] # Label: optimal model tier (from oracle comparison) optimal_tier = trace.get("metadata", {}).get("optimal_tier", 3) X.append(features) y.append(optimal_tier) clf = RandomForestClassifier(n_estimators=100) clf.fit(X, y) # 4. 
Deploy: override router decisions # In production, integrate the classifier into ModelCascadeRouter._route_learned() ``` For RL-based routing (GRPO/DPO), see the literature review for BAAR and xRouter approaches. ## Production Checklist - [ ] Configure all models with accurate cost metadata - [ ] Configure all tools with cost/latency estimates - [ ] Set appropriate tier mappings for your use case - [ ] Enable telemetry to collect traces for learning - [ ] Set doom thresholds appropriate for your SLA - [ ] Configure verifier thresholds for safety-critical tasks - [ ] Test with small synthetic benchmark before deployment - [ ] Monitor regression rate and false-DONE rate - [ ] Review and adjust routing policy monthly - [ ] Mine meta-tools after collecting 100+ successful traces ## Troubleshooting ### High regression rate - Check if model tier mappings match your actual model capabilities - Increase `unsafe_cheap_model_penalty` in config - Enable verifier on more task types ### Low cost savings - Verify cache layout is enabled (check cache hit rate) - Ensure tool gate is catching repeated/unnecessary calls - Check if meta-tool miner is enabled and has enough traces ### High false-DONE rate - Increase verifier threshold for final-step verification - Enable doom detector with stricter `doom_no_progress_steps` - Add more failure patterns to retry optimizer ### Slow routing decisions - Use prompt-only or static routing instead of learned - Cache classification results for repeated request patterns - Pre-compute meta-tools during off-peak hours ## Support - Repository: https://huggingface.co/narcolepticchicken/agent-cost-optimizer - Issues: Open a discussion on the Hugging Face Hub - Literature Review: See `docs/literature_review.md`