"""End-to-end demo: ACO in action with a simulated agent harness. This script demonstrates how to bolt ACO onto any agent harness. No actual LLM calls are made — decisions are simulated with realistic parameters. """ import json from typing import Dict, Any from datetime import datetime from aco import AgentCostOptimizer from aco.config import ACOConfig, ModelConfig, ToolConfig, VerifierConfig, RoutingPolicy from aco.trace_schema import ModelCall, ToolCall, Outcome, FailureTag def build_demo_config() -> ACOConfig: """Build a demo config with realistic provider pricing.""" return ACOConfig( models={ "gpt-4o-mini": ModelConfig( model_id="gpt-4o-mini", provider="openai", cost_per_1k_input=0.00015, cost_per_1k_output=0.0006, latency_ms_estimate=400, strength_tier=2, max_context=128000, ), "gpt-4o": ModelConfig( model_id="gpt-4o", provider="openai", cost_per_1k_input=0.0025, cost_per_1k_output=0.01, latency_ms_estimate=1500, strength_tier=4, max_context=128000, ), "claude-3.5-sonnet": ModelConfig( model_id="claude-3-5-sonnet-20241022", provider="anthropic", cost_per_1k_input=0.003, cost_per_1k_output=0.015, latency_ms_estimate=1200, strength_tier=3, max_context=200000, ), "claude-3.5-haiku": ModelConfig( model_id="claude-3-5-haiku-20241022", provider="anthropic", cost_per_1k_input=0.00025, cost_per_1k_output=0.00125, latency_ms_estimate=300, strength_tier=2, max_context=200000, ), "deepseek-chat": ModelConfig( model_id="deepseek-chat", provider="deepseek", cost_per_1k_input=0.00014, cost_per_1k_output=0.00028, latency_ms_estimate=800, strength_tier=3, max_context=64000, cache_discount_rate=0.5, ), "local-qwen-7b": ModelConfig( model_id="Qwen/Qwen2.5-7B-Instruct", provider="local", cost_per_1k_input=0.0, cost_per_1k_output=0.0, latency_ms_estimate=600, strength_tier=3, max_context=131072, ), }, tools={ "search": ToolConfig("search", 0.002, 500, cacheable=False), "code_execution": ToolConfig("code_execution", 0.005, 1000, requires_verification=True), "file_read": ToolConfig("file_read", 0.0005, 100, cacheable=True), "linter": ToolConfig("linter", 0.001, 200), "document_retrieval": ToolConfig("document_retrieval", 0.001, 300, cacheable=True), "compliance_check": ToolConfig("compliance_check", 0.01, 1500, requires_verification=True), }, verifiers={ "verifier_medium": VerifierConfig("claude-3.5-haiku", 0.005, 800, 0.8), }, routing_policy=RoutingPolicy("demo"), ) def demo_task(optimizer: AgentCostOptimizer, request: str, expected_difficulty: int = 3): """Run ACO optimization for a single task and show decisions.""" print(f"\n{'='*80}") print(f"TASK: {request}") print(f"{'='*80}") # Build run state for a fresh task run_state = { "trace_id": f"demo-{hash(request) % 10000:04d}", "planned_tools": [("file_read", {"path": "project.md"}), ("code_execution", {"code": "test"})], "previous_tool_calls": [], "current_cost": 0.0, "step_number": 1, "total_steps": 3, "is_irreversible": False, "context_pieces": { "system_rules": "You are a helpful coding assistant.", "tool_descriptions": "Available: file_read, code_execution, linter", "user_preferences": "Prefer Python, type hints, docstrings", "recent_messages": "", }, "retrieved_docs": [], "routing_mode": "cascade", } # Call optimizer result = optimizer.optimize(request, run_state) # Display decisions print(f"\nšŸ“Š OPTIMIZATION DECISIONS") print(f" Trace ID: {result.trace_id}") print(f" Estimated Cost: ${result.estimated_cost:.4f}") print(f" Estimated Latency: {result.estimated_latency_ms:.0f}ms") print(f" Confidence: {result.confidence:.2f}") print(f"\n šŸŽÆ Model Routing") print(f" Selected: {result.routing_decision.model_id} (tier {result.routing_decision.tier})") print(f" Provider: {result.routing_decision.provider}") print(f" Max Tokens: {result.routing_decision.max_tokens}") print(f" Temperature: {result.routing_decision.temperature}") print(f" Reasoning: {result.routing_decision.reasoning}") if result.routing_decision.fallback_model_id: print(f" Fallback: {result.routing_decision.fallback_model_id}") if result.context_budget: cb = result.context_budget print(f"\n šŸ“„ Context Budget ({cb.total_budget_tokens:,} tokens)") print(f" Prefix (cacheable): {cb.cache_prefix_tokens:,} tokens") print(f" Suffix (dynamic): {cb.dynamic_suffix_tokens:,} tokens") if cb.omitted_sources: print(f" Omitted: {[s.name for s in cb.omitted_sources]}") if cb.summarized_sources: print(f" Summarized: {[s.name for s, _ in cb.summarized_sources]}") if cb.retrieval_queries: print(f" Retrieval: {cb.retrieval_queries}") if result.prompt_layout: pl = result.prompt_layout print(f"\n šŸ’¾ Cache Layout") print(f" Cold Cost: ${pl.estimated_cold_cost:.4f}") print(f" Warm Cost: ${pl.estimated_warm_cost:.4f}") print(f" Cache Discount: ${pl.cache_discount:.4f}") print(f"\n šŸ”§ Tool Decisions ({len(result.tool_decisions)} tools)") for td in result.tool_decisions: icon = "āœ…" if td.decision.value in ("use", "batch", "parallel") else "āŒ" print(f" {icon} {td.tool_name}: {td.decision.value} (cost: ${td.estimated_cost:.4f}, benefit: {td.estimated_benefit:.2f})") if result.verifier_decision: vd = result.verifier_decision print(f"\n šŸ” Verifier Decision") print(f" Decision: {vd.decision.value}") print(f" Checks: {vd.checks}") print(f" Estimated Cost: ${vd.estimated_verifier_cost:.4f}") if result.meta_tool_match: mm = result.meta_tool_match print(f"\n ⚔ Meta-Tool Match") print(f" ID: {mm['meta_tool_id']}") print(f" Est. Savings: ${mm['estimated_cost_savings']:.4f}") if result.doom_assessment: da = result.doom_assessment print(f"\n āš ļø Doom Assessment") print(f" Action: {da.action.value}") print(f" Confidence: {da.confidence:.2f}") if da.signals_triggered: print(f" Signals: {da.signals_triggered}") # Simulate execution print(f"\nšŸŽ¬ SIMULATED EXECUTION") model_cost = (result.routing_decision.max_tokens / 1000) * optimizer.config.models[result.routing_decision.model_id].cost_per_1k_input tool_cost = sum(d.estimated_cost for d in result.tool_decisions if d.decision.value in ("use", "batch")) verifier_cost = result.verifier_decision.estimated_verifier_cost if result.verifier_decision else 0.0 total_cost = model_cost + tool_cost + verifier_cost print(f" Model call: ${model_cost:.4f}") print(f" Tool calls: ${tool_cost:.4f}") print(f" Verifier: ${verifier_cost:.4f}") print(f" TOTAL: ${total_cost:.4f}") # Estimate what frontier-only would cost frontier_cfg = optimizer.config.models.get("gpt-4o") if frontier_cfg: frontier_cost = (result.routing_decision.max_tokens / 1000) * frontier_cfg.cost_per_1k_input + tool_cost + verifier_cost savings = frontier_cost - total_cost print(f"\nšŸ’° vs Frontier Model (gpt-4o)") print(f" Frontier cost: ${frontier_cost:.4f}") print(f" Savings: ${savings:.4f} ({savings/max(frontier_cost,0.001)*100:.1f}%)") return result def main(): print("=" * 80) print("AGENT COST OPTIMIZER - End-to-End Demo") print("=" * 80) config = build_demo_config() optimizer = AgentCostOptimizer(config) tasks = [ ("What is the capital of France?", 1), ("Write a Python function to reverse a linked list", 3), ("Research the latest advancements in transformer architectures and summarize key findings", 4), ("Review this contract for liability clauses and check GDPR compliance", 5), ("Help me with this thing", 3), ("Debug this segfault in our C++ thread pool implementation", 4), ("Draft an email to the team about the deployment schedule for next week", 2), ("Plan a 3-month roadmap for migrating our ML infrastructure to Kubernetes", 4), ("Search for open issues in the repo and create a summary report", 2), ("Query the database for Q3 sales data broken down by region, then produce a chart", 3), ] results = [] for request, difficulty in tasks: result = demo_task(optimizer, request, difficulty) results.append({ "request": request, "model": result.routing_decision.model_id, "tier": result.routing_decision.tier, "estimated_cost": result.estimated_cost, "verifier": result.verifier_decision.decision.value if result.verifier_decision else "none", }) # Summary print(f"\n{'='*80}") print("SUMMARY") print(f"{'='*80}") total_est = sum(r["estimated_cost"] for r in results) print(f"Total estimated cost for {len(tasks)} tasks: ${total_est:.4f}") # Show model distribution from collections import Counter model_counts = Counter(r["model"] for r in results) print(f"\nModel distribution:") for model, count in model_counts.most_common(): print(f" {model}: {count} tasks ({count/len(tasks)*100:.0f}%)") print(f"\nāœ… Demo complete!") print(f" Repo: https://huggingface.co/narcolepticchicken/agent-cost-optimizer") if __name__ == "__main__": main()