Upload examples/end_to_end_demo.py
Browse files- examples/end_to_end_demo.py +255 -0
examples/end_to_end_demo.py
ADDED
|
@@ -0,0 +1,255 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""End-to-end demo: ACO in action with a simulated agent harness.
|
| 2 |
+
|
| 3 |
+
This script demonstrates how to bolt ACO onto any agent harness.
|
| 4 |
+
No actual LLM calls are made — decisions are simulated with realistic parameters.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
from typing import Dict, Any
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
from aco import AgentCostOptimizer
|
| 12 |
+
from aco.config import ACOConfig, ModelConfig, ToolConfig, VerifierConfig, RoutingPolicy
|
| 13 |
+
from aco.trace_schema import ModelCall, ToolCall, Outcome, FailureTag
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def build_demo_config() -> ACOConfig:
    """Assemble the demo ACOConfig: six models, six tools, one verifier.

    Pricing and latency figures mirror published provider rate cards so the
    simulated routing decisions look plausible. Returns a fully populated
    ACOConfig ready to hand to AgentCostOptimizer.
    """
    # Per-model keyword tables keep the pricing data compact and uniform;
    # each entry expands directly into a ModelConfig below.
    model_specs = {
        "gpt-4o-mini": dict(
            model_id="gpt-4o-mini",
            provider="openai",
            cost_per_1k_input=0.00015,
            cost_per_1k_output=0.0006,
            latency_ms_estimate=400,
            strength_tier=2,
            max_context=128000,
        ),
        "gpt-4o": dict(
            model_id="gpt-4o",
            provider="openai",
            cost_per_1k_input=0.0025,
            cost_per_1k_output=0.01,
            latency_ms_estimate=1500,
            strength_tier=4,
            max_context=128000,
        ),
        "claude-3.5-sonnet": dict(
            model_id="claude-3-5-sonnet-20241022",
            provider="anthropic",
            cost_per_1k_input=0.003,
            cost_per_1k_output=0.015,
            latency_ms_estimate=1200,
            strength_tier=3,
            max_context=200000,
        ),
        "claude-3.5-haiku": dict(
            model_id="claude-3-5-haiku-20241022",
            provider="anthropic",
            cost_per_1k_input=0.00025,
            cost_per_1k_output=0.00125,
            latency_ms_estimate=300,
            strength_tier=2,
            max_context=200000,
        ),
        "deepseek-chat": dict(
            model_id="deepseek-chat",
            provider="deepseek",
            cost_per_1k_input=0.00014,
            cost_per_1k_output=0.00028,
            latency_ms_estimate=800,
            strength_tier=3,
            max_context=64000,
            # Only DeepSeek advertises a prompt-cache discount in this demo.
            cache_discount_rate=0.5,
        ),
        "local-qwen-7b": dict(
            model_id="Qwen/Qwen2.5-7B-Instruct",
            provider="local",
            # Locally hosted model: zero marginal token cost.
            cost_per_1k_input=0.0,
            cost_per_1k_output=0.0,
            latency_ms_estimate=600,
            strength_tier=3,
            max_context=131072,
        ),
    }

    # (name, per-call cost, latency ms, extra flags) for each demo tool.
    tool_specs = [
        ("search", 0.002, 500, {"cacheable": False}),
        ("code_execution", 0.005, 1000, {"requires_verification": True}),
        ("file_read", 0.0005, 100, {"cacheable": True}),
        ("linter", 0.001, 200, {}),
        ("document_retrieval", 0.001, 300, {"cacheable": True}),
        ("compliance_check", 0.01, 1500, {"requires_verification": True}),
    ]

    return ACOConfig(
        models={name: ModelConfig(**kwargs) for name, kwargs in model_specs.items()},
        tools={
            name: ToolConfig(name, cost, latency_ms, **extra)
            for name, cost, latency_ms, extra in tool_specs
        },
        verifiers={
            "verifier_medium": VerifierConfig("claude-3.5-haiku", 0.005, 800, 0.8),
        },
        routing_policy=RoutingPolicy("demo"),
    )
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def demo_task(optimizer: AgentCostOptimizer, request: str, expected_difficulty: int = 3):
    """Run ACO optimization for a single task and pretty-print every decision.

    Args:
        optimizer: A configured AgentCostOptimizer.
        request: Natural-language task description to optimize.
        expected_difficulty: Nominal difficulty (1-5). Currently informational
            only — it is never read below. Kept for interface compatibility;
            TODO(review): wire it into ``run_state`` if the optimizer uses it.

    Returns:
        The result object produced by ``optimizer.optimize``.
    """
    import zlib  # local import, matching the file's local-import style in main()

    print(f"\n{'='*80}")
    print(f"TASK: {request}")
    print(f"{'='*80}")

    # Build run state for a fresh task. The trace id must be stable across
    # runs, so derive it from a CRC of the request instead of hash(), whose
    # value for str is randomized per process by PYTHONHASHSEED.
    run_state = {
        "trace_id": f"demo-{zlib.crc32(request.encode('utf-8')) % 10000:04d}",
        "planned_tools": [("file_read", {"path": "project.md"}), ("code_execution", {"code": "test"})],
        "previous_tool_calls": [],
        "current_cost": 0.0,
        "step_number": 1,
        "total_steps": 3,
        "is_irreversible": False,
        "context_pieces": {
            "system_rules": "You are a helpful coding assistant.",
            "tool_descriptions": "Available: file_read, code_execution, linter",
            "user_preferences": "Prefer Python, type hints, docstrings",
            "recent_messages": "",
        },
        "retrieved_docs": [],
        "routing_mode": "cascade",
    }

    # Call optimizer
    result = optimizer.optimize(request, run_state)

    # --- Display decisions --------------------------------------------------
    print(f"\n📊 OPTIMIZATION DECISIONS")
    print(f" Trace ID: {result.trace_id}")
    print(f" Estimated Cost: ${result.estimated_cost:.4f}")
    print(f" Estimated Latency: {result.estimated_latency_ms:.0f}ms")
    print(f" Confidence: {result.confidence:.2f}")
    print(f"\n 🎯 Model Routing")
    print(f" Selected: {result.routing_decision.model_id} (tier {result.routing_decision.tier})")
    print(f" Provider: {result.routing_decision.provider}")
    print(f" Max Tokens: {result.routing_decision.max_tokens}")
    print(f" Temperature: {result.routing_decision.temperature}")
    print(f" Reasoning: {result.routing_decision.reasoning}")
    if result.routing_decision.fallback_model_id:
        print(f" Fallback: {result.routing_decision.fallback_model_id}")

    if result.context_budget:
        cb = result.context_budget
        print(f"\n 📄 Context Budget ({cb.total_budget_tokens:,} tokens)")
        print(f" Prefix (cacheable): {cb.cache_prefix_tokens:,} tokens")
        print(f" Suffix (dynamic): {cb.dynamic_suffix_tokens:,} tokens")
        if cb.omitted_sources:
            print(f" Omitted: {[s.name for s in cb.omitted_sources]}")
        if cb.summarized_sources:
            print(f" Summarized: {[s.name for s, _ in cb.summarized_sources]}")
        if cb.retrieval_queries:
            print(f" Retrieval: {cb.retrieval_queries}")

    if result.prompt_layout:
        pl = result.prompt_layout
        print(f"\n 💾 Cache Layout")
        print(f" Cold Cost: ${pl.estimated_cold_cost:.4f}")
        print(f" Warm Cost: ${pl.estimated_warm_cost:.4f}")
        print(f" Cache Discount: ${pl.cache_discount:.4f}")

    print(f"\n 🔧 Tool Decisions ({len(result.tool_decisions)} tools)")
    for td in result.tool_decisions:
        icon = "✅" if td.decision.value in ("use", "batch", "parallel") else "❌"
        print(f" {icon} {td.tool_name}: {td.decision.value} (cost: ${td.estimated_cost:.4f}, benefit: {td.estimated_benefit:.2f})")

    if result.verifier_decision:
        vd = result.verifier_decision
        print(f"\n 🔍 Verifier Decision")
        print(f" Decision: {vd.decision.value}")
        print(f" Checks: {vd.checks}")
        print(f" Estimated Cost: ${vd.estimated_verifier_cost:.4f}")

    if result.meta_tool_match:
        mm = result.meta_tool_match
        print(f"\n ⚡ Meta-Tool Match")
        print(f" ID: {mm['meta_tool_id']}")
        print(f" Est. Savings: ${mm['estimated_cost_savings']:.4f}")

    if result.doom_assessment:
        da = result.doom_assessment
        print(f"\n ⚠️ Doom Assessment")
        print(f" Action: {da.action.value}")
        print(f" Confidence: {da.confidence:.2f}")
        if da.signals_triggered:
            print(f" Signals: {da.signals_triggered}")

    # --- Simulate execution -------------------------------------------------
    print(f"\n🎬 SIMULATED EXECUTION")
    # Crude cost model: assume the full token budget is spent at input rates.
    model_cost = (result.routing_decision.max_tokens / 1000) * optimizer.config.models[result.routing_decision.model_id].cost_per_1k_input
    # NOTE(review): only "use"/"batch" tools are costed here, but the icon
    # logic above also treats "parallel" as executed — confirm whether
    # parallel tool calls should contribute to simulated cost.
    tool_cost = sum(d.estimated_cost for d in result.tool_decisions if d.decision.value in ("use", "batch"))
    verifier_cost = result.verifier_decision.estimated_verifier_cost if result.verifier_decision else 0.0
    total_cost = model_cost + tool_cost + verifier_cost

    print(f" Model call: ${model_cost:.4f}")
    print(f" Tool calls: ${tool_cost:.4f}")
    print(f" Verifier: ${verifier_cost:.4f}")
    print(f" TOTAL: ${total_cost:.4f}")

    # Estimate what routing everything to the frontier model would have cost.
    frontier_cfg = optimizer.config.models.get("gpt-4o")
    if frontier_cfg:
        frontier_cost = (result.routing_decision.max_tokens / 1000) * frontier_cfg.cost_per_1k_input + tool_cost + verifier_cost
        savings = frontier_cost - total_cost
        print(f"\n💰 vs Frontier Model (gpt-4o)")
        print(f" Frontier cost: ${frontier_cost:.4f}")
        # max(..., 0.001) guards against division by zero for free models.
        print(f" Savings: ${savings:.4f} ({savings/max(frontier_cost,0.001)*100:.1f}%)")

    return result
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def main():
    """Drive the demo: optimize a batch of sample tasks, then summarize."""
    banner = "=" * 80
    print(banner)
    print("AGENT COST OPTIMIZER - End-to-End Demo")
    print(banner)

    optimizer = AgentCostOptimizer(build_demo_config())

    # (request, nominal difficulty 1-5) pairs spanning trivial to complex.
    tasks = [
        ("What is the capital of France?", 1),
        ("Write a Python function to reverse a linked list", 3),
        ("Research the latest advancements in transformer architectures and summarize key findings", 4),
        ("Review this contract for liability clauses and check GDPR compliance", 5),
        ("Help me with this thing", 3),
        ("Debug this segfault in our C++ thread pool implementation", 4),
        ("Draft an email to the team about the deployment schedule for next week", 2),
        ("Plan a 3-month roadmap for migrating our ML infrastructure to Kubernetes", 4),
        ("Search for open issues in the repo and create a summary report", 2),
        ("Query the database for Q3 sales data broken down by region, then produce a chart", 3),
    ]

    # One summary record per task, collected for the roll-up below.
    summaries = []
    for prompt, level in tasks:
        outcome = demo_task(optimizer, prompt, level)
        verifier = (
            outcome.verifier_decision.decision.value
            if outcome.verifier_decision
            else "none"
        )
        summaries.append({
            "request": prompt,
            "model": outcome.routing_decision.model_id,
            "tier": outcome.routing_decision.tier,
            "estimated_cost": outcome.estimated_cost,
            "verifier": verifier,
        })

    # --- Roll-up ------------------------------------------------------------
    print(f"\n{banner}")
    print("SUMMARY")
    print(banner)
    total_est = sum(entry["estimated_cost"] for entry in summaries)
    print(f"Total estimated cost for {len(tasks)} tasks: ${total_est:.4f}")

    # How often each model was selected across the batch.
    from collections import Counter
    print("\nModel distribution:")
    tally = Counter(entry["model"] for entry in summaries)
    for model, count in tally.most_common():
        print(f" {model}: {count} tasks ({count/len(tasks)*100:.0f}%)")

    print("\n✅ Demo complete!")
    print(" Repo: https://huggingface.co/narcolepticchicken/agent-cost-optimizer")
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
# Script entry point: run the full demo only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|