narcolepticchicken
/

agent-cost-optimizer

Safetensors

Model card Files Files and versions

xet

Community

narcolepticchicken committed 1 day ago

Commit

d18f367

verified ·

1 Parent(s): c55ff1a

Upload aco/datasets/synthetic_traces.py

Browse files

Files changed (1) hide show

aco/datasets/synthetic_traces.py +416 -0

aco/datasets/synthetic_traces.py ADDED Viewed

	@@ -0,0 +1,416 @@

+"""Synthetic Trace Generator.
+Generates 10,000+ agent traces with:
+- task type
+- model used
+- tool calls
+- context size
+- cost
+- latency
+- failure mode
+- final outcome
+- optimal cheaper alternative
+- recovery action
+- verifier need
+- escalation decision
+Includes traces with:
+- cheap model success
+- cheap model failure
+- frontier model unnecessary
+- tool overuse
+- tool underuse
+- retrieval overuse
+- verifier overuse
+- retry loops
+- cache breaks
+- false-DONE
+- successful meta-tool reuse
+- bad meta-tool reuse
+"""
+import uuid
+import random
+import json
+from datetime import datetime, timedelta
+from typing import List, Dict, Any, Optional
+from dataclasses import asdict
+from aco.trace_schema import (
+    AgentTrace, TraceStep, ModelCall, ToolCall, VerifierCall,
+    TaskType, Outcome, FailureTag,
+)
class SyntheticTraceGenerator:
    """Generate diverse synthetic agent traces for training and benchmarking.

    Every random draw goes through ``self.rng`` (a ``random.Random`` seeded in
    ``__init__``), so two generators built with the same seed produce the same
    trace ids, scenarios, token counts and tool calls. Timestamps are read
    from the wall clock and therefore still differ between runs.
    """

    # Per-model settings keyed by model id. ``cost_input`` / ``cost_output``
    # are handed to ModelCall as ``cost_per_1k_input`` / ``cost_per_1k_output``;
    # ``latency`` is the base latency (ms) that is jittered for each call.
    MODEL_CONFIGS = {
        "tiny_local": {"tier": 1, "cost_input": 0.0001, "cost_output": 0.0002, "latency": 200, "strength": 0.3},
        "cheap_cloud": {"tier": 2, "cost_input": 0.0005, "cost_output": 0.001, "latency": 500, "strength": 0.5},
        "medium": {"tier": 3, "cost_input": 0.003, "cost_output": 0.006, "latency": 800, "strength": 0.75},
        "frontier": {"tier": 4, "cost_input": 0.01, "cost_output": 0.03, "latency": 1500, "strength": 0.95},
        "specialist": {"tier": 5, "cost_input": 0.015, "cost_output": 0.045, "latency": 2000, "strength": 0.98},
    }

    # Cost assigned to each call of a tool. Tools that are not listed here
    # (e.g. "document_retrieval", "api_call") fall back to 0.001 per call.
    TOOL_COSTS = {
        "search": 0.002,
        "retrieve": 0.001,
        "fetch": 0.003,
        "code_execution": 0.005,
        "linter": 0.001,
        "test_runner": 0.003,
        "file_read": 0.0005,
        "file_write": 0.0005,
        "calculator": 0.0001,
        "database_query": 0.004,
        "compliance_check": 0.01,
        "summarize": 0.002,
        "task_planner": 0.001,
        "progress_tracker": 0.0005,
    }

    # Sampling weights for the task type of each trace.
    TASK_TYPE_DISTRIBUTION = {
        TaskType.QUICK_ANSWER: 0.20,
        TaskType.CODING: 0.20,
        TaskType.RESEARCH: 0.15,
        TaskType.DOCUMENT_DRAFTING: 0.10,
        TaskType.LEGAL_REGULATED: 0.05,
        TaskType.TOOL_HEAVY: 0.10,
        TaskType.RETRIEVAL_HEAVY: 0.10,
        TaskType.LONG_HORIZON: 0.08,
        TaskType.UNKNOWN_AMBIGUOUS: 0.02,
    }

    # Scenario templates for generating realistic traces. ``prob`` is used as
    # a relative weight by ``random.choices``, so the values need not sum to 1.
    SCENARIOS = [
        # cheap model success
        {"name": "cheap_success", "prob": 0.15, "tier": [1, 2], "outcome": Outcome.SUCCESS, "failure_tags": []},
        # cheap model failure -> should have escalated
        {"name": "cheap_failure", "prob": 0.10, "tier": [1, 2], "outcome": Outcome.FAILURE, "failure_tags": [FailureTag.MODEL_TOO_WEAK]},
        # frontier model unnecessary -> overpaid
        {"name": "frontier_unnecessary", "prob": 0.08, "tier": [4], "outcome": Outcome.SUCCESS, "failure_tags": [], "optimal_tier": [1, 2]},
        # tool overuse
        {"name": "tool_overuse", "prob": 0.07, "tier": [3, 4], "outcome": Outcome.PARTIAL_SUCCESS, "failure_tags": [FailureTag.TOOL_UNNECESSARY], "extra_tools": 3},
        # tool underuse
        {"name": "tool_underuse", "prob": 0.05, "tier": [3, 4], "outcome": Outcome.FAILURE, "failure_tags": [FailureTag.TOOL_MISSED], "missing_tools": 2},
        # retrieval overuse
        {"name": "retrieval_overuse", "prob": 0.04, "tier": [3, 4], "outcome": Outcome.SUCCESS, "failure_tags": [], "extra_retrievals": 5},
        # verifier overuse
        {"name": "verifier_overuse", "prob": 0.03, "tier": [3, 4], "outcome": Outcome.SUCCESS, "failure_tags": [], "extra_verifiers": 2},
        # retry loops
        {"name": "retry_loop", "prob": 0.05, "tier": [3, 4], "outcome": Outcome.FAILURE, "failure_tags": [FailureTag.RETRY_LOOP], "retries": 5},
        # cache breaks
        {"name": "cache_break", "prob": 0.04, "tier": [3, 4], "outcome": Outcome.PARTIAL_SUCCESS, "failure_tags": [FailureTag.CACHE_BREAK]},
        # false DONE
        {"name": "false_done", "prob": 0.05, "tier": [3, 4], "outcome": Outcome.FALSE_DONE, "failure_tags": [FailureTag.VERIFIER_FALSE_PASS]},
        # meta-tool success
        {"name": "meta_tool_success", "prob": 0.06, "tier": [2, 3], "outcome": Outcome.SUCCESS, "failure_tags": [], "uses_meta_tool": True},
        # meta-tool bad reuse
        {"name": "meta_tool_bad", "prob": 0.02, "tier": [2, 3], "outcome": Outcome.FAILURE, "failure_tags": [FailureTag.MODEL_TOO_WEAK], "uses_meta_tool": True},
        # normal success
        {"name": "normal_success", "prob": 0.20, "tier": [3, 4], "outcome": Outcome.SUCCESS, "failure_tags": []},
        # blocked
        {"name": "blocked", "prob": 0.03, "tier": [4], "outcome": Outcome.BLOCKED, "failure_tags": [FailureTag.MISSED_ESCALATION]},
        # human escalation
        {"name": "human_escalation", "prob": 0.02, "tier": [4, 5], "outcome": Outcome.ESCALATED_HUMAN, "failure_tags": [FailureTag.MISSED_ESCALATION]},
        # stopped doom
        {"name": "stopped_doom", "prob": 0.03, "tier": [3, 4], "outcome": Outcome.STOPPED_DOOM, "failure_tags": [FailureTag.COST_EXCEEDED]},
    ]

    # Lookup tables hoisted to class level so they are built once instead of
    # on every helper call.
    _TIER_TO_MODEL = {1: "tiny_local", 2: "cheap_cloud", 3: "medium", 4: "frontier", 5: "specialist"}

    _TASK_TOOLS = {
        TaskType.QUICK_ANSWER: ["calculator", "search"],
        TaskType.CODING: ["file_read", "file_write", "code_execution", "linter", "test_runner"],
        TaskType.RESEARCH: ["search", "retrieve", "fetch", "summarize"],
        TaskType.DOCUMENT_DRAFTING: ["file_read", "summarize"],
        TaskType.LEGAL_REGULATED: ["document_retrieval", "compliance_check", "search"],
        TaskType.TOOL_HEAVY: ["search", "fetch", "api_call", "database_query"],
        TaskType.RETRIEVAL_HEAVY: ["retrieve", "search", "fetch"],
        TaskType.LONG_HORIZON: ["task_planner", "progress_tracker", "file_read"],
        TaskType.UNKNOWN_AMBIGUOUS: ["search"],
    }

    _REQUEST_TEMPLATES = {
        TaskType.QUICK_ANSWER: [
            "What is the capital of France?",
            "Briefly explain quantum computing.",
            "Summarize the key points of article X.",
            "What is 237 * 452?",
        ],
        TaskType.CODING: [
            "Write a Python function to reverse a linked list.",
            "Fix the bug in this React component.",
            "Refactor the authentication module to use JWT.",
            "Implement a LRU cache in Go.",
        ],
        TaskType.RESEARCH: [
            "Research the latest advancements in transformer architectures.",
            "Find sources comparing LoRA and full fine-tuning.",
            "Investigate the climate impact of data centers.",
            "What does the literature say about speculative decoding?",
        ],
        TaskType.DOCUMENT_DRAFTING: [
            "Draft a project proposal for the ML pipeline.",
            "Write an email to the team about the deployment schedule.",
            "Create a technical report on system performance.",
        ],
        TaskType.LEGAL_REGULATED: [
            "Review this contract for liability clauses.",
            "Check compliance with GDPR for this data processing pipeline.",
            "Draft a privacy policy section for user data.",
        ],
        TaskType.TOOL_HEAVY: [
            "Search for open issues in the repo and create a summary.",
            "Fetch the latest API documentation and generate client code.",
            "Query the database for Q3 sales and produce a chart.",
        ],
        TaskType.RETRIEVAL_HEAVY: [
            "Answer based on the attached 50-page document.",
            "Find all mentions of 'payment processing' in my files.",
            "Retrieve relevant cases for this legal query.",
        ],
        TaskType.LONG_HORIZON: [
            "Plan a 3-month roadmap for the agent framework.",
            "Orchestrate the deployment of the multi-region system.",
            "Project: redesign the data architecture end-to-end.",
        ],
        TaskType.UNKNOWN_AMBIGUOUS: [
            "Help me with this thing.",
            "I need something done about the server.",
            "Can you look into that issue we discussed?",
        ],
    }

    # Task types whose steps get a verifier call half of the time.
    _VERIFIED_TASK_TYPES = (TaskType.LEGAL_REGULATED, TaskType.CODING, TaskType.RESEARCH)

    _RECOVERY_ACTIONS = [
        "retry_same", "retry_changed_prompt", "repair_tool",
        "retrieve_more_context", "switch_model", "ask_clarification",
    ]

    def __init__(self, seed: int = 42):
        """Create a generator whose random draws are driven by ``seed``."""
        self.rng = random.Random(seed)

    def generate(self, n: int = 10000) -> List[AgentTrace]:
        """Generate ``n`` synthetic traces (an empty list when ``n <= 0``)."""
        return [self._generate_trace(i) for i in range(n)]

    def _generate_trace(self, idx: int) -> AgentTrace:
        """Build one complete trace: task type, scenario, steps and cost summary.

        Args:
            idx: Position of the trace in the batch; embedded in the trace id.

        Returns:
            An ``AgentTrace`` whose outcome and failure tags come from the
            sampled scenario and whose total cost is the sum of its step costs.
        """
        # The id suffix comes from the seeded RNG (not uuid4) so that the same
        # seed reproduces the same trace ids. Format is unchanged: 8 hex chars.
        trace_id = f"synth_{idx}_{self.rng.getrandbits(32):08x}"

        task_type = self.rng.choices(
            list(self.TASK_TYPE_DISTRIBUTION),
            weights=list(self.TASK_TYPE_DISTRIBUTION.values()),
        )[0]
        scenario = self._pick_scenario()
        user_request = self._generate_request(task_type, scenario["name"])

        # Number of steps: looping scenarios run long, meta-tools compress.
        base_steps = self.rng.randint(1, 8)
        if scenario["name"] in ("retry_loop", "false_done"):
            base_steps = self.rng.randint(5, 12)
        if scenario.get("uses_meta_tool"):
            base_steps = max(2, base_steps // 2)

        tier = self.rng.choice(scenario["tier"])
        model_key = self._tier_to_model(tier)
        model_cfg = self.MODEL_CONFIGS[model_key]

        steps = [
            self._generate_step(
                trace_id, step_idx, task_type, model_key, model_cfg,
                scenario, step_idx == base_steps - 1,
            )
            for step_idx in range(base_steps)
        ]
        total_cost = sum((step.step_cost for step in steps), 0.0)

        # NOTE(review): ``cost_input`` is passed to ModelCall as a per-1k-token
        # price, so ``* 2000`` below is not the price of 2000 tokens, and the
        # optimal-tier estimate is not scaled by the step count while the
        # frontier estimate is. The arithmetic is kept as-is because the right
        # correction depends on how TraceStep.step_cost is computed - confirm.
        optimal_tiers = scenario.get("optimal_tier")
        if optimal_tiers:
            recommended_tier = self.rng.choice(optimal_tiers)
            optimal_model = self._tier_to_model(recommended_tier)
            optimal_cost = self.MODEL_CONFIGS[optimal_model]["cost_input"] * 2000
        else:
            recommended_tier = tier
            optimal_cost = total_cost * 0.6

        frontier_cost = self.MODEL_CONFIGS["frontier"]["cost_input"] * 2000 * base_steps
        cost_saved = frontier_cost - total_cost

        return AgentTrace(
            trace_id=trace_id,
            user_request=user_request,
            task_type=task_type,
            steps=steps,
            final_outcome=scenario["outcome"],
            # Copy so traces never share the scenario template's list.
            failure_tags=list(scenario["failure_tags"]),
            total_cost=total_cost,
            total_cost_saved_vs_frontier=cost_saved,
            optimal_cost=optimal_cost,
            metadata={
                "scenario": scenario["name"],
                "synthetic": True,
                "generation_timestamp": datetime.utcnow().isoformat(),
                # Report the tier that optimal_cost was actually computed for.
                "optimal_tier": recommended_tier,
            },
        )

    def _pick_scenario(self) -> Dict:
        """Sample one scenario template, weighted by its ``prob`` field."""
        weights = [scenario["prob"] for scenario in self.SCENARIOS]
        return self.rng.choices(self.SCENARIOS, weights=weights)[0]

    def _generate_request(self, task_type: TaskType, scenario: str) -> str:
        """Pick a canned user request for ``task_type``.

        ``scenario`` is accepted for interface compatibility but is not used.
        Unknown task types yield ``"Generic request"``.
        """
        options = self._REQUEST_TEMPLATES.get(task_type, ["Generic request"])
        return self.rng.choice(options)

    def _tier_to_model(self, tier: int) -> str:
        """Map a tier number (1-5) to a model id; unknown tiers map to "medium"."""
        return self._TIER_TO_MODEL.get(tier, "medium")

    def _generate_step(
        self,
        trace_id: str,
        step_idx: int,
        task_type: TaskType,
        model_key: str,
        model_cfg: Dict,
        scenario: Dict,
        is_last: bool,
    ) -> TraceStep:
        """Build a single step of a trace.

        Args:
            trace_id: Id of the owning trace; prefix of the step id.
            step_idx: Zero-based position of the step within the trace.
            task_type: Task type of the trace; selects tools and verifiers.
            model_key: Model id recorded on the model call.
            model_cfg: Entry of ``MODEL_CONFIGS`` for ``model_key``.
            scenario: Scenario template steering tools, verifiers and retries.
            is_last: Whether this is the final step; the final step carries
                the scenario's outcome.

        Returns:
            The populated ``TraceStep``.
        """
        step_id = f"{trace_id}_step_{step_idx}"

        model_call = self._make_model_call(model_key, model_cfg)
        tool_calls = self._make_tool_calls(task_type, scenario)
        verifier_calls = self._make_verifier_calls(step_id, task_type, scenario)

        context_size = self.rng.randint(1000, 15000)
        if scenario["name"] == "cache_break":
            context_size += self.rng.randint(5000, 20000)  # excessive context

        retries = 0
        if scenario.get("retries"):
            retries = self.rng.randint(scenario["retries"] - 1, scenario["retries"] + 1)
        elif self.rng.random() < 0.15:
            retries = self.rng.randint(1, 2)

        # A recovery action is only recorded when the step was retried.
        recovery = self.rng.choice(self._RECOVERY_ACTIONS) if retries > 0 else None

        # The last step carries the scenario outcome (this also covers
        # false_done); retry loops start failing from the third step on.
        if is_last:
            step_outcome = scenario["outcome"]
        elif scenario["name"] == "retry_loop" and step_idx >= 2:
            step_outcome = Outcome.FAILURE
        else:
            step_outcome = Outcome.SUCCESS

        return TraceStep(
            step_id=step_id,
            # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
            # kept because an aware datetime would change the emitted value.
            timestamp=datetime.utcnow() + timedelta(seconds=step_idx * 30),
            task_type=task_type,
            model_call=model_call,
            tool_calls=tool_calls,
            verifier_calls=verifier_calls,
            context_size_tokens=context_size,
            context_sources=["system_rules", "tool_descriptions", "user_preferences", "recent_messages"],
            retry_count=retries,
            recovery_action=recovery,
            artifacts_created=[f"artifact_{step_idx}"] if self.rng.random() < 0.3 else [],
            step_outcome=step_outcome,
        )

    def _make_model_call(self, model_key: str, model_cfg: Dict) -> ModelCall:
        """Sample token counts, cache hits and latency for one model call."""
        input_tokens = self.rng.randint(500, 8000)
        output_tokens = self.rng.randint(100, 4000)
        cache_hit = self.rng.random() < 0.3
        # On a cache hit, up to half of the input tokens are served from cache.
        cache_hit_tokens = int(input_tokens * self.rng.random() * 0.5) if cache_hit else 0
        return ModelCall(
            model_id=model_key,
            provider="synthetic",
            input_tokens=input_tokens,
            output_tokens=output_tokens,
            reasoning_tokens=output_tokens // 5 if model_key == "frontier" else 0,
            cost_per_1k_input=model_cfg["cost_input"],
            cost_per_1k_output=model_cfg["cost_output"],
            cache_hit_input_tokens=cache_hit_tokens,
            latency_ms=model_cfg["latency"] * self.rng.uniform(0.8, 1.5),
        )

    def _make_tool_calls(self, task_type: TaskType, scenario: Dict) -> List[ToolCall]:
        """Build the tool calls of one step, honouring the scenario's tool knobs.

        ``extra_tools`` adds calls beyond the sampled count (wrapping around
        the task's tool list so overuse can exceed it), ``missing_tools``
        removes calls, and ``extra_retrievals`` appends that many additional
        "retrieve" calls.
        """
        base_tools = self._get_tools_for_task(task_type)
        num_tools = self.rng.randint(0, len(base_tools))
        num_tools += scenario.get("extra_tools") or 0
        num_tools = max(0, num_tools - (scenario.get("missing_tools") or 0))

        planned: List[str] = []
        if base_tools:
            planned = [base_tools[i % len(base_tools)] for i in range(num_tools)]
        planned.extend(["retrieve"] * (scenario.get("extra_retrievals") or 0))

        # Tool failures are more likely in scenarios built around tool trouble.
        fail_rate = 0.2 if scenario["name"] in ("retry_loop", "tool_underuse") else 0.05

        calls = []
        for tool_name in planned:
            repeated = self.rng.random() < 0.1
            ignored = self.rng.random() < 0.05
            failed = self.rng.random() < fail_rate
            calls.append(ToolCall(
                tool_name=tool_name,
                tool_input={"query": f"auto_{tool_name}"},
                tool_cost=self.TOOL_COSTS.get(tool_name, 0.001),
                tool_latency_ms=self.rng.uniform(100, 1000),
                cache_hit=self.rng.random() < 0.2,
                repeated=repeated,
                ignored_result=ignored,
                failed=failed,
            ))
        return calls

    def _make_verifier_calls(
        self, step_id: str, task_type: TaskType, scenario: Dict
    ) -> List[VerifierCall]:
        """Build the verifier calls of one step.

        Legal, coding and research tasks get one verifier call half of the
        time; ``extra_verifiers`` in the scenario adds more on top.
        """
        num_verifiers = 0
        if task_type in self._VERIFIED_TASK_TYPES:
            num_verifiers = 1 if self.rng.random() < 0.5 else 0
        num_verifiers += scenario.get("extra_verifiers") or 0

        calls = []
        for _ in range(num_verifiers):
            passed = self.rng.random() < 0.8
            calls.append(VerifierCall(
                verifier_model_id="verifier_medium",
                target_step_id=step_id,
                passed=passed,
                confidence=self.rng.uniform(0.6, 0.99),
                cost=0.005,
                latency_ms=500,
            ))
        return calls

    def _get_tools_for_task(self, task_type: TaskType) -> List[str]:
        """Return the tools available to ``task_type`` (["search"] if unknown).

        A fresh list is returned on every call, so callers may mutate it.
        """
        return list(self._TASK_TOOLS.get(task_type, ["search"]))

    def to_dicts(self, traces: List[AgentTrace]) -> List[Dict[str, Any]]:
        """Convert traces to plain dicts via each trace's ``to_dict()``."""
        return [t.to_dict() for t in traces]

    def save(self, traces: List[AgentTrace], path: str) -> None:
        """Write traces to ``path`` as JSON Lines (one trace per line, UTF-8)."""
        with open(path, "w", encoding="utf-8") as f:
            for trace in traces:
                f.write(json.dumps(trace.to_dict()) + "\n")

    def load(self, path: str) -> List[Dict[str, Any]]:
        """Read a JSON Lines file written by ``save``.

        Returns the parsed dicts (not ``AgentTrace`` objects). Blank lines,
        such as a trailing empty line, are skipped.
        """
        with open(path, "r", encoding="utf-8") as f:
            return [json.loads(line) for line in f if line.strip()]