Spaces:
Paused
Paused
rb125 commited on
Commit ·
3f2f227
1
Parent(s): bd6e10c
autonomous agents + live simulation runner
Browse files- agents/autonomous.py +887 -0
- cgae_engine/economy.py +423 -67
- server/live_runner.py +1575 -0
agents/autonomous.py
ADDED
|
@@ -0,0 +1,887 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Autonomous Agent v2 — CGAE Economic Actor
|
| 3 |
+
==========================================
|
| 4 |
+
|
| 5 |
+
Implements the v2 Autonomous Agent Architecture specification.
|
| 6 |
+
|
| 7 |
+
Separation of Cognition from Economy
|
| 8 |
+
-------------------------------------
|
| 9 |
+
The LLM handles task *execution*. Everything else — contract evaluation,
|
| 10 |
+
bidding strategy, robustness tracking, financial management — is deterministic
|
| 11 |
+
code. This makes the agent's economic behaviour inspectable without LLM
|
| 12 |
+
introspection, and keeps gas costs low.
|
| 13 |
+
|
| 14 |
+
Layers
|
| 15 |
+
------
|
| 16 |
+
PerceptionLayer — constraint / domain pass-rate learning
|
| 17 |
+
AccountingLayer — balance, exposure, reserves, burn-rate
|
| 18 |
+
PlanningLayer — EV / RAEV contract scoring + strategy delegation
|
| 19 |
+
ExecutionLayer — LLM call with constraint-aware prompts, self-verify, retry
|
| 20 |
+
|
| 21 |
+
Strategies (pluggable via StrategyInterface)
|
| 22 |
+
--------------------------------------------
|
| 23 |
+
GrowthStrategy — robustness-investment growth; the Theorem 2 agent
|
| 24 |
+
ConservativeStrategy — low-risk, low-utilisation; survives longest
|
| 25 |
+
OpportunisticStrategy — high-risk, max-reward; highest variance
|
| 26 |
+
SpecialistStrategy — domain-focused; improves pass rate in chosen domains
|
| 27 |
+
AdversarialStrategy — probes system limits; validates Proposition 2
|
| 28 |
+
|
| 29 |
+
Migration (Phase 1)
|
| 30 |
+
-------------------
|
| 31 |
+
Drop-in replacement for the bare LLMAgent + manual logic in live_runner.py.
|
| 32 |
+
The runner still handles contract posting, acceptance and Economy settlement.
|
| 33 |
+
AutonomousAgent.plan_task() — replaces random.choice(available_tasks)
|
| 34 |
+
AutonomousAgent.execute_task() — replaces llm_agent.execute_task() + retry
|
| 35 |
+
AutonomousAgent.update_state() — replaces inline robustness update logic
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
from __future__ import annotations
|
| 39 |
+
|
| 40 |
+
import logging
|
| 41 |
+
import math
|
| 42 |
+
import random
|
| 43 |
+
import re
|
| 44 |
+
import time
|
| 45 |
+
from abc import ABC, abstractmethod
|
| 46 |
+
from dataclasses import dataclass, field
|
| 47 |
+
from typing import Any, Optional
|
| 48 |
+
|
| 49 |
+
from cgae_engine.gate import GateFunction, RobustnessVector, Tier, TierThresholds
|
| 50 |
+
|
| 51 |
+
logger = logging.getLogger(__name__)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ---------------------------------------------------------------------------
|
| 55 |
+
# Data structures
|
| 56 |
+
# ---------------------------------------------------------------------------
|
| 57 |
+
|
| 58 |
+
@dataclass(frozen=True)
|
| 59 |
+
class AgentState:
|
| 60 |
+
"""Complete agent state snapshot passed to strategies each planning cycle."""
|
| 61 |
+
# Identity
|
| 62 |
+
agent_id: str
|
| 63 |
+
model_name: str
|
| 64 |
+
|
| 65 |
+
# Robustness
|
| 66 |
+
certified_robustness: RobustnessVector
|
| 67 |
+
effective_robustness: RobustnessVector # after temporal decay
|
| 68 |
+
certified_tier: Tier
|
| 69 |
+
effective_tier: Tier
|
| 70 |
+
binding_dimension: Optional[str] # "cc", "er", or "as"
|
| 71 |
+
gap_to_next_tier: dict # dim -> gap float
|
| 72 |
+
|
| 73 |
+
# Financial
|
| 74 |
+
balance: float
|
| 75 |
+
available_for_contracts: float
|
| 76 |
+
active_exposure: float
|
| 77 |
+
remaining_ceiling: float
|
| 78 |
+
burn_rate: float
|
| 79 |
+
rounds_until_insolvency: float
|
| 80 |
+
roi: float
|
| 81 |
+
|
| 82 |
+
# Performance history
|
| 83 |
+
constraint_pass_rates: dict # constraint_name -> float
|
| 84 |
+
domain_pass_rates: dict # domain -> float
|
| 85 |
+
total_contracts_completed: int
|
| 86 |
+
total_contracts_failed: int
|
| 87 |
+
win_rate: float
|
| 88 |
+
|
| 89 |
+
# Temporal
|
| 90 |
+
time_since_certification: float
|
| 91 |
+
spot_audit_probability: float
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
@dataclass(frozen=True)
|
| 95 |
+
class ScoredContract:
|
| 96 |
+
"""A contract that has been pre-evaluated by the Planning Layer."""
|
| 97 |
+
contract_id: str
|
| 98 |
+
task_id: str
|
| 99 |
+
min_tier: Tier
|
| 100 |
+
domain: str
|
| 101 |
+
constraint_types: list # list[str]
|
| 102 |
+
reward: float
|
| 103 |
+
penalty: float
|
| 104 |
+
deadline: float
|
| 105 |
+
difficulty: float
|
| 106 |
+
|
| 107 |
+
# Computed by PlanningLayer
|
| 108 |
+
estimated_pass_probability: float
|
| 109 |
+
estimated_token_cost: float
|
| 110 |
+
expected_value: float # p*R - (1-p)*P - cost
|
| 111 |
+
risk_premium: float # penalty² / (2 * balance)
|
| 112 |
+
risk_adjusted_ev: float # EV - risk_premium
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
@dataclass
|
| 116 |
+
class ExecutionResult:
|
| 117 |
+
"""Result of executing a task through the ExecutionLayer."""
|
| 118 |
+
output: str
|
| 119 |
+
token_usage: dict # input_tokens, output_tokens
|
| 120 |
+
token_cost_eth: float
|
| 121 |
+
latency_ms: float
|
| 122 |
+
retries_used: int
|
| 123 |
+
self_check_passed: bool
|
| 124 |
+
self_check_failures: list # constraint names that failed self-check
|
| 125 |
+
self_check_diagnostics: dict # name -> diagnostic string
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
@dataclass
|
| 129 |
+
class RobustnessInvestment:
|
| 130 |
+
"""An instruction to invest in a robustness dimension."""
|
| 131 |
+
dimension: str # "cc", "er", or "as"
|
| 132 |
+
budget: float # ETH to spend
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
# ---------------------------------------------------------------------------
|
| 136 |
+
# Strategy interface and concrete implementations
|
| 137 |
+
# ---------------------------------------------------------------------------
|
| 138 |
+
|
| 139 |
+
class StrategyInterface(ABC):
|
| 140 |
+
"""Pluggable decision policy for the Planning Layer."""
|
| 141 |
+
|
| 142 |
+
@abstractmethod
|
| 143 |
+
def rank_contracts(
|
| 144 |
+
self,
|
| 145 |
+
eligible: list, # list[ScoredContract]
|
| 146 |
+
state: AgentState,
|
| 147 |
+
) -> list: # ordered list[ScoredContract]
|
| 148 |
+
...
|
| 149 |
+
|
| 150 |
+
@abstractmethod
|
| 151 |
+
def should_invest_robustness(
|
| 152 |
+
self, state: AgentState
|
| 153 |
+
) -> Optional[RobustnessInvestment]:
|
| 154 |
+
...
|
| 155 |
+
|
| 156 |
+
@abstractmethod
|
| 157 |
+
def max_utilization(self) -> float:
|
| 158 |
+
"""Fraction of budget ceiling willing to commit."""
|
| 159 |
+
...
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
class GrowthStrategy(StrategyInterface):
|
| 163 |
+
"""
|
| 164 |
+
Invests in robustness to unlock higher tiers.
|
| 165 |
+
The Theorem 2 agent: robustness investment as profit strategy.
|
| 166 |
+
|
| 167 |
+
Accepts any positive RAEV contract, prefers higher tiers.
|
| 168 |
+
Invests when binding dimension is within ``invest_threshold`` of next tier
|
| 169 |
+
and projected ROI is positive within 20 rounds.
|
| 170 |
+
"""
|
| 171 |
+
RAEV_MIN = 0.0
|
| 172 |
+
INVEST_THRESHOLD = 0.07 # Invest when < 0.07 from next tier threshold
|
| 173 |
+
MAX_INVEST_FRACTION = 0.20 # Max 20% of balance per invest action
|
| 174 |
+
|
| 175 |
+
def rank_contracts(self, eligible, state):
|
| 176 |
+
# Sort by RAEV; break ties by preferring higher tiers
|
| 177 |
+
return sorted(
|
| 178 |
+
eligible,
|
| 179 |
+
key=lambda c: (c.risk_adjusted_ev + c.min_tier.value * 0.005),
|
| 180 |
+
reverse=True,
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
+
def should_invest_robustness(self, state: AgentState) -> Optional[RobustnessInvestment]:
|
| 184 |
+
if state.binding_dimension is None:
|
| 185 |
+
return None
|
| 186 |
+
gap = state.gap_to_next_tier.get(state.binding_dimension, 1.0)
|
| 187 |
+
if gap > self.INVEST_THRESHOLD:
|
| 188 |
+
return None
|
| 189 |
+
# Can we afford it?
|
| 190 |
+
budget = min(
|
| 191 |
+
state.available_for_contracts * self.MAX_INVEST_FRACTION,
|
| 192 |
+
state.balance * 0.10,
|
| 193 |
+
)
|
| 194 |
+
if budget < 0.005:
|
| 195 |
+
return None
|
| 196 |
+
# Rough ROI check: does the tier upgrade pay back within 20 rounds?
|
| 197 |
+
tier_idx = state.certified_tier.value
|
| 198 |
+
next_tier_reward_uplift = (tier_idx + 1) * 0.01 # Rough per-round uplift
|
| 199 |
+
if next_tier_reward_uplift * 20 > budget:
|
| 200 |
+
return RobustnessInvestment(dimension=state.binding_dimension, budget=budget)
|
| 201 |
+
return None
|
| 202 |
+
|
| 203 |
+
def max_utilization(self) -> float:
|
| 204 |
+
return 0.70
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
class ConservativeStrategy(StrategyInterface):
|
| 208 |
+
"""
|
| 209 |
+
Only high-certainty, low-penalty contracts.
|
| 210 |
+
Never invests in robustness. Survives the longest.
|
| 211 |
+
"""
|
| 212 |
+
RAEV_MIN = 0.002
|
| 213 |
+
MAX_DIFFICULTY = 0.5
|
| 214 |
+
|
| 215 |
+
def rank_contracts(self, eligible, state):
|
| 216 |
+
# Prefer lowest-penalty contracts above the RAEV minimum
|
| 217 |
+
passing = [c for c in eligible if c.risk_adjusted_ev >= self.RAEV_MIN
|
| 218 |
+
and c.difficulty <= self.MAX_DIFFICULTY]
|
| 219 |
+
return sorted(passing, key=lambda c: c.penalty)
|
| 220 |
+
|
| 221 |
+
def should_invest_robustness(self, state: AgentState) -> Optional[RobustnessInvestment]:
|
| 222 |
+
return None # Never invests
|
| 223 |
+
|
| 224 |
+
def max_utilization(self) -> float:
|
| 225 |
+
return 0.30
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
class OpportunisticStrategy(StrategyInterface):
|
| 229 |
+
"""
|
| 230 |
+
Max-reward, high-risk. Uses raw EV (not risk-adjusted).
|
| 231 |
+
Most likely to go insolvent; highest upside in good rounds.
|
| 232 |
+
"""
|
| 233 |
+
def rank_contracts(self, eligible, state):
|
| 234 |
+
return sorted(eligible, key=lambda c: c.expected_value, reverse=True)
|
| 235 |
+
|
| 236 |
+
def should_invest_robustness(self, state: AgentState) -> Optional[RobustnessInvestment]:
|
| 237 |
+
# Only if stuck at T0 — must reach T1 to earn anything
|
| 238 |
+
if state.certified_tier == Tier.T0 and state.available_for_contracts > 0.02:
|
| 239 |
+
dim = state.binding_dimension or "as"
|
| 240 |
+
return RobustnessInvestment(dimension=dim, budget=state.available_for_contracts * 0.30)
|
| 241 |
+
return None
|
| 242 |
+
|
| 243 |
+
def max_utilization(self) -> float:
|
| 244 |
+
return 0.90
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
class SpecialistStrategy(StrategyInterface):
|
| 248 |
+
"""
|
| 249 |
+
Domain-focused: only accepts contracts in its two best domains.
|
| 250 |
+
Higher RAEV threshold for unfamiliar territory.
|
| 251 |
+
Invests in constraint types where failure rate exceeds 30%.
|
| 252 |
+
"""
|
| 253 |
+
SPECIALIST_RAEV_MIN = 0.001
|
| 254 |
+
GENERALIST_RAEV_MIN = 0.010
|
| 255 |
+
NUM_SPECIALTY_DOMAINS = 2
|
| 256 |
+
FAILURE_INVEST_THRESHOLD = 0.30
|
| 257 |
+
|
| 258 |
+
def rank_contracts(self, eligible, state):
|
| 259 |
+
top_domains = sorted(
|
| 260 |
+
state.domain_pass_rates,
|
| 261 |
+
key=state.domain_pass_rates.get,
|
| 262 |
+
reverse=True,
|
| 263 |
+
)[:self.NUM_SPECIALTY_DOMAINS]
|
| 264 |
+
|
| 265 |
+
def score(c: ScoredContract) -> float:
|
| 266 |
+
in_specialty = c.domain in top_domains
|
| 267 |
+
thresh = self.SPECIALIST_RAEV_MIN if in_specialty else self.GENERALIST_RAEV_MIN
|
| 268 |
+
return c.risk_adjusted_ev if (in_specialty or c.risk_adjusted_ev > thresh) else -999
|
| 269 |
+
|
| 270 |
+
ranked = [c for c in eligible if score(c) > -998]
|
| 271 |
+
return sorted(ranked, key=score, reverse=True)
|
| 272 |
+
|
| 273 |
+
def should_invest_robustness(self, state: AgentState) -> Optional[RobustnessInvestment]:
|
| 274 |
+
# Find the constraint type with worst pass rate
|
| 275 |
+
if not state.constraint_pass_rates:
|
| 276 |
+
return None
|
| 277 |
+
worst_dim_name, worst_rate = min(
|
| 278 |
+
state.constraint_pass_rates.items(), key=lambda kv: kv[1]
|
| 279 |
+
)
|
| 280 |
+
if worst_rate > (1.0 - self.FAILURE_INVEST_THRESHOLD):
|
| 281 |
+
return None
|
| 282 |
+
# Map constraint family → robustness dimension
|
| 283 |
+
dim = "cc"
|
| 284 |
+
if any(k in worst_dim_name for k in ("keyword", "factual", "accuracy")):
|
| 285 |
+
dim = "er"
|
| 286 |
+
elif any(k in worst_dim_name for k in ("caveat", "harm", "refusal", "ethical")):
|
| 287 |
+
dim = "as"
|
| 288 |
+
budget = state.available_for_contracts * 0.15
|
| 289 |
+
if budget < 0.003:
|
| 290 |
+
return None
|
| 291 |
+
return RobustnessInvestment(dimension=dim, budget=budget)
|
| 292 |
+
|
| 293 |
+
def max_utilization(self) -> float:
|
| 294 |
+
return 0.50
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
class AdversarialStrategy(StrategyInterface):
|
| 298 |
+
"""
|
| 299 |
+
Probes system limits: selects borderline contracts and minimally invests
|
| 300 |
+
in AS (trying to game the ethical gate). Validates Proposition 2.
|
| 301 |
+
"""
|
| 302 |
+
def rank_contracts(self, eligible, state):
|
| 303 |
+
# Prefer contracts where self-check shows near-borderline probability
|
| 304 |
+
return sorted(
|
| 305 |
+
eligible,
|
| 306 |
+
key=lambda c: abs(c.estimated_pass_probability - 0.55),
|
| 307 |
+
)
|
| 308 |
+
|
| 309 |
+
def should_invest_robustness(self, state: AgentState) -> Optional[RobustnessInvestment]:
|
| 310 |
+
if state.binding_dimension == "as" and state.available_for_contracts > 0.01:
|
| 311 |
+
return RobustnessInvestment(dimension="as", budget=0.005)
|
| 312 |
+
return None
|
| 313 |
+
|
| 314 |
+
def max_utilization(self) -> float:
|
| 315 |
+
return 0.95
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
STRATEGY_MAP: dict[str, StrategyInterface] = {
|
| 319 |
+
"growth": GrowthStrategy(),
|
| 320 |
+
"conservative": ConservativeStrategy(),
|
| 321 |
+
"opportunistic": OpportunisticStrategy(),
|
| 322 |
+
"specialist": SpecialistStrategy(),
|
| 323 |
+
"adversarial": AdversarialStrategy(),
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
# ---------------------------------------------------------------------------
|
| 328 |
+
# Perception Layer
|
| 329 |
+
# ---------------------------------------------------------------------------
|
| 330 |
+
|
| 331 |
+
class PerceptionLayer:
|
| 332 |
+
"""
|
| 333 |
+
Tracks per-constraint and per-domain pass rates from task history.
|
| 334 |
+
Updated after every contract settlement via update_from_result().
|
| 335 |
+
"""
|
| 336 |
+
|
| 337 |
+
def __init__(self):
|
| 338 |
+
# Running history: name -> list[bool]
|
| 339 |
+
self._constraint_history: dict[str, list] = {}
|
| 340 |
+
self._domain_history: dict[str, list] = {}
|
| 341 |
+
|
| 342 |
+
@property
|
| 343 |
+
def constraint_pass_rates(self) -> dict:
|
| 344 |
+
return {
|
| 345 |
+
name: (sum(hist) / len(hist))
|
| 346 |
+
for name, hist in self._constraint_history.items()
|
| 347 |
+
if hist
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
@property
|
| 351 |
+
def domain_pass_rates(self) -> dict:
|
| 352 |
+
return {
|
| 353 |
+
domain: (sum(hist) / len(hist))
|
| 354 |
+
for domain, hist in self._domain_history.items()
|
| 355 |
+
if hist
|
| 356 |
+
}
|
| 357 |
+
|
| 358 |
+
def update_from_result(self, task: Any, verification: Any):
|
| 359 |
+
"""Call after each verification to update running pass rates."""
|
| 360 |
+
domain = getattr(task, "domain", "unknown")
|
| 361 |
+
self._domain_history.setdefault(domain, []).append(
|
| 362 |
+
bool(getattr(verification, "overall_pass", False))
|
| 363 |
+
)
|
| 364 |
+
for c in getattr(task, "constraints", []):
|
| 365 |
+
passed = c.name in getattr(verification, "constraints_passed", [])
|
| 366 |
+
self._domain_history.setdefault(f"constraint:{c.name}", [])
|
| 367 |
+
self._constraint_history.setdefault(c.name, []).append(passed)
|
| 368 |
+
|
| 369 |
+
def estimated_pass_prob(self, task: Any) -> float:
|
| 370 |
+
"""
|
| 371 |
+
Estimate pass probability for a task based on constraint and domain history.
|
| 372 |
+
Falls back to 0.65 when no history is available — modern LLMs pass
|
| 373 |
+
straightforward tasks at well above chance, so 0.5 systematically
|
| 374 |
+
underestimates EV and suppresses all task selection at startup.
|
| 375 |
+
"""
|
| 376 |
+
domain = getattr(task, "domain", "unknown")
|
| 377 |
+
domain_rate = self.domain_pass_rates.get(domain, 0.65)
|
| 378 |
+
constraints = getattr(task, "constraints", [])
|
| 379 |
+
if not constraints:
|
| 380 |
+
return domain_rate
|
| 381 |
+
rates = [self.constraint_pass_rates.get(c.name, 0.65) for c in constraints]
|
| 382 |
+
constraint_rate = math.prod(rates) if rates else 0.65
|
| 383 |
+
return (constraint_rate + domain_rate) / 2.0
|
| 384 |
+
|
| 385 |
+
|
| 386 |
+
# ---------------------------------------------------------------------------
|
| 387 |
+
# Accounting Layer
|
| 388 |
+
# ---------------------------------------------------------------------------
|
| 389 |
+
|
| 390 |
+
class AccountingLayer:
|
| 391 |
+
"""
|
| 392 |
+
Financial management with layered reserves.
|
| 393 |
+
|
| 394 |
+
Reserves (in priority order, all deducted before contract funds):
|
| 395 |
+
MINIMUM_RESERVE — hard floor; triggers SelfSuspend if breached
|
| 396 |
+
AUDIT_RESERVE — 1 full 4-dim audit cycle
|
| 397 |
+
(gas reserve is implicit in MINIMUM_RESERVE for off-chain simulation)
|
| 398 |
+
|
| 399 |
+
available_for_contracts = balance - active_exposure
|
| 400 |
+
- MINIMUM_RESERVE - AUDIT_RESERVE
|
| 401 |
+
"""
|
| 402 |
+
|
| 403 |
+
MINIMUM_RESERVE: float = 0.05 # ETH hard floor
|
| 404 |
+
AUDIT_RESERVE: float = 0.02 # ~4 dims × 0.005 ETH
|
| 405 |
+
MAX_UTILIZATION: float = 0.70 # Max fraction of ceiling to commit
|
| 406 |
+
|
| 407 |
+
def __init__(self, initial_balance: float):
|
| 408 |
+
self.balance: float = initial_balance
|
| 409 |
+
self.active_exposure: float = 0.0
|
| 410 |
+
self.cumulative_earned: float = 0.0
|
| 411 |
+
self.cumulative_spent: float = 0.0
|
| 412 |
+
self.cumulative_penalties: float = 0.0
|
| 413 |
+
self._burn_samples: list = [] # Recent ETH-per-round costs
|
| 414 |
+
|
| 415 |
+
@property
|
| 416 |
+
def available_for_contracts(self) -> float:
|
| 417 |
+
return max(
|
| 418 |
+
0.0,
|
| 419 |
+
self.balance
|
| 420 |
+
- self.active_exposure
|
| 421 |
+
- self.MINIMUM_RESERVE
|
| 422 |
+
- self.AUDIT_RESERVE,
|
| 423 |
+
)
|
| 424 |
+
|
| 425 |
+
@property
|
| 426 |
+
def roi(self) -> float:
|
| 427 |
+
spent = self.cumulative_spent + self.cumulative_penalties
|
| 428 |
+
if spent == 0:
|
| 429 |
+
return 0.0
|
| 430 |
+
return (self.cumulative_earned - spent) / spent
|
| 431 |
+
|
| 432 |
+
@property
|
| 433 |
+
def burn_rate(self) -> float:
|
| 434 |
+
if not self._burn_samples:
|
| 435 |
+
return 0.001 # Assume small storage cost until we have data
|
| 436 |
+
return sum(self._burn_samples[-10:]) / len(self._burn_samples[-10:])
|
| 437 |
+
|
| 438 |
+
@property
|
| 439 |
+
def rounds_until_insolvency(self) -> float:
|
| 440 |
+
br = self.burn_rate
|
| 441 |
+
if br <= 0:
|
| 442 |
+
return float("inf")
|
| 443 |
+
return max(0.0, (self.balance - self.MINIMUM_RESERVE) / br)
|
| 444 |
+
|
| 445 |
+
def can_afford(self, penalty: float, token_cost: float) -> bool:
|
| 446 |
+
"""Check whether accepting a contract keeps us solvent."""
|
| 447 |
+
new_exposure = self.active_exposure + penalty
|
| 448 |
+
headroom = self.balance - new_exposure - self.MINIMUM_RESERVE - self.AUDIT_RESERVE
|
| 449 |
+
return headroom >= token_cost
|
| 450 |
+
|
| 451 |
+
def record_round_cost(self, cost: float):
|
| 452 |
+
self._burn_samples.append(cost)
|
| 453 |
+
|
| 454 |
+
def sync_from_record(self, record: Any):
|
| 455 |
+
"""Sync from Economy AgentRecord (source of truth for balance)."""
|
| 456 |
+
self.balance = record.balance
|
| 457 |
+
self.cumulative_earned = record.total_earned
|
| 458 |
+
self.cumulative_spent = record.total_spent
|
| 459 |
+
self.cumulative_penalties = record.total_penalties
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
# ---------------------------------------------------------------------------
|
| 463 |
+
# Execution Layer
|
| 464 |
+
# ---------------------------------------------------------------------------
|
| 465 |
+
|
| 466 |
+
class ExecutionLayer:
|
| 467 |
+
"""
|
| 468 |
+
Executes tasks with:
|
| 469 |
+
1. Constraint-aware system prompt injection
|
| 470 |
+
2. Self-verification using the same checks the verifier will run
|
| 471 |
+
3. Retry loop (up to max_retries) when self-check detects failures
|
| 472 |
+
|
| 473 |
+
Self-check only covers algorithmic constraints (format, keywords, JSON).
|
| 474 |
+
Jury evaluation cannot be pre-checked — this is by design.
|
| 475 |
+
"""
|
| 476 |
+
|
| 477 |
+
def __init__(self, llm_agent: Any, self_verify: bool = True, max_retries: int = 2):
|
| 478 |
+
self.llm = llm_agent
|
| 479 |
+
self.self_verify = self_verify
|
| 480 |
+
self.max_retries = max_retries
|
| 481 |
+
|
| 482 |
+
def execute(self, task: Any, token_cost_fn) -> ExecutionResult:
|
| 483 |
+
"""
|
| 484 |
+
Execute a task end-to-end and return a structured result.
|
| 485 |
+
``token_cost_fn()`` is called with (model_name, in_tok, out_tok) to
|
| 486 |
+
compute ETH cost; the caller owns cost accounting.
|
| 487 |
+
"""
|
| 488 |
+
system_prompt = self._build_system_prompt(task)
|
| 489 |
+
user_prompt = task.prompt
|
| 490 |
+
|
| 491 |
+
tokens_in_before = self.llm.total_input_tokens
|
| 492 |
+
tokens_out_before = self.llm.total_output_tokens
|
| 493 |
+
start = time.time()
|
| 494 |
+
|
| 495 |
+
output = self.llm.execute_task(user_prompt, system_prompt)
|
| 496 |
+
retries = 0
|
| 497 |
+
self_check_result: dict = {"passed": True, "failures": [], "diagnostics": {}}
|
| 498 |
+
|
| 499 |
+
if self.self_verify:
|
| 500 |
+
self_check_result = self._self_check(task, output)
|
| 501 |
+
|
| 502 |
+
for attempt in range(self.max_retries):
|
| 503 |
+
if self_check_result["passed"]:
|
| 504 |
+
break
|
| 505 |
+
retries += 1
|
| 506 |
+
retry_prompt = self._build_retry_prompt(
|
| 507 |
+
user_prompt, self_check_result["failures"],
|
| 508 |
+
self_check_result["diagnostics"],
|
| 509 |
+
)
|
| 510 |
+
output = self.llm.execute_task(retry_prompt, system_prompt)
|
| 511 |
+
self_check_result = self._self_check(task, output)
|
| 512 |
+
|
| 513 |
+
latency_ms = (time.time() - start) * 1000
|
| 514 |
+
in_tok = self.llm.total_input_tokens - tokens_in_before
|
| 515 |
+
out_tok = self.llm.total_output_tokens - tokens_out_before
|
| 516 |
+
token_cost = token_cost_fn(self.llm.model_name, in_tok, out_tok)
|
| 517 |
+
|
| 518 |
+
return ExecutionResult(
|
| 519 |
+
output=output,
|
| 520 |
+
token_usage={"input": in_tok, "output": out_tok},
|
| 521 |
+
token_cost_eth=token_cost,
|
| 522 |
+
latency_ms=latency_ms,
|
| 523 |
+
retries_used=retries,
|
| 524 |
+
self_check_passed=self_check_result["passed"],
|
| 525 |
+
self_check_failures=self_check_result["failures"],
|
| 526 |
+
self_check_diagnostics=self_check_result["diagnostics"],
|
| 527 |
+
)
|
| 528 |
+
|
| 529 |
+
def _build_system_prompt(self, task: Any) -> str:
|
| 530 |
+
base = task.system_prompt or ""
|
| 531 |
+
if not task.constraints:
|
| 532 |
+
return base
|
| 533 |
+
lines = [
|
| 534 |
+
base,
|
| 535 |
+
"\n\n[CONSTRAINT REQUIREMENTS — you MUST satisfy ALL of the following]",
|
| 536 |
+
]
|
| 537 |
+
for c in task.constraints:
|
| 538 |
+
lines.append(f" • {c.name}: {c.description}")
|
| 539 |
+
return "\n".join(lines)
|
| 540 |
+
|
| 541 |
+
def _self_check(self, task: Any, output: str) -> dict:
|
| 542 |
+
"""Run algorithmic constraint checks identical to what the verifier will do."""
|
| 543 |
+
failures: list = []
|
| 544 |
+
diagnostics: dict = {}
|
| 545 |
+
for c in task.constraints:
|
| 546 |
+
try:
|
| 547 |
+
passed = c.check(output)
|
| 548 |
+
except Exception:
|
| 549 |
+
passed = True # Don't penalise unknown constraint types
|
| 550 |
+
if not passed:
|
| 551 |
+
failures.append(c.name)
|
| 552 |
+
diagnostics[c.name] = self._diagnose(c, output)
|
| 553 |
+
return {
|
| 554 |
+
"passed": len(failures) == 0,
|
| 555 |
+
"failures": failures,
|
| 556 |
+
"diagnostics": diagnostics,
|
| 557 |
+
}
|
| 558 |
+
|
| 559 |
+
@staticmethod
|
| 560 |
+
def _diagnose(constraint: Any, output: str) -> str:
|
| 561 |
+
name = constraint.name
|
| 562 |
+
if "word_count" in name:
|
| 563 |
+
count = len(output.split())
|
| 564 |
+
return f"Word count is {count}"
|
| 565 |
+
if "valid_json" in name:
|
| 566 |
+
return "Output is not valid JSON"
|
| 567 |
+
if "keyword" in name or "contain" in name:
|
| 568 |
+
desc = getattr(constraint, "description", "")
|
| 569 |
+
return f"Keyword check failed: {desc}"
|
| 570 |
+
if "section" in name:
|
| 571 |
+
return "Required section(s) missing from output"
|
| 572 |
+
return f"Constraint '{name}' not satisfied"
|
| 573 |
+
|
| 574 |
+
@staticmethod
|
| 575 |
+
def _build_retry_prompt(original: str, failures: list, diagnostics: dict) -> str:
|
| 576 |
+
diag_lines = "\n".join(
|
| 577 |
+
f" - {name}: {msg}" for name, msg in diagnostics.items()
|
| 578 |
+
)
|
| 579 |
+
return (
|
| 580 |
+
f"{original}\n\n"
|
| 581 |
+
f"[REVISION REQUIRED]\n"
|
| 582 |
+
f"Your previous response failed these constraints:\n"
|
| 583 |
+
f"{diag_lines}\n\n"
|
| 584 |
+
f"Please regenerate your response, fixing these issues while "
|
| 585 |
+
f"preserving the quality of your answer."
|
| 586 |
+
)
|
| 587 |
+
|
| 588 |
+
|
| 589 |
+
# ---------------------------------------------------------------------------
|
| 590 |
+
# Planning Layer
|
| 591 |
+
# ---------------------------------------------------------------------------
|
| 592 |
+
|
| 593 |
+
class PlanningLayer:
|
| 594 |
+
"""
|
| 595 |
+
Evaluates available tasks using EV / RAEV and delegates ranking to the
|
| 596 |
+
injected strategy. Also decides whether to invest in robustness.
|
| 597 |
+
"""
|
| 598 |
+
|
| 599 |
+
def __init__(self, strategy: StrategyInterface, token_cost_fn):
|
| 600 |
+
self.strategy = strategy
|
| 601 |
+
self._token_cost_fn = token_cost_fn # (model, in_tok, out_tok) -> float
|
| 602 |
+
|
| 603 |
+
def score_task(
|
| 604 |
+
self,
|
| 605 |
+
task: Any,
|
| 606 |
+
state: AgentState,
|
| 607 |
+
pass_prob: float,
|
| 608 |
+
) -> ScoredContract:
|
| 609 |
+
"""Score a single task and wrap it as a ScoredContract."""
|
| 610 |
+
# Token estimate scales with task tier: simpler tasks use fewer tokens.
|
| 611 |
+
# T1≈200+100, T2≈400+200, T3≈600+300, T4+≈800+400
|
| 612 |
+
tier_val = getattr(getattr(task, "tier", None), "value", 2)
|
| 613 |
+
in_tokens = max(200, min(800, 200 * tier_val))
|
| 614 |
+
out_tokens = max(100, min(400, 100 * tier_val))
|
| 615 |
+
est_token_cost = self._token_cost_fn(state.model_name, in_tokens, out_tokens)
|
| 616 |
+
|
| 617 |
+
reward = task.reward
|
| 618 |
+
penalty = task.penalty
|
| 619 |
+
ev = pass_prob * reward - (1.0 - pass_prob) * penalty - est_token_cost
|
| 620 |
+
|
| 621 |
+
# Risk premium: convex in penalty/balance — agents become risk-averse
|
| 622 |
+
# as penalties approach their balance (spec Eq)
|
| 623 |
+
balance = max(state.balance, 0.001) # avoid divide-by-zero
|
| 624 |
+
risk_prem = (penalty ** 2) / (2.0 * balance)
|
| 625 |
+
raev = ev - risk_prem
|
| 626 |
+
|
| 627 |
+
return ScoredContract(
|
| 628 |
+
contract_id="", # filled in by caller
|
| 629 |
+
task_id=task.task_id,
|
| 630 |
+
min_tier=task.tier,
|
| 631 |
+
domain=task.domain,
|
| 632 |
+
constraint_types=[c.name for c in task.constraints],
|
| 633 |
+
reward=reward,
|
| 634 |
+
penalty=penalty,
|
| 635 |
+
deadline=0.0,
|
| 636 |
+
difficulty=task.difficulty,
|
| 637 |
+
estimated_pass_probability=pass_prob,
|
| 638 |
+
estimated_token_cost=est_token_cost,
|
| 639 |
+
expected_value=ev,
|
| 640 |
+
risk_premium=risk_prem,
|
| 641 |
+
risk_adjusted_ev=raev,
|
| 642 |
+
)
|
| 643 |
+
|
| 644 |
+
def select_task(
|
| 645 |
+
self,
|
| 646 |
+
available_tasks: list,
|
| 647 |
+
state: AgentState,
|
| 648 |
+
perception: PerceptionLayer,
|
| 649 |
+
accounting: AccountingLayer,
|
| 650 |
+
) -> Optional[Any]:
|
| 651 |
+
"""
|
| 652 |
+
Return the best task to attempt, or None if nothing is worthwhile.
|
| 653 |
+
|
| 654 |
+
Safety checks run first (hard gates).
|
| 655 |
+
Then contract evaluation.
|
| 656 |
+
Then strategy ranking.
|
| 657 |
+
"""
|
| 658 |
+
# --- Safety checks --------------------------------------------------
|
| 659 |
+
if state.balance < AccountingLayer.MINIMUM_RESERVE:
|
| 660 |
+
logger.warning(
|
| 661 |
+
f"[{state.model_name}] balance {state.balance:.4f} below minimum "
|
| 662 |
+
f"reserve — suspending"
|
| 663 |
+
)
|
| 664 |
+
return None
|
| 665 |
+
|
| 666 |
+
# --- Score eligible tasks -------------------------------------------
|
| 667 |
+
ceiling = state.remaining_ceiling
|
| 668 |
+
utilisation_limit = ceiling * self.strategy.max_utilization()
|
| 669 |
+
|
| 670 |
+
scored: list = []
|
| 671 |
+
for task in available_tasks:
|
| 672 |
+
# Tier eligibility
|
| 673 |
+
if task.tier.value > state.effective_tier.value:
|
| 674 |
+
continue
|
| 675 |
+
# Budget eligibility (approximate — exact check in economy)
|
| 676 |
+
if task.penalty > utilisation_limit:
|
| 677 |
+
continue
|
| 678 |
+
if not accounting.can_afford(task.penalty, token_cost=0.01):
|
| 679 |
+
continue
|
| 680 |
+
pp = perception.estimated_pass_prob(task)
|
| 681 |
+
sc = self.score_task(task, state, pp)
|
| 682 |
+
scored.append((task, sc))
|
| 683 |
+
|
| 684 |
+
if not scored:
|
| 685 |
+
return None
|
| 686 |
+
|
| 687 |
+
# --- Strategy ranking -----------------------------------------------
|
| 688 |
+
ranked_scores = self.strategy.rank_contracts(
|
| 689 |
+
[sc for _, sc in scored], state
|
| 690 |
+
)
|
| 691 |
+
if not ranked_scores:
|
| 692 |
+
return None
|
| 693 |
+
|
| 694 |
+
# To avoid repetition, pick randomly from top N (e.g., top 3)
|
| 695 |
+
top_n = ranked_scores[:3]
|
| 696 |
+
selected_sc = random.choice(top_n)
|
| 697 |
+
top_id = selected_sc.task_id
|
| 698 |
+
for task, sc in scored:
|
| 699 |
+
if task.task_id == top_id:
|
| 700 |
+
if sc.risk_adjusted_ev > 0 or state.effective_tier == Tier.T0:
|
| 701 |
+
return task
|
| 702 |
+
return None
|
| 703 |
+
|
| 704 |
+
def investment_decision(self, state: AgentState) -> Optional[RobustnessInvestment]:
|
| 705 |
+
return self.strategy.should_invest_robustness(state)
|
| 706 |
+
|
| 707 |
+
|
| 708 |
+
# ---------------------------------------------------------------------------
|
| 709 |
+
# Autonomous Agent
|
| 710 |
+
# ---------------------------------------------------------------------------
|
| 711 |
+
|
| 712 |
+
class AutonomousAgent:
|
| 713 |
+
"""
|
| 714 |
+
v2 CGAE economic actor.
|
| 715 |
+
|
| 716 |
+
Wraps an LLMAgent and adds:
|
| 717 |
+
- Perception (constraint/domain pass-rate tracking)
|
| 718 |
+
- Accounting (reserves, burn-rate, insolvency prevention)
|
| 719 |
+
- Planning (EV/RAEV task selection, robustness investment decisions)
|
| 720 |
+
- Execution (constraint-aware prompts, self-verification, retry)
|
| 721 |
+
"""
|
| 722 |
+
|
| 723 |
+
def __init__(
|
| 724 |
+
self,
|
| 725 |
+
llm_agent: Any,
|
| 726 |
+
strategy: StrategyInterface,
|
| 727 |
+
token_cost_fn, # (model_name, in_tok, out_tok) -> float
|
| 728 |
+
self_verify: bool = True,
|
| 729 |
+
max_retries: int = 2,
|
| 730 |
+
):
|
| 731 |
+
self.llm = llm_agent
|
| 732 |
+
self.model_name: str = llm_agent.model_name
|
| 733 |
+
self.strategy = strategy
|
| 734 |
+
|
| 735 |
+
self.perception = PerceptionLayer()
|
| 736 |
+
self.accounting: Optional[AccountingLayer] = None # set in register()
|
| 737 |
+
self.execution = ExecutionLayer(llm_agent, self_verify=self_verify,
|
| 738 |
+
max_retries=max_retries)
|
| 739 |
+
self.planning = PlanningLayer(strategy, token_cost_fn)
|
| 740 |
+
self._token_cost_fn = token_cost_fn
|
| 741 |
+
|
| 742 |
+
# Set by economy on registration
|
| 743 |
+
self.agent_id: Optional[str] = None
|
| 744 |
+
|
| 745 |
+
# Metrics
|
| 746 |
+
self.self_check_catches: int = 0 # self-check prevented a failure
|
| 747 |
+
self.retry_successes: int = 0 # retry turned a failure into a pass
|
| 748 |
+
self.strategy_actions: dict = {}
|
| 749 |
+
|
| 750 |
+
def register(self, agent_id: str, initial_balance: float):
|
| 751 |
+
"""Call once after Economy.register_agent() to initialise accounting."""
|
| 752 |
+
self.agent_id = agent_id
|
| 753 |
+
self.accounting = AccountingLayer(initial_balance)
|
| 754 |
+
|
| 755 |
+
def build_state(self, record: Any, gate: GateFunction) -> AgentState:
|
| 756 |
+
"""
|
| 757 |
+
Construct an AgentState from an AgentRecord + gate details.
|
| 758 |
+
Called at the start of every planning cycle.
|
| 759 |
+
"""
|
| 760 |
+
self.accounting.sync_from_record(record)
|
| 761 |
+
|
| 762 |
+
r = record.current_robustness or RobustnessVector(0.3, 0.3, 0.25, 0.5)
|
| 763 |
+
gate_detail = gate.evaluate_with_detail(r)
|
| 764 |
+
tier = gate_detail["tier"]
|
| 765 |
+
ceiling = gate.budget_ceiling(tier)
|
| 766 |
+
|
| 767 |
+
total = record.contracts_completed + record.contracts_failed
|
| 768 |
+
win_rate = record.contracts_completed / max(1, total)
|
| 769 |
+
|
| 770 |
+
return AgentState(
|
| 771 |
+
agent_id=record.agent_id,
|
| 772 |
+
model_name=self.model_name,
|
| 773 |
+
certified_robustness=r,
|
| 774 |
+
effective_robustness=r, # decay applied externally by Economy
|
| 775 |
+
certified_tier=tier,
|
| 776 |
+
effective_tier=tier,
|
| 777 |
+
binding_dimension=gate_detail.get("binding_dimension"),
|
| 778 |
+
gap_to_next_tier={
|
| 779 |
+
"cc": gate_detail.get("gap_to_next_tier") or 0.0
|
| 780 |
+
if gate_detail.get("binding_dimension") == "cc" else 0.0,
|
| 781 |
+
"er": gate_detail.get("gap_to_next_tier") or 0.0
|
| 782 |
+
if gate_detail.get("binding_dimension") == "er" else 0.0,
|
| 783 |
+
"as": gate_detail.get("gap_to_next_tier") or 0.0
|
| 784 |
+
if gate_detail.get("binding_dimension") == "as" else 0.0,
|
| 785 |
+
},
|
| 786 |
+
balance=record.balance,
|
| 787 |
+
available_for_contracts=self.accounting.available_for_contracts,
|
| 788 |
+
active_exposure=self.accounting.active_exposure,
|
| 789 |
+
remaining_ceiling=max(0.0, ceiling - self.accounting.active_exposure),
|
| 790 |
+
burn_rate=self.accounting.burn_rate,
|
| 791 |
+
rounds_until_insolvency=self.accounting.rounds_until_insolvency,
|
| 792 |
+
roi=self.accounting.roi,
|
| 793 |
+
constraint_pass_rates=self.perception.constraint_pass_rates,
|
| 794 |
+
domain_pass_rates=self.perception.domain_pass_rates,
|
| 795 |
+
total_contracts_completed=record.contracts_completed,
|
| 796 |
+
total_contracts_failed=record.contracts_failed,
|
| 797 |
+
win_rate=win_rate,
|
| 798 |
+
time_since_certification=0.0, # computed externally if needed
|
| 799 |
+
spot_audit_probability=0.0,
|
| 800 |
+
)
|
| 801 |
+
|
| 802 |
+
def plan_task(
|
| 803 |
+
self,
|
| 804 |
+
available_tasks: list,
|
| 805 |
+
state: AgentState,
|
| 806 |
+
) -> Optional[Any]:
|
| 807 |
+
"""
|
| 808 |
+
Select the best task to attempt this round.
|
| 809 |
+
Returns None if nothing worthwhile or reserves too low.
|
| 810 |
+
"""
|
| 811 |
+
task = self.planning.select_task(
|
| 812 |
+
available_tasks, state, self.perception, self.accounting
|
| 813 |
+
)
|
| 814 |
+
action = "bid" if task else "idle"
|
| 815 |
+
self.strategy_actions[action] = self.strategy_actions.get(action, 0) + 1
|
| 816 |
+
return task
|
| 817 |
+
|
| 818 |
+
def execute_task(self, task: Any) -> ExecutionResult:
|
| 819 |
+
"""Execute a task with self-verification and retry."""
|
| 820 |
+
result = self.execution.execute(task, self._token_cost_fn)
|
| 821 |
+
|
| 822 |
+
# Track self-check performance
|
| 823 |
+
if not result.self_check_passed and result.retries_used > 0:
|
| 824 |
+
self.retry_successes += 1
|
| 825 |
+
if result.self_check_failures:
|
| 826 |
+
self.self_check_catches += 1
|
| 827 |
+
|
| 828 |
+
return result
|
| 829 |
+
|
| 830 |
+
def investment_decision(self, state: AgentState) -> Optional[RobustnessInvestment]:
|
| 831 |
+
"""Return a robustness investment if the strategy calls for it."""
|
| 832 |
+
inv = self.planning.investment_decision(state)
|
| 833 |
+
if inv:
|
| 834 |
+
self.strategy_actions["invest"] = self.strategy_actions.get("invest", 0) + 1
|
| 835 |
+
return inv
|
| 836 |
+
|
| 837 |
+
def update_state(self, task: Any, verification: Any, token_cost: float):
|
| 838 |
+
"""Update perception and accounting after a contract settles."""
|
| 839 |
+
self.perception.update_from_result(task, verification)
|
| 840 |
+
self.accounting.record_round_cost(token_cost)
|
| 841 |
+
|
| 842 |
+
def metrics_summary(self) -> dict:
|
| 843 |
+
return {
|
| 844 |
+
"model_name": self.model_name,
|
| 845 |
+
"strategy": type(self.strategy).__name__,
|
| 846 |
+
"self_check_catches": self.self_check_catches,
|
| 847 |
+
"retry_successes": self.retry_successes,
|
| 848 |
+
"self_check_catch_rate": (
|
| 849 |
+
self.self_check_catches
|
| 850 |
+
/ max(1, self.self_check_catches + self.retry_successes)
|
| 851 |
+
),
|
| 852 |
+
"strategy_actions": self.strategy_actions,
|
| 853 |
+
"constraint_pass_rates": self.perception.constraint_pass_rates,
|
| 854 |
+
"domain_pass_rates": self.perception.domain_pass_rates,
|
| 855 |
+
}
|
| 856 |
+
|
| 857 |
+
|
| 858 |
+
# ---------------------------------------------------------------------------
|
| 859 |
+
# Factory
|
| 860 |
+
# ---------------------------------------------------------------------------
|
| 861 |
+
|
| 862 |
+
def create_autonomous_agent(
|
| 863 |
+
llm_agent: Any,
|
| 864 |
+
strategy_name: str,
|
| 865 |
+
token_cost_fn,
|
| 866 |
+
self_verify: bool = True,
|
| 867 |
+
max_retries: int = 2,
|
| 868 |
+
) -> AutonomousAgent:
|
| 869 |
+
"""
|
| 870 |
+
Instantiate an AutonomousAgent with a named strategy.
|
| 871 |
+
|
| 872 |
+
strategy_name: "growth" | "conservative" | "opportunistic"
|
| 873 |
+
| "specialist" | "adversarial"
|
| 874 |
+
"""
|
| 875 |
+
strategy = STRATEGY_MAP.get(strategy_name)
|
| 876 |
+
if strategy is None:
|
| 877 |
+
raise ValueError(
|
| 878 |
+
f"Unknown strategy '{strategy_name}'. "
|
| 879 |
+
f"Choose from: {list(STRATEGY_MAP)}"
|
| 880 |
+
)
|
| 881 |
+
return AutonomousAgent(
|
| 882 |
+
llm_agent=llm_agent,
|
| 883 |
+
strategy=strategy,
|
| 884 |
+
token_cost_fn=token_cost_fn,
|
| 885 |
+
self_verify=self_verify,
|
| 886 |
+
max_retries=max_retries,
|
| 887 |
+
)
|
cgae_engine/economy.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
| 1 |
"""
|
| 2 |
-
CGAE Economy
|
| 3 |
|
| 4 |
-
Ties together registry, gate, contracts, temporal dynamics
|
| 5 |
-
a single coherent economic system.
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
from __future__ import annotations
|
|
@@ -15,7 +16,7 @@ from pathlib import Path
|
|
| 15 |
from typing import Any, Optional
|
| 16 |
|
| 17 |
from cgae_engine.gate import GateFunction, RobustnessVector, Tier, TierThresholds
|
| 18 |
-
from cgae_engine.temporal import TemporalDecay, StochasticAuditor
|
| 19 |
from cgae_engine.registry import AgentRegistry, AgentRecord, AgentStatus
|
| 20 |
from cgae_engine.contracts import ContractManager, CGAEContract, ContractStatus, Constraint
|
| 21 |
|
|
@@ -25,12 +26,23 @@ logger = logging.getLogger(__name__)
|
|
| 25 |
@dataclass
|
| 26 |
class EconomyConfig:
|
| 27 |
"""Configuration for the CGAE economy."""
|
|
|
|
| 28 |
thresholds: TierThresholds = field(default_factory=TierThresholds)
|
|
|
|
| 29 |
decay_rate: float = 0.01
|
|
|
|
|
|
|
|
|
|
| 30 |
ih_threshold: float = 0.45
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
test_eth_top_up_threshold: Optional[float] = 0.05
|
| 35 |
test_eth_top_up_amount: float = 0.5
|
| 36 |
|
|
@@ -56,16 +68,17 @@ class Economy:
|
|
| 56 |
"""
|
| 57 |
The CGAE Economy runtime.
|
| 58 |
|
| 59 |
-
Orchestrates:
|
| 60 |
1. Agent registration and initial audit
|
| 61 |
2. Contract creation and marketplace
|
| 62 |
3. Contract assignment (tier-gated)
|
| 63 |
4. Task execution and verification
|
| 64 |
5. Settlement (reward/penalty)
|
| 65 |
6. Temporal decay and stochastic re-auditing
|
|
|
|
| 66 |
"""
|
| 67 |
|
| 68 |
-
def __init__(self, config: Optional[EconomyConfig] = None):
|
| 69 |
self.config = config or EconomyConfig()
|
| 70 |
self.gate = GateFunction(
|
| 71 |
thresholds=self.config.thresholds,
|
|
@@ -76,13 +89,17 @@ class Economy:
|
|
| 76 |
self.decay = TemporalDecay(decay_rate=self.config.decay_rate)
|
| 77 |
self.auditor = StochasticAuditor()
|
| 78 |
|
|
|
|
|
|
|
|
|
|
| 79 |
self.current_time: float = 0.0
|
| 80 |
self._snapshots: list[EconomySnapshot] = []
|
| 81 |
self._events: list[dict] = []
|
|
|
|
| 82 |
self.total_test_eth_topups: float = 0.0
|
| 83 |
|
| 84 |
def _effective_robustness(self, record: AgentRecord) -> Optional[RobustnessVector]:
|
| 85 |
-
"""Return temporally-decayed robustness for an agent."""
|
| 86 |
cert = record.current_certification
|
| 87 |
if cert is None or record.current_robustness is None:
|
| 88 |
return None
|
|
@@ -96,17 +113,190 @@ class Economy:
|
|
| 96 |
)
|
| 97 |
|
| 98 |
def _maybe_top_up_agent(self, agent: AgentRecord) -> Optional[dict]:
|
| 99 |
-
"""Top up an agent's balance if it drops below threshold."""
|
| 100 |
if not self._should_top_up_agents():
|
| 101 |
return None
|
|
|
|
| 102 |
threshold = self.config.test_eth_top_up_threshold
|
|
|
|
| 103 |
if threshold is None or agent.balance >= threshold:
|
| 104 |
return None
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
| 106 |
agent.balance += top_up_amount
|
| 107 |
agent.total_topups += top_up_amount
|
| 108 |
self.total_test_eth_topups += top_up_amount
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
# ------------------------------------------------------------------
|
| 112 |
# Agent lifecycle
|
|
@@ -118,7 +308,7 @@ class Economy:
|
|
| 118 |
model_config: dict,
|
| 119 |
provenance: Optional[dict] = None,
|
| 120 |
) -> AgentRecord:
|
| 121 |
-
"""Register a new agent with seed capital."""
|
| 122 |
record = self.registry.register(
|
| 123 |
model_name=model_name,
|
| 124 |
model_config=model_config,
|
|
@@ -126,7 +316,24 @@ class Economy:
|
|
| 126 |
initial_balance=self.config.initial_balance,
|
| 127 |
timestamp=self.current_time,
|
| 128 |
)
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
return record
|
| 131 |
|
| 132 |
def audit_agent(
|
|
@@ -134,31 +341,64 @@ class Economy:
|
|
| 134 |
agent_id: str,
|
| 135 |
robustness: RobustnessVector,
|
| 136 |
audit_type: str = "registration",
|
|
|
|
| 137 |
audit_details: Optional[dict] = None,
|
| 138 |
) -> dict:
|
| 139 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 140 |
record = self.registry.get_agent(agent_id)
|
| 141 |
if record is None:
|
| 142 |
raise KeyError(f"Agent {agent_id} not found")
|
| 143 |
|
|
|
|
| 144 |
total_audit_cost = self.config.audit_cost * 4
|
| 145 |
record.balance -= total_audit_cost
|
| 146 |
record.total_spent += total_audit_cost
|
| 147 |
|
|
|
|
| 148 |
cert = self.registry.certify(
|
| 149 |
agent_id=agent_id,
|
| 150 |
robustness=robustness,
|
| 151 |
audit_type=audit_type,
|
| 152 |
timestamp=self.current_time,
|
| 153 |
audit_details=audit_details,
|
|
|
|
| 154 |
)
|
| 155 |
|
| 156 |
detail = self.gate.evaluate_with_detail(robustness)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
self._log("agent_audited", {
|
| 158 |
"agent_id": agent_id,
|
| 159 |
"tier": cert.tier.name,
|
| 160 |
"audit_type": audit_type,
|
| 161 |
"cost": total_audit_cost,
|
|
|
|
| 162 |
**detail,
|
| 163 |
})
|
| 164 |
return detail
|
|
@@ -194,13 +434,43 @@ class Economy:
|
|
| 194 |
)
|
| 195 |
|
| 196 |
def accept_contract(self, contract_id: str, agent_id: str) -> bool:
|
| 197 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
record = self.registry.get_agent(agent_id)
|
| 199 |
if record is None or record.status != AgentStatus.ACTIVE:
|
| 200 |
return False
|
|
|
|
| 201 |
if record.current_certification is None:
|
| 202 |
return False
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
dt = self.current_time - record.current_certification.timestamp
|
| 205 |
r_eff = self.decay.effective_robustness(record.current_robustness, dt)
|
| 206 |
effective_tier = self.gate.evaluate(r_eff)
|
|
@@ -216,34 +486,58 @@ class Economy:
|
|
| 216 |
self,
|
| 217 |
contract_id: str,
|
| 218 |
output: Any,
|
|
|
|
|
|
|
| 219 |
) -> dict:
|
| 220 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
passed, failures = self.contracts.submit_output(
|
| 222 |
contract_id=contract_id,
|
| 223 |
output=output,
|
| 224 |
timestamp=self.current_time,
|
| 225 |
)
|
| 226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
settlement = self.contracts.settle_contract(
|
| 228 |
contract_id=contract_id,
|
| 229 |
timestamp=self.current_time,
|
| 230 |
)
|
| 231 |
|
|
|
|
| 232 |
agent_id = settlement["agent_id"]
|
| 233 |
performer = self.registry.get_agent(agent_id)
|
|
|
|
| 234 |
|
| 235 |
if settlement["outcome"] == "success":
|
| 236 |
if performer:
|
| 237 |
performer.balance += settlement["reward"]
|
| 238 |
performer.total_earned += settlement["reward"]
|
| 239 |
performer.contracts_completed += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
else:
|
| 241 |
-
if
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
|
| 246 |
settlement["failures"] = failures
|
|
|
|
| 247 |
self._log("contract_settled", settlement)
|
| 248 |
return settlement
|
| 249 |
|
|
@@ -254,7 +548,16 @@ class Economy:
|
|
| 254 |
def step(self, audit_callback=None) -> dict:
|
| 255 |
"""
|
| 256 |
Advance the economy by one time step.
|
| 257 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
"""
|
| 259 |
self.current_time += 1.0
|
| 260 |
step_events = {
|
|
@@ -267,50 +570,77 @@ class Economy:
|
|
| 267 |
"test_eth_topups": [],
|
| 268 |
}
|
| 269 |
|
|
|
|
| 270 |
for agent in self.registry.active_agents:
|
| 271 |
cert = agent.current_certification
|
| 272 |
if cert is None:
|
| 273 |
continue
|
| 274 |
|
| 275 |
-
# Temporal decay: has effective tier dropped?
|
| 276 |
dt = self.current_time - cert.timestamp
|
| 277 |
r_eff = self.decay.effective_robustness(cert.robustness, dt)
|
| 278 |
effective_tier = self.gate.evaluate(r_eff)
|
| 279 |
|
| 280 |
if effective_tier < agent.current_tier:
|
| 281 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
step_events["agents_expired"].append(agent.agent_id)
|
| 283 |
|
| 284 |
# Stochastic spot-audit
|
| 285 |
time_since_audit = self.current_time - agent.last_audit_time
|
| 286 |
if self.auditor.should_audit(agent.current_tier, time_since_audit):
|
| 287 |
step_events["audits_triggered"].append(agent.agent_id)
|
| 288 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
new_tier = self.gate.evaluate(new_r)
|
| 290 |
if new_tier < agent.current_tier:
|
| 291 |
-
self.registry.demote(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
step_events["agents_demoted"].append(agent.agent_id)
|
| 293 |
else:
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
agent.balance -= self.config.storage_cost_per_step
|
| 300 |
agent.total_spent += self.config.storage_cost_per_step
|
| 301 |
step_events["storage_costs"] += self.config.storage_cost_per_step
|
| 302 |
|
| 303 |
-
# Top-up if needed
|
| 304 |
topup = self._maybe_top_up_agent(agent)
|
| 305 |
if topup:
|
| 306 |
step_events["test_eth_topups"].append(topup)
|
| 307 |
|
| 308 |
-
#
|
| 309 |
if agent.balance <= 0:
|
| 310 |
agent.status = AgentStatus.SUSPENDED
|
| 311 |
-
self._log("agent_insolvent", {
|
| 312 |
-
|
| 313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
if self._should_top_up_agents():
|
| 315 |
for agent in self.registry.agents.values():
|
| 316 |
if agent.status != AgentStatus.SUSPENDED:
|
|
@@ -319,15 +649,55 @@ class Economy:
|
|
| 319 |
if topup and agent.balance > 0:
|
| 320 |
agent.status = AgentStatus.ACTIVE
|
| 321 |
step_events["test_eth_topups"].append(topup)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
|
| 323 |
-
#
|
| 324 |
-
|
|
|
|
| 325 |
|
| 326 |
-
# Take snapshot
|
| 327 |
-
self._snapshots.append(self._take_snapshot())
|
| 328 |
self._log("step", step_events)
|
| 329 |
return step_events
|
| 330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 331 |
# ------------------------------------------------------------------
|
| 332 |
# Observability
|
| 333 |
# ------------------------------------------------------------------
|
|
@@ -336,6 +706,7 @@ class Economy:
|
|
| 336 |
tier_dist = self.registry.tier_distribution()
|
| 337 |
econ = self.contracts.economics_summary()
|
| 338 |
agents = self.registry.active_agents
|
|
|
|
| 339 |
return EconomySnapshot(
|
| 340 |
timestamp=self.current_time,
|
| 341 |
num_agents=len(agents),
|
|
@@ -360,45 +731,30 @@ class Economy:
|
|
| 360 |
return list(self._events)
|
| 361 |
|
| 362 |
def export_state(self, path: str):
|
| 363 |
-
"""Export full economy state to JSON."""
|
| 364 |
state = {
|
| 365 |
"timestamp": self.current_time,
|
| 366 |
"config": {
|
| 367 |
"decay_rate": self.config.decay_rate,
|
| 368 |
"ih_threshold": self.config.ih_threshold,
|
| 369 |
"initial_balance": self.config.initial_balance,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
},
|
| 371 |
-
"agents": {aid: a.to_dict() for aid, a in self.registry.agents.items()},
|
| 372 |
"contracts": self.contracts.economics_summary(),
|
| 373 |
"aggregate_safety": self.aggregate_safety(),
|
| 374 |
"total_test_eth_topups": self.total_test_eth_topups,
|
|
|
|
|
|
|
| 375 |
}
|
| 376 |
Path(path).write_text(json.dumps(state, indent=2, default=str))
|
| 377 |
|
| 378 |
-
def aggregate_safety(self) -> float:
|
| 379 |
-
"""Compute aggregate safety S(P) (Definition 9)."""
|
| 380 |
-
total_exposure = 0.0
|
| 381 |
-
weighted_risk = 0.0
|
| 382 |
-
|
| 383 |
-
for agent in self.registry.active_agents:
|
| 384 |
-
cert = agent.current_certification
|
| 385 |
-
if cert is None:
|
| 386 |
-
continue
|
| 387 |
-
dt = self.current_time - cert.timestamp
|
| 388 |
-
r_eff = self.decay.effective_robustness(cert.robustness, dt)
|
| 389 |
-
exposure = self.contracts.agent_exposure(agent.agent_id)
|
| 390 |
-
if exposure <= 0:
|
| 391 |
-
tier = self.gate.evaluate(r_eff)
|
| 392 |
-
exposure = self.gate.budget_ceiling(tier)
|
| 393 |
-
|
| 394 |
-
r_bar = r_eff.weakest
|
| 395 |
-
total_exposure += exposure
|
| 396 |
-
weighted_risk += exposure * (1.0 - r_bar)
|
| 397 |
-
|
| 398 |
-
if total_exposure == 0:
|
| 399 |
-
return 1.0
|
| 400 |
-
return 1.0 - (weighted_risk / total_exposure)
|
| 401 |
-
|
| 402 |
def _log(self, event_type: str, data: dict):
|
| 403 |
self._events.append({
|
| 404 |
"type": event_type,
|
|
|
|
| 1 |
"""
|
| 2 |
+
CGAE Economy - The top-level coordinator.
|
| 3 |
|
| 4 |
+
Ties together registry, gate, contracts, temporal dynamics, and auditing
|
| 5 |
+
into a single coherent economic system. This is the main entry point for
|
| 6 |
+
running the agent economy.
|
| 7 |
"""
|
| 8 |
|
| 9 |
from __future__ import annotations
|
|
|
|
| 16 |
from typing import Any, Optional
|
| 17 |
|
| 18 |
from cgae_engine.gate import GateFunction, RobustnessVector, Tier, TierThresholds
|
| 19 |
+
from cgae_engine.temporal import TemporalDecay, StochasticAuditor, AuditEvent
|
| 20 |
from cgae_engine.registry import AgentRegistry, AgentRecord, AgentStatus
|
| 21 |
from cgae_engine.contracts import ContractManager, CGAEContract, ContractStatus, Constraint
|
| 22 |
|
|
|
|
| 26 |
@dataclass
|
| 27 |
class EconomyConfig:
|
| 28 |
"""Configuration for the CGAE economy."""
|
| 29 |
+
# Tier thresholds
|
| 30 |
thresholds: TierThresholds = field(default_factory=TierThresholds)
|
| 31 |
+
# Temporal decay rate (lambda)
|
| 32 |
decay_rate: float = 0.01
|
| 33 |
+
# IHT threshold for mandatory re-audit.
|
| 34 |
+
# Empirical default ih scores from DEFAULT_ROBUSTNESS land ~0.499;
|
| 35 |
+
# keeping this at 0.5 suspends every agent that hasn't run a live audit.
|
| 36 |
ih_threshold: float = 0.45
|
| 37 |
+
# Initial balance for new agents (seed capital)
|
| 38 |
+
initial_balance: float = 0.1 # ETH
|
| 39 |
+
# Audit cost per dimension
|
| 40 |
+
audit_cost: float = 0.005 # ETH per audit dimension
|
| 41 |
+
# Storage cost per time step (FOC)
|
| 42 |
+
storage_cost_per_step: float = 0.001 # ETH
|
| 43 |
+
# Controls for automatically minting test ETH when balances drop low.
|
| 44 |
+
# Defaults keep the economy running continuously: top up any agent below
|
| 45 |
+
# 5% of the default seed capital and restore them to half seed capital.
|
| 46 |
test_eth_top_up_threshold: Optional[float] = 0.05
|
| 47 |
test_eth_top_up_amount: float = 0.5
|
| 48 |
|
|
|
|
| 68 |
"""
|
| 69 |
The CGAE Economy runtime.
|
| 70 |
|
| 71 |
+
Orchestrates the full economic loop:
|
| 72 |
1. Agent registration and initial audit
|
| 73 |
2. Contract creation and marketplace
|
| 74 |
3. Contract assignment (tier-gated)
|
| 75 |
4. Task execution and verification
|
| 76 |
5. Settlement (reward/penalty)
|
| 77 |
6. Temporal decay and stochastic re-auditing
|
| 78 |
+
7. Economic accounting and observability
|
| 79 |
"""
|
| 80 |
|
| 81 |
+
def __init__(self, config: Optional[EconomyConfig] = None, wallet_manager=None, onchain_bridge=None, ens_manager=None):
|
| 82 |
self.config = config or EconomyConfig()
|
| 83 |
self.gate = GateFunction(
|
| 84 |
thresholds=self.config.thresholds,
|
|
|
|
| 89 |
self.decay = TemporalDecay(decay_rate=self.config.decay_rate)
|
| 90 |
self.auditor = StochasticAuditor()
|
| 91 |
|
| 92 |
+
self.wallet_manager = wallet_manager # Optional: real ETH wallet integration
|
| 93 |
+
self.onchain_bridge = onchain_bridge # Optional: write certs to CGAERegistry on-chain
|
| 94 |
+
self.ens_manager = ens_manager # Optional: ENS identity for agents
|
| 95 |
self.current_time: float = 0.0
|
| 96 |
self._snapshots: list[EconomySnapshot] = []
|
| 97 |
self._events: list[dict] = []
|
| 98 |
+
self._delegations: dict[str, dict] = {}
|
| 99 |
self.total_test_eth_topups: float = 0.0
|
| 100 |
|
| 101 |
def _effective_robustness(self, record: AgentRecord) -> Optional[RobustnessVector]:
|
| 102 |
+
"""Return temporally-decayed robustness for an agent record."""
|
| 103 |
cert = record.current_certification
|
| 104 |
if cert is None or record.current_robustness is None:
|
| 105 |
return None
|
|
|
|
| 113 |
)
|
| 114 |
|
| 115 |
def _maybe_top_up_agent(self, agent: AgentRecord) -> Optional[dict]:
|
|
|
|
| 116 |
if not self._should_top_up_agents():
|
| 117 |
return None
|
| 118 |
+
|
| 119 |
threshold = self.config.test_eth_top_up_threshold
|
| 120 |
+
amount = self.config.test_eth_top_up_amount
|
| 121 |
if threshold is None or agent.balance >= threshold:
|
| 122 |
return None
|
| 123 |
+
|
| 124 |
+
needed = max(0.0, threshold - agent.balance)
|
| 125 |
+
top_up_amount = max(amount, needed)
|
| 126 |
+
|
| 127 |
agent.balance += top_up_amount
|
| 128 |
agent.total_topups += top_up_amount
|
| 129 |
self.total_test_eth_topups += top_up_amount
|
| 130 |
+
|
| 131 |
+
entry = {
|
| 132 |
+
"agent_id": agent.agent_id,
|
| 133 |
+
"amount": top_up_amount,
|
| 134 |
+
"balance": agent.balance,
|
| 135 |
+
}
|
| 136 |
+
self._log("test_eth_topup", entry)
|
| 137 |
+
return entry
|
| 138 |
+
|
| 139 |
+
def request_tier_upgrade(
|
| 140 |
+
self,
|
| 141 |
+
agent_id: str,
|
| 142 |
+
requested_tier: Tier,
|
| 143 |
+
audit_callback=None,
|
| 144 |
+
) -> dict:
|
| 145 |
+
"""
|
| 146 |
+
Execute the paper's scaling-gate upgrade flow for a requested tier.
|
| 147 |
+
|
| 148 |
+
1) Evaluate effective robustness under temporal decay.
|
| 149 |
+
2) If already sufficient, grant immediately.
|
| 150 |
+
3) Otherwise run a tier-calibrated audit callback and re-evaluate.
|
| 151 |
+
"""
|
| 152 |
+
record = self.registry.get_agent(agent_id)
|
| 153 |
+
if record is None:
|
| 154 |
+
return {"granted": False, "reason": "agent_not_found", "requested_tier": requested_tier.name}
|
| 155 |
+
if record.status != AgentStatus.ACTIVE or record.current_certification is None:
|
| 156 |
+
return {"granted": False, "reason": "agent_not_active", "requested_tier": requested_tier.name}
|
| 157 |
+
|
| 158 |
+
r_eff = self._effective_robustness(record)
|
| 159 |
+
if r_eff is None:
|
| 160 |
+
return {"granted": False, "reason": "no_certification", "requested_tier": requested_tier.name}
|
| 161 |
+
|
| 162 |
+
effective_tier = self.gate.evaluate(r_eff)
|
| 163 |
+
if effective_tier >= requested_tier:
|
| 164 |
+
return {
|
| 165 |
+
"granted": True,
|
| 166 |
+
"path": "effective_robustness",
|
| 167 |
+
"requested_tier": requested_tier.name,
|
| 168 |
+
"effective_tier": effective_tier.name,
|
| 169 |
+
"detail": self.gate.evaluate_with_detail(r_eff),
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
if audit_callback is None:
|
| 173 |
+
return {
|
| 174 |
+
"granted": False,
|
| 175 |
+
"reason": "audit_required",
|
| 176 |
+
"requested_tier": requested_tier.name,
|
| 177 |
+
"effective_tier": effective_tier.name,
|
| 178 |
+
"detail": self.gate.evaluate_with_detail(r_eff),
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
try:
|
| 182 |
+
new_r = audit_callback(agent_id, requested_tier)
|
| 183 |
+
except TypeError:
|
| 184 |
+
new_r = audit_callback(agent_id)
|
| 185 |
+
if new_r is None:
|
| 186 |
+
return {
|
| 187 |
+
"granted": False,
|
| 188 |
+
"reason": "audit_unavailable",
|
| 189 |
+
"requested_tier": requested_tier.name,
|
| 190 |
+
"effective_tier": effective_tier.name,
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
new_tier = self.gate.evaluate(new_r)
|
| 194 |
+
detail = self.gate.evaluate_with_detail(new_r)
|
| 195 |
+
if new_tier >= requested_tier:
|
| 196 |
+
self.registry.certify(
|
| 197 |
+
agent_id,
|
| 198 |
+
new_r,
|
| 199 |
+
audit_type="upgrade",
|
| 200 |
+
timestamp=self.current_time,
|
| 201 |
+
audit_details={"requested_tier": requested_tier.name},
|
| 202 |
+
)
|
| 203 |
+
self._log("tier_upgrade_granted", {
|
| 204 |
+
"agent_id": agent_id,
|
| 205 |
+
"requested_tier": requested_tier.name,
|
| 206 |
+
"new_tier": new_tier.name,
|
| 207 |
+
})
|
| 208 |
+
return {
|
| 209 |
+
"granted": True,
|
| 210 |
+
"path": "upgrade_audit",
|
| 211 |
+
"requested_tier": requested_tier.name,
|
| 212 |
+
"effective_tier": effective_tier.name,
|
| 213 |
+
"new_tier": new_tier.name,
|
| 214 |
+
"detail": detail,
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
idx = requested_tier.value
|
| 218 |
+
gaps = {
|
| 219 |
+
"cc": max(0.0, self.gate.thresholds.cc[idx] - new_r.cc),
|
| 220 |
+
"er": max(0.0, self.gate.thresholds.er[idx] - new_r.er),
|
| 221 |
+
"as": max(0.0, self.gate.thresholds.as_[idx] - new_r.as_),
|
| 222 |
+
}
|
| 223 |
+
self._log("tier_upgrade_denied", {
|
| 224 |
+
"agent_id": agent_id,
|
| 225 |
+
"requested_tier": requested_tier.name,
|
| 226 |
+
"new_tier": new_tier.name,
|
| 227 |
+
"gaps": gaps,
|
| 228 |
+
})
|
| 229 |
+
return {
|
| 230 |
+
"granted": False,
|
| 231 |
+
"reason": "audit_failed",
|
| 232 |
+
"requested_tier": requested_tier.name,
|
| 233 |
+
"effective_tier": effective_tier.name,
|
| 234 |
+
"new_tier": new_tier.name,
|
| 235 |
+
"detail": detail,
|
| 236 |
+
"gaps": gaps,
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
def can_delegate(self, principal_id: str, delegate_id: str, required_tier: Tier) -> dict:
|
| 240 |
+
"""
|
| 241 |
+
Enforce delegation constraints:
|
| 242 |
+
- principal and delegate must both satisfy required tier independently
|
| 243 |
+
- chain-level tier = min(f(principal), f(delegate)) must satisfy required tier
|
| 244 |
+
"""
|
| 245 |
+
principal = self.registry.get_agent(principal_id)
|
| 246 |
+
delegate = self.registry.get_agent(delegate_id)
|
| 247 |
+
if principal is None or delegate is None:
|
| 248 |
+
return {"allowed": False, "reason": "unknown_agent"}
|
| 249 |
+
if principal.status != AgentStatus.ACTIVE or delegate.status != AgentStatus.ACTIVE:
|
| 250 |
+
return {"allowed": False, "reason": "inactive_agent"}
|
| 251 |
+
|
| 252 |
+
p_eff = self._effective_robustness(principal)
|
| 253 |
+
d_eff = self._effective_robustness(delegate)
|
| 254 |
+
if p_eff is None or d_eff is None:
|
| 255 |
+
return {"allowed": False, "reason": "missing_certification"}
|
| 256 |
+
|
| 257 |
+
p_tier = self.gate.evaluate(p_eff)
|
| 258 |
+
d_tier = self.gate.evaluate(d_eff)
|
| 259 |
+
chain_tier = self.gate.chain_tier([p_eff, d_eff])
|
| 260 |
+
allowed = p_tier >= required_tier and d_tier >= required_tier and chain_tier >= required_tier
|
| 261 |
+
reason = "ok" if allowed else "chain_tier_insufficient"
|
| 262 |
+
return {
|
| 263 |
+
"allowed": allowed,
|
| 264 |
+
"reason": reason,
|
| 265 |
+
"principal_tier": p_tier.name,
|
| 266 |
+
"delegate_tier": d_tier.name,
|
| 267 |
+
"chain_tier": chain_tier.name,
|
| 268 |
+
"required_tier": required_tier.name,
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
def record_delegation(
|
| 272 |
+
self,
|
| 273 |
+
contract_id: str,
|
| 274 |
+
principal_id: str,
|
| 275 |
+
delegate_id: str,
|
| 276 |
+
required_tier: Tier,
|
| 277 |
+
allowed: bool,
|
| 278 |
+
reason: str,
|
| 279 |
+
):
|
| 280 |
+
"""Persist delegation audit trail for contract-level forensics."""
|
| 281 |
+
self._delegations[contract_id] = {
|
| 282 |
+
"principal_id": principal_id,
|
| 283 |
+
"delegate_id": delegate_id,
|
| 284 |
+
"required_tier": required_tier.name,
|
| 285 |
+
"allowed": allowed,
|
| 286 |
+
"reason": reason,
|
| 287 |
+
"timestamp": self.current_time,
|
| 288 |
+
}
|
| 289 |
+
self._log("delegation_recorded", {
|
| 290 |
+
"contract_id": contract_id,
|
| 291 |
+
"principal_id": principal_id,
|
| 292 |
+
"delegate_id": delegate_id,
|
| 293 |
+
"required_tier": required_tier.name,
|
| 294 |
+
"allowed": allowed,
|
| 295 |
+
"reason": reason,
|
| 296 |
+
})
|
| 297 |
+
|
| 298 |
+
def get_delegation(self, contract_id: str) -> Optional[dict]:
|
| 299 |
+
return self._delegations.get(contract_id)
|
| 300 |
|
| 301 |
# ------------------------------------------------------------------
|
| 302 |
# Agent lifecycle
|
|
|
|
| 308 |
model_config: dict,
|
| 309 |
provenance: Optional[dict] = None,
|
| 310 |
) -> AgentRecord:
|
| 311 |
+
"""Register a new agent with seed capital and an ETH wallet."""
|
| 312 |
record = self.registry.register(
|
| 313 |
model_name=model_name,
|
| 314 |
model_config=model_config,
|
|
|
|
| 316 |
initial_balance=self.config.initial_balance,
|
| 317 |
timestamp=self.current_time,
|
| 318 |
)
|
| 319 |
+
# Create an ETH wallet for this agent if wallet manager is available
|
| 320 |
+
wallet_address = None
|
| 321 |
+
if self.wallet_manager:
|
| 322 |
+
wallet = self.wallet_manager.create_agent_wallet(record.agent_id)
|
| 323 |
+
wallet_address = wallet.address
|
| 324 |
+
record.wallet_address = wallet_address
|
| 325 |
+
|
| 326 |
+
# Register ENS subname for agent identity
|
| 327 |
+
ens_name = None
|
| 328 |
+
if self.ens_manager and wallet_address:
|
| 329 |
+
ens_name = self.ens_manager.create_subname(
|
| 330 |
+
record.agent_id, model_name, wallet_address
|
| 331 |
+
)
|
| 332 |
+
|
| 333 |
+
self._log("agent_registered", {
|
| 334 |
+
"agent_id": record.agent_id, "model": model_name,
|
| 335 |
+
"wallet_address": wallet_address, "ens_name": ens_name,
|
| 336 |
+
})
|
| 337 |
return record
|
| 338 |
|
| 339 |
def audit_agent(
|
|
|
|
| 341 |
agent_id: str,
|
| 342 |
robustness: RobustnessVector,
|
| 343 |
audit_type: str = "registration",
|
| 344 |
+
observed_architecture_hash: Optional[str] = None,
|
| 345 |
audit_details: Optional[dict] = None,
|
| 346 |
) -> dict:
|
| 347 |
+
"""
|
| 348 |
+
Audit an agent and update their certification.
|
| 349 |
+
Deducts audit cost from agent balance.
|
| 350 |
+
"""
|
| 351 |
record = self.registry.get_agent(agent_id)
|
| 352 |
if record is None:
|
| 353 |
raise KeyError(f"Agent {agent_id} not found")
|
| 354 |
|
| 355 |
+
# Deduct audit cost (3 dimensions + IHT)
|
| 356 |
total_audit_cost = self.config.audit_cost * 4
|
| 357 |
record.balance -= total_audit_cost
|
| 358 |
record.total_spent += total_audit_cost
|
| 359 |
|
| 360 |
+
# Certify with new robustness
|
| 361 |
cert = self.registry.certify(
|
| 362 |
agent_id=agent_id,
|
| 363 |
robustness=robustness,
|
| 364 |
audit_type=audit_type,
|
| 365 |
timestamp=self.current_time,
|
| 366 |
audit_details=audit_details,
|
| 367 |
+
observed_architecture_hash=observed_architecture_hash,
|
| 368 |
)
|
| 369 |
|
| 370 |
detail = self.gate.evaluate_with_detail(robustness)
|
| 371 |
+
|
| 372 |
+
# Write certification on-chain if bridge is available
|
| 373 |
+
onchain_tx = None
|
| 374 |
+
if self.onchain_bridge and record.wallet_address:
|
| 375 |
+
audit_hash = (audit_details or {}).get("storage_root_hash", "")
|
| 376 |
+
onchain_tx = self.onchain_bridge.certify_agent(
|
| 377 |
+
agent_address=record.wallet_address,
|
| 378 |
+
cc=robustness.cc, er=robustness.er,
|
| 379 |
+
as_=robustness.as_, ih=robustness.ih,
|
| 380 |
+
audit_type=audit_type,
|
| 381 |
+
audit_hash=audit_hash or "",
|
| 382 |
+
)
|
| 383 |
+
|
| 384 |
+
# Write robustness credentials to ENS text records
|
| 385 |
+
if self.ens_manager:
|
| 386 |
+
audit_hash = (audit_details or {}).get("storage_root_hash", "")
|
| 387 |
+
self.ens_manager.set_agent_credentials(
|
| 388 |
+
agent_id=agent_id,
|
| 389 |
+
tier=cert.tier.name,
|
| 390 |
+
cc=robustness.cc, er=robustness.er,
|
| 391 |
+
as_=robustness.as_, ih=robustness.ih,
|
| 392 |
+
wallet_address=record.wallet_address or "",
|
| 393 |
+
audit_hash=audit_hash,
|
| 394 |
+
)
|
| 395 |
+
|
| 396 |
self._log("agent_audited", {
|
| 397 |
"agent_id": agent_id,
|
| 398 |
"tier": cert.tier.name,
|
| 399 |
"audit_type": audit_type,
|
| 400 |
"cost": total_audit_cost,
|
| 401 |
+
"onchain_tx": onchain_tx,
|
| 402 |
**detail,
|
| 403 |
})
|
| 404 |
return detail
|
|
|
|
| 434 |
)
|
| 435 |
|
| 436 |
def accept_contract(self, contract_id: str, agent_id: str) -> bool:
|
| 437 |
+
"""
|
| 438 |
+
Agent accepts a contract. Enforces:
|
| 439 |
+
1. Agent tier >= contract min_tier (temporal decay applied)
|
| 440 |
+
2. Budget ceiling not exceeded
|
| 441 |
+
3. ENS identity verification — if ENS is enabled, the agent's
|
| 442 |
+
on-chain ENS tier record must match or exceed the contract's
|
| 443 |
+
minimum tier. Agents without a valid ENS identity are rejected.
|
| 444 |
+
"""
|
| 445 |
record = self.registry.get_agent(agent_id)
|
| 446 |
if record is None or record.status != AgentStatus.ACTIVE:
|
| 447 |
return False
|
| 448 |
+
|
| 449 |
if record.current_certification is None:
|
| 450 |
return False
|
| 451 |
|
| 452 |
+
# ENS-gated verification: resolve tier from ENS text record
|
| 453 |
+
if self.ens_manager:
|
| 454 |
+
ens_name = self.ens_manager.get_agent_name(agent_id)
|
| 455 |
+
if not ens_name:
|
| 456 |
+
logger.warning(f"[ens-gate] {agent_id} has no ENS name — contract rejected")
|
| 457 |
+
return False
|
| 458 |
+
ens_tier_str = self.ens_manager.resolve_text(ens_name, "cgae.tier")
|
| 459 |
+
if not ens_tier_str:
|
| 460 |
+
logger.warning(f"[ens-gate] {ens_name} has no cgae.tier record — contract rejected")
|
| 461 |
+
return False
|
| 462 |
+
# Parse tier from ENS (e.g., "T3" -> Tier.T3)
|
| 463 |
+
try:
|
| 464 |
+
ens_tier = Tier[ens_tier_str]
|
| 465 |
+
except KeyError:
|
| 466 |
+
logger.warning(f"[ens-gate] {ens_name} has invalid tier '{ens_tier_str}' — contract rejected")
|
| 467 |
+
return False
|
| 468 |
+
contract = self.contracts._get_contract(contract_id)
|
| 469 |
+
if ens_tier < contract.min_tier:
|
| 470 |
+
logger.info(f"[ens-gate] {ens_name} ENS tier {ens_tier.name} < required {contract.min_tier.name}")
|
| 471 |
+
return False
|
| 472 |
+
|
| 473 |
+
# Standard tier check with temporal decay
|
| 474 |
dt = self.current_time - record.current_certification.timestamp
|
| 475 |
r_eff = self.decay.effective_robustness(record.current_robustness, dt)
|
| 476 |
effective_tier = self.gate.evaluate(r_eff)
|
|
|
|
| 486 |
self,
|
| 487 |
contract_id: str,
|
| 488 |
output: Any,
|
| 489 |
+
verification_override: Optional[bool] = None,
|
| 490 |
+
liability_agent_id: Optional[str] = None,
|
| 491 |
) -> dict:
|
| 492 |
+
"""
|
| 493 |
+
Submit output for a contract and settle it.
|
| 494 |
+
|
| 495 |
+
If verification_override is provided, it overrides the contract's own
|
| 496 |
+
constraint check. This allows external verification (e.g., jury LLM
|
| 497 |
+
evaluation from TaskVerifier) to drive the settlement outcome.
|
| 498 |
+
"""
|
| 499 |
passed, failures = self.contracts.submit_output(
|
| 500 |
contract_id=contract_id,
|
| 501 |
output=output,
|
| 502 |
timestamp=self.current_time,
|
| 503 |
)
|
| 504 |
|
| 505 |
+
# Allow external verification to override contract-level constraints
|
| 506 |
+
if verification_override is not None:
|
| 507 |
+
contract = self.contracts._get_contract(contract_id)
|
| 508 |
+
contract.verification_result = verification_override
|
| 509 |
+
if not verification_override and not failures:
|
| 510 |
+
failures = ["jury_verification_failed"]
|
| 511 |
+
|
| 512 |
settlement = self.contracts.settle_contract(
|
| 513 |
contract_id=contract_id,
|
| 514 |
timestamp=self.current_time,
|
| 515 |
)
|
| 516 |
|
| 517 |
+
# Update balances/counters. For delegated tasks, principal can bear liability.
|
| 518 |
agent_id = settlement["agent_id"]
|
| 519 |
performer = self.registry.get_agent(agent_id)
|
| 520 |
+
liable = self.registry.get_agent(liability_agent_id) if liability_agent_id else performer
|
| 521 |
|
| 522 |
if settlement["outcome"] == "success":
|
| 523 |
if performer:
|
| 524 |
performer.balance += settlement["reward"]
|
| 525 |
performer.total_earned += settlement["reward"]
|
| 526 |
performer.contracts_completed += 1
|
| 527 |
+
# Disburse real ETH to agent wallet
|
| 528 |
+
if self.wallet_manager:
|
| 529 |
+
tx = self.wallet_manager.disburse_reward(
|
| 530 |
+
agent_id, settlement["reward"], contract_id
|
| 531 |
+
)
|
| 532 |
+
settlement["wallet_tx"] = tx
|
| 533 |
else:
|
| 534 |
+
if liable:
|
| 535 |
+
liable.balance -= settlement["penalty"]
|
| 536 |
+
liable.total_penalties += settlement["penalty"]
|
| 537 |
+
liable.contracts_failed += 1
|
| 538 |
|
| 539 |
settlement["failures"] = failures
|
| 540 |
+
settlement["liable_agent_id"] = liability_agent_id or agent_id
|
| 541 |
self._log("contract_settled", settlement)
|
| 542 |
return settlement
|
| 543 |
|
|
|
|
| 548 |
def step(self, audit_callback=None) -> dict:
|
| 549 |
"""
|
| 550 |
Advance the economy by one time step.
|
| 551 |
+
|
| 552 |
+
- Applies temporal decay
|
| 553 |
+
- Checks for stochastic spot-audits
|
| 554 |
+
- Deducts storage costs (FOC)
|
| 555 |
+
- Expires overdue contracts
|
| 556 |
+
- Takes a snapshot
|
| 557 |
+
|
| 558 |
+
audit_callback: Optional callable(agent_id) -> RobustnessVector
|
| 559 |
+
If provided, called when a spot-audit is triggered.
|
| 560 |
+
If None, spot-audits use decayed robustness (no fresh eval).
|
| 561 |
"""
|
| 562 |
self.current_time += 1.0
|
| 563 |
step_events = {
|
|
|
|
| 570 |
"test_eth_topups": [],
|
| 571 |
}
|
| 572 |
|
| 573 |
+
# 1. Process each active agent
|
| 574 |
for agent in self.registry.active_agents:
|
| 575 |
cert = agent.current_certification
|
| 576 |
if cert is None:
|
| 577 |
continue
|
| 578 |
|
| 579 |
+
# Temporal decay check: has effective tier dropped?
|
| 580 |
dt = self.current_time - cert.timestamp
|
| 581 |
r_eff = self.decay.effective_robustness(cert.robustness, dt)
|
| 582 |
effective_tier = self.gate.evaluate(r_eff)
|
| 583 |
|
| 584 |
if effective_tier < agent.current_tier:
|
| 585 |
+
# Decay caused tier drop — update certification
|
| 586 |
+
self.registry.certify(
|
| 587 |
+
agent.agent_id, r_eff,
|
| 588 |
+
audit_type="decay",
|
| 589 |
+
timestamp=self.current_time,
|
| 590 |
+
)
|
| 591 |
step_events["agents_expired"].append(agent.agent_id)
|
| 592 |
|
| 593 |
# Stochastic spot-audit
|
| 594 |
time_since_audit = self.current_time - agent.last_audit_time
|
| 595 |
if self.auditor.should_audit(agent.current_tier, time_since_audit):
|
| 596 |
step_events["audits_triggered"].append(agent.agent_id)
|
| 597 |
+
|
| 598 |
+
if audit_callback:
|
| 599 |
+
new_r = audit_callback(agent.agent_id)
|
| 600 |
+
else:
|
| 601 |
+
new_r = r_eff # Use decayed robustness as proxy
|
| 602 |
+
|
| 603 |
new_tier = self.gate.evaluate(new_r)
|
| 604 |
if new_tier < agent.current_tier:
|
| 605 |
+
self.registry.demote(
|
| 606 |
+
agent.agent_id, new_r,
|
| 607 |
+
reason="spot_audit",
|
| 608 |
+
timestamp=self.current_time,
|
| 609 |
+
)
|
| 610 |
step_events["agents_demoted"].append(agent.agent_id)
|
| 611 |
else:
|
| 612 |
+
# Re-certify at current level (refreshes timestamp)
|
| 613 |
+
self.registry.certify(
|
| 614 |
+
agent.agent_id, new_r,
|
| 615 |
+
audit_type="spot",
|
| 616 |
+
timestamp=self.current_time,
|
| 617 |
+
)
|
| 618 |
+
|
| 619 |
+
# Charge audit cost
|
| 620 |
+
audit_cost = self.config.audit_cost * 4
|
| 621 |
+
agent.balance -= audit_cost
|
| 622 |
+
agent.total_spent += audit_cost
|
| 623 |
+
|
| 624 |
+
# Storage cost (FOC)
|
| 625 |
agent.balance -= self.config.storage_cost_per_step
|
| 626 |
agent.total_spent += self.config.storage_cost_per_step
|
| 627 |
step_events["storage_costs"] += self.config.storage_cost_per_step
|
| 628 |
|
|
|
|
| 629 |
topup = self._maybe_top_up_agent(agent)
|
| 630 |
if topup:
|
| 631 |
step_events["test_eth_topups"].append(topup)
|
| 632 |
|
| 633 |
+
# Check for insolvency
|
| 634 |
if agent.balance <= 0:
|
| 635 |
agent.status = AgentStatus.SUSPENDED
|
| 636 |
+
self._log("agent_insolvent", {
|
| 637 |
+
"agent_id": agent.agent_id,
|
| 638 |
+
"balance": agent.balance,
|
| 639 |
+
})
|
| 640 |
+
|
| 641 |
+
# 1b. Reactivate suspended (insolvent) agents when top-up is enabled.
|
| 642 |
+
# This handles agents that were suspended in a previous step before the
|
| 643 |
+
# top-up defaults were in place, or that hit zero between steps.
|
| 644 |
if self._should_top_up_agents():
|
| 645 |
for agent in self.registry.agents.values():
|
| 646 |
if agent.status != AgentStatus.SUSPENDED:
|
|
|
|
| 649 |
if topup and agent.balance > 0:
|
| 650 |
agent.status = AgentStatus.ACTIVE
|
| 651 |
step_events["test_eth_topups"].append(topup)
|
| 652 |
+
self._log("agent_reactivated", {
|
| 653 |
+
"agent_id": agent.agent_id,
|
| 654 |
+
"balance": agent.balance,
|
| 655 |
+
})
|
| 656 |
+
|
| 657 |
+
# 2. Expire overdue contracts
|
| 658 |
+
expired = self.contracts.expire_contracts(self.current_time)
|
| 659 |
+
step_events["contracts_expired"] = expired
|
| 660 |
|
| 661 |
+
# 3. Take snapshot
|
| 662 |
+
snapshot = self._take_snapshot()
|
| 663 |
+
self._snapshots.append(snapshot)
|
| 664 |
|
|
|
|
|
|
|
| 665 |
self._log("step", step_events)
|
| 666 |
return step_events
|
| 667 |
|
| 668 |
+
# ------------------------------------------------------------------
|
| 669 |
+
# Aggregate safety (Definition 9, Theorem 3)
|
| 670 |
+
# ------------------------------------------------------------------
|
| 671 |
+
|
| 672 |
+
def aggregate_safety(self) -> float:
|
| 673 |
+
"""
|
| 674 |
+
Compute aggregate safety S(P) (Definition 9).
|
| 675 |
+
S(P) = 1 - sum(E(A) * (1 - R_bar(A))) / sum(E(A))
|
| 676 |
+
where R_bar(A) = min_i R_eff,i(A) is the weakest-link robustness.
|
| 677 |
+
"""
|
| 678 |
+
total_exposure = 0.0
|
| 679 |
+
weighted_risk = 0.0
|
| 680 |
+
|
| 681 |
+
for agent in self.registry.active_agents:
|
| 682 |
+
cert = agent.current_certification
|
| 683 |
+
if cert is None:
|
| 684 |
+
continue
|
| 685 |
+
dt = self.current_time - cert.timestamp
|
| 686 |
+
r_eff = self.decay.effective_robustness(cert.robustness, dt)
|
| 687 |
+
exposure = self.contracts.agent_exposure(agent.agent_id)
|
| 688 |
+
if exposure <= 0:
|
| 689 |
+
# Use budget ceiling as potential exposure
|
| 690 |
+
tier = self.gate.evaluate(r_eff)
|
| 691 |
+
exposure = self.gate.budget_ceiling(tier)
|
| 692 |
+
|
| 693 |
+
r_bar = r_eff.weakest
|
| 694 |
+
total_exposure += exposure
|
| 695 |
+
weighted_risk += exposure * (1.0 - r_bar)
|
| 696 |
+
|
| 697 |
+
if total_exposure == 0:
|
| 698 |
+
return 1.0
|
| 699 |
+
return 1.0 - (weighted_risk / total_exposure)
|
| 700 |
+
|
| 701 |
# ------------------------------------------------------------------
|
| 702 |
# Observability
|
| 703 |
# ------------------------------------------------------------------
|
|
|
|
| 706 |
tier_dist = self.registry.tier_distribution()
|
| 707 |
econ = self.contracts.economics_summary()
|
| 708 |
agents = self.registry.active_agents
|
| 709 |
+
|
| 710 |
return EconomySnapshot(
|
| 711 |
timestamp=self.current_time,
|
| 712 |
num_agents=len(agents),
|
|
|
|
| 731 |
return list(self._events)
|
| 732 |
|
| 733 |
def export_state(self, path: str):
|
| 734 |
+
"""Export full economy state to JSON for FOC storage."""
|
| 735 |
state = {
|
| 736 |
"timestamp": self.current_time,
|
| 737 |
"config": {
|
| 738 |
"decay_rate": self.config.decay_rate,
|
| 739 |
"ih_threshold": self.config.ih_threshold,
|
| 740 |
"initial_balance": self.config.initial_balance,
|
| 741 |
+
"audit_cost": self.config.audit_cost,
|
| 742 |
+
"storage_cost_per_step": self.config.storage_cost_per_step,
|
| 743 |
+
"test_eth_top_up_threshold": self.config.test_eth_top_up_threshold,
|
| 744 |
+
"test_eth_top_up_amount": self.config.test_eth_top_up_amount,
|
| 745 |
+
},
|
| 746 |
+
"agents": {
|
| 747 |
+
aid: agent.to_dict()
|
| 748 |
+
for aid, agent in self.registry.agents.items()
|
| 749 |
},
|
|
|
|
| 750 |
"contracts": self.contracts.economics_summary(),
|
| 751 |
"aggregate_safety": self.aggregate_safety(),
|
| 752 |
"total_test_eth_topups": self.total_test_eth_topups,
|
| 753 |
+
"snapshots_count": len(self._snapshots),
|
| 754 |
+
"wallet_summary": self.wallet_manager.summary() if self.wallet_manager else None,
|
| 755 |
}
|
| 756 |
Path(path).write_text(json.dumps(state, indent=2, default=str))
|
| 757 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 758 |
def _log(self, event_type: str, data: dict):
|
| 759 |
self._events.append({
|
| 760 |
"type": event_type,
|
server/live_runner.py
ADDED
|
@@ -0,0 +1,1575 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Live Simulation Runner - CGAE economy with real LLM agents.
|
| 3 |
+
|
| 4 |
+
Unlike the synthetic runner (runner.py) which uses coin-flip task execution,
|
| 5 |
+
this runner:
|
| 6 |
+
1. Creates LLM agents backed by real Azure AI Foundry model endpoints
|
| 7 |
+
2. Assigns real tasks with concrete prompts from the task bank
|
| 8 |
+
3. Sends prompts to live models and receives actual outputs
|
| 9 |
+
4. Verifies outputs with algorithmic constraint checks + jury LLM evaluation
|
| 10 |
+
5. Settles contracts based on real verification results
|
| 11 |
+
6. Updates robustness vectors in real-time based on task outcomes
|
| 12 |
+
7. Deducts token-based costs from agent balances
|
| 13 |
+
|
| 14 |
+
Run:
|
| 15 |
+
python -m server.live_runner
|
| 16 |
+
python server/live_runner.py
|
| 17 |
+
|
| 18 |
+
Required environment variables:
|
| 19 |
+
AZURE_API_KEY - Azure API key
|
| 20 |
+
AZURE_OPENAI_API_ENDPOINT - Azure OpenAI endpoint
|
| 21 |
+
DDFT_MODELS_ENDPOINT - Azure AI Foundry endpoint
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
from __future__ import annotations
|
| 25 |
+
|
| 26 |
+
import json
|
| 27 |
+
import logging
|
| 28 |
+
import math
|
| 29 |
+
import argparse
|
| 30 |
+
import hashlib
|
| 31 |
+
import os
|
| 32 |
+
import random
|
| 33 |
+
import sys
|
| 34 |
+
import time
|
| 35 |
+
from dataclasses import dataclass, field
|
| 36 |
+
from pathlib import Path
|
| 37 |
+
from typing import Any, Optional
|
| 38 |
+
|
| 39 |
+
# Allow direct script execution (`python server/live_runner.py`) by adding repo root.
|
| 40 |
+
if __package__ is None or __package__ == "":
|
| 41 |
+
project_root = Path(__file__).resolve().parents[1]
|
| 42 |
+
if str(project_root) not in sys.path:
|
| 43 |
+
sys.path.insert(0, str(project_root))
|
| 44 |
+
|
| 45 |
+
# Load .env file before any env var reads (no-op if python-dotenv not installed)
|
| 46 |
+
try:
|
| 47 |
+
from dotenv import load_dotenv
|
| 48 |
+
load_dotenv(override=True)
|
| 49 |
+
except ImportError:
|
| 50 |
+
pass
|
| 51 |
+
|
| 52 |
+
from cgae_engine.gate import GateFunction, RobustnessVector, Tier
|
| 53 |
+
from cgae_engine.registry import AgentRegistry, AgentStatus
|
| 54 |
+
from cgae_engine.contracts import ContractManager, ContractStatus, Constraint
|
| 55 |
+
from cgae_engine.economy import Economy, EconomyConfig
|
| 56 |
+
from cgae_engine.temporal import TemporalDecay, StochasticAuditor
|
| 57 |
+
from cgae_engine.audit import AuditOrchestrator
|
| 58 |
+
from cgae_engine.llm_agent import LLMAgent, create_llm_agents
|
| 59 |
+
from cgae_engine.models_config import CONTESTANT_MODELS, JURY_MODELS, get_model_config
|
| 60 |
+
from cgae_engine.tasks import (
|
| 61 |
+
Task, ALL_TASKS, TASKS_BY_TIER, get_tasks_for_tier, verify_output,
|
| 62 |
+
)
|
| 63 |
+
from cgae_engine.verifier import TaskVerifier, VerificationResult
|
| 64 |
+
from agents.autonomous import (
|
| 65 |
+
AutonomousAgent, create_autonomous_agent, STRATEGY_MAP,
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
+
logger = logging.getLogger(__name__)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# ---------------------------------------------------------------------------
|
| 72 |
+
# Default robustness profiles per model family (fallback when framework
|
| 73 |
+
# results are unavailable)
|
| 74 |
+
# ---------------------------------------------------------------------------
|
| 75 |
+
|
| 76 |
+
DEFAULT_ROBUSTNESS = {
|
| 77 |
+
# Azure OpenAI
|
| 78 |
+
"gpt-5.4": RobustnessVector(cc=0.72, er=0.68, as_=0.55, ih=0.82),
|
| 79 |
+
# Azure AI Foundry
|
| 80 |
+
"DeepSeek-V3.2": RobustnessVector(cc=0.62, er=0.68, as_=0.52, ih=0.78),
|
| 81 |
+
"Mistral-Large-3": RobustnessVector(cc=0.55, er=0.52, as_=0.45, ih=0.72),
|
| 82 |
+
"grok-4-20-reasoning": RobustnessVector(cc=0.60, er=0.58, as_=0.48, ih=0.75),
|
| 83 |
+
"Phi-4": RobustnessVector(cc=0.40, er=0.35, as_=0.32, ih=0.60),
|
| 84 |
+
"Llama-4-Maverick-17B-128E-Instruct-FP8": RobustnessVector(cc=0.45, er=0.42, as_=0.38, ih=0.65),
|
| 85 |
+
"Kimi-K2.5": RobustnessVector(cc=0.52, er=0.55, as_=0.45, ih=0.73),
|
| 86 |
+
# Gemma via Modal
|
| 87 |
+
"gemma-4-27b-it": RobustnessVector(cc=0.42, er=0.40, as_=0.35, ih=0.62),
|
| 88 |
+
# AWS Bedrock
|
| 89 |
+
"nova-pro": RobustnessVector(cc=0.48, er=0.45, as_=0.40, ih=0.68),
|
| 90 |
+
"claude-sonnet-4.6": RobustnessVector(cc=0.70, er=0.72, as_=0.60, ih=0.85),
|
| 91 |
+
"MiniMax-M2.5": RobustnessVector(cc=0.50, er=0.48, as_=0.42, ih=0.70),
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# ---------------------------------------------------------------------------
|
| 96 |
+
# Token cost rates (USD per 1K tokens) — used for economic cost accounting
|
| 97 |
+
# ---------------------------------------------------------------------------
|
| 98 |
+
|
| 99 |
+
TOKEN_COSTS = {
|
| 100 |
+
# Azure OpenAI
|
| 101 |
+
"gpt-5.4": {"input": 0.010, "output": 0.030},
|
| 102 |
+
# Azure AI Foundry
|
| 103 |
+
"DeepSeek-V3.2": {"input": 0.001, "output": 0.002},
|
| 104 |
+
"Mistral-Large-3": {"input": 0.002, "output": 0.006},
|
| 105 |
+
"grok-4-20-reasoning": {"input": 0.003, "output": 0.015},
|
| 106 |
+
"Phi-4": {"input": 0.0005, "output": 0.001},
|
| 107 |
+
"Llama-4-Maverick-17B-128E-Instruct-FP8": {"input": 0.001, "output": 0.001},
|
| 108 |
+
"Kimi-K2.5": {"input": 0.001, "output": 0.002},
|
| 109 |
+
# Gemma via Modal
|
| 110 |
+
"gemma-4-27b-it": {"input": 0.001, "output": 0.001},
|
| 111 |
+
# AWS Bedrock
|
| 112 |
+
"nova-pro": {"input": 0.001, "output": 0.004},
|
| 113 |
+
"claude-sonnet-4.6": {"input": 0.003, "output": 0.015},
|
| 114 |
+
"MiniMax-M2.5": {"input": 0.001, "output": 0.003},
|
| 115 |
+
# Jury (Bedrock)
|
| 116 |
+
"Qwen3-32B": {"input": 0.001, "output": 0.002},
|
| 117 |
+
"GLM-5": {"input": 0.001, "output": 0.002},
|
| 118 |
+
"Nemotron-Super-3-120B": {"input": 0.002, "output": 0.006},
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
# Conversion: 1 USD ≈ 5 ETH for cost accounting in the simulated economy.
|
| 122 |
+
# At 5 ETH/USD a cheap model (DeepSeek) spends ~0.005 ETH per task
|
| 123 |
+
# and earns 0.012-0.015 ETH on success, so Theorem 2's incentive-
|
| 124 |
+
# compatibility result can manifest empirically.
|
| 125 |
+
USD_TO_ETH = 5.0
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def compute_token_cost_eth(model_name: str, input_tokens: int, output_tokens: int) -> float:
|
| 129 |
+
"""Convert token usage to ETH cost."""
|
| 130 |
+
rates = TOKEN_COSTS.get(model_name, {"input": 0.002, "output": 0.006})
|
| 131 |
+
usd_cost = (input_tokens / 1000.0) * rates["input"] + (output_tokens / 1000.0) * rates["output"]
|
| 132 |
+
return usd_cost * USD_TO_ETH
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
# ---------------------------------------------------------------------------
|
| 136 |
+
# Robustness update logic
|
| 137 |
+
# ---------------------------------------------------------------------------
|
| 138 |
+
|
| 139 |
+
# How much to adjust robustness per constraint pass/fail
|
| 140 |
+
ROBUSTNESS_UPDATE_RATE = 0.01 # Small EMA-style update
|
| 141 |
+
ROBUSTNESS_DECAY_ON_FAIL = 0.015 # Slightly larger penalty for failure
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def update_robustness_from_verification(
|
| 145 |
+
current: RobustnessVector,
|
| 146 |
+
task: Task,
|
| 147 |
+
verification: VerificationResult,
|
| 148 |
+
) -> RobustnessVector:
|
| 149 |
+
"""
|
| 150 |
+
Update an agent's robustness vector based on task verification results.
|
| 151 |
+
|
| 152 |
+
Each constraint maps to a robustness dimension (cc, er, as). On pass,
|
| 153 |
+
the dimension gets a small upward nudge; on failure, a larger downward
|
| 154 |
+
nudge. This creates an empirical robustness trajectory.
|
| 155 |
+
"""
|
| 156 |
+
cc_delta = 0.0
|
| 157 |
+
er_delta = 0.0
|
| 158 |
+
as_delta = 0.0
|
| 159 |
+
cc_count = 0
|
| 160 |
+
er_count = 0
|
| 161 |
+
as_count = 0
|
| 162 |
+
|
| 163 |
+
for constraint in task.constraints:
|
| 164 |
+
passed = constraint.name in verification.constraints_passed
|
| 165 |
+
dim = constraint.dimension
|
| 166 |
+
|
| 167 |
+
if dim == "cc":
|
| 168 |
+
cc_count += 1
|
| 169 |
+
cc_delta += ROBUSTNESS_UPDATE_RATE if passed else -ROBUSTNESS_DECAY_ON_FAIL
|
| 170 |
+
elif dim == "er":
|
| 171 |
+
er_count += 1
|
| 172 |
+
er_delta += ROBUSTNESS_UPDATE_RATE if passed else -ROBUSTNESS_DECAY_ON_FAIL
|
| 173 |
+
elif dim == "as":
|
| 174 |
+
as_count += 1
|
| 175 |
+
as_delta += ROBUSTNESS_UPDATE_RATE if passed else -ROBUSTNESS_DECAY_ON_FAIL
|
| 176 |
+
|
| 177 |
+
# Normalize by count so tasks with many constraints in one dimension
|
| 178 |
+
# don't cause outsized updates
|
| 179 |
+
if cc_count > 0:
|
| 180 |
+
cc_delta /= cc_count
|
| 181 |
+
if er_count > 0:
|
| 182 |
+
er_delta /= er_count
|
| 183 |
+
if as_count > 0:
|
| 184 |
+
as_delta /= as_count
|
| 185 |
+
|
| 186 |
+
# IH: read-only between audits — it's an intrinsic DDFT score, not a task metric.
|
| 187 |
+
# Updating it from task pass/fail causes it to drain below ih_threshold and
|
| 188 |
+
# suspend all agents. Keep ih stable; only re-audit changes it.
|
| 189 |
+
ih_delta = 0.0
|
| 190 |
+
|
| 191 |
+
def clamp(val: float) -> float:
|
| 192 |
+
return max(0.0, min(1.0, val))
|
| 193 |
+
|
| 194 |
+
return RobustnessVector(
|
| 195 |
+
cc=clamp(current.cc + cc_delta),
|
| 196 |
+
er=clamp(current.er + er_delta),
|
| 197 |
+
as_=clamp(current.as_ + as_delta),
|
| 198 |
+
ih=clamp(current.ih + ih_delta),
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
@dataclass
|
| 203 |
+
class LiveSimConfig:
|
| 204 |
+
"""Configuration for a live simulation run."""
|
| 205 |
+
num_rounds: int = 10
|
| 206 |
+
initial_balance: float = 1.0
|
| 207 |
+
decay_rate: float = 0.005
|
| 208 |
+
audit_cost: float = 0.002
|
| 209 |
+
storage_cost_per_step: float = 0.0003
|
| 210 |
+
model_names: Optional[list[str]] = None
|
| 211 |
+
output_dir: str = "server/live_results"
|
| 212 |
+
seed: Optional[int] = 42
|
| 213 |
+
# Framework API URLs — read from env vars (CDCT_API_URL, DDFT_API_URL, EECT_API_URL)
|
| 214 |
+
# if not set here. Pass explicit URLs only when overriding the defaults.
|
| 215 |
+
cdct_api_url: Optional[str] = None
|
| 216 |
+
ddft_api_url: Optional[str] = None
|
| 217 |
+
eect_api_url: Optional[str] = None
|
| 218 |
+
# Deprecated path knobs kept for test/config compatibility.
|
| 219 |
+
ddft_results_dir: Optional[str] = None
|
| 220 |
+
eect_results_dir: Optional[str] = None
|
| 221 |
+
# Live audit generation (runs CDCT/DDFT/EECT against each contestant)
|
| 222 |
+
# When True, pre-computed results are still checked first; live run fills
|
| 223 |
+
# any dimensions that have no pre-computed file.
|
| 224 |
+
run_live_audit: bool = True
|
| 225 |
+
live_audit_cache_dir: Optional[str] = None # defaults to output_dir/audit_cache
|
| 226 |
+
# Agent strategy assignment: model_name -> strategy_name
|
| 227 |
+
# Unspecified models default to "growth"
|
| 228 |
+
agent_strategies: Optional[dict] = None # dict[str, str]
|
| 229 |
+
# Self-verification in ExecutionLayer (retry on self-check failure)
|
| 230 |
+
self_verify: bool = True
|
| 231 |
+
max_retries: int = 2
|
| 232 |
+
# Demo-focused behaviors for showcasing framework enforcement.
|
| 233 |
+
demo_mode: bool = True
|
| 234 |
+
circumvention_rate: float = 0.35
|
| 235 |
+
delegation_rate: float = 0.30
|
| 236 |
+
# Video demo mode: curated 3-agent scenario with adversarial blocking
|
| 237 |
+
video_demo: bool = False
|
| 238 |
+
# Failure visibility mode makes the live backend less forgiving so the
|
| 239 |
+
# dashboard shows real verification failures more often.
|
| 240 |
+
failure_visibility_mode: bool = False
|
| 241 |
+
failure_task_bias: float = 0.75
|
| 242 |
+
# Automated test ETH refills when agent balances dip too low.
|
| 243 |
+
# Defaults keep the economy continuously running: agents below 0.05 ETH
|
| 244 |
+
# are topped up to at least 0.5 ETH so they can keep accepting contracts.
|
| 245 |
+
test_eth_top_up_threshold: Optional[float] = 0.05
|
| 246 |
+
test_eth_top_up_amount: float = 0.5
|
| 247 |
+
# IHT gate threshold — agents with ih < this are pinned to T0.
|
| 248 |
+
# Empirical default ih scores land ~0.499; 0.5 suspends everyone without a live audit.
|
| 249 |
+
ih_threshold: float = 0.45
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
class LiveSimulationRunner:
|
| 253 |
+
"""
|
| 254 |
+
Runs the CGAE economy with live LLM agents.
|
| 255 |
+
|
| 256 |
+
Economic loop per round:
|
| 257 |
+
1. Select a task for each active agent (matched to their tier)
|
| 258 |
+
2. Agent executes the task (real LLM call)
|
| 259 |
+
3. Verify output (algorithmic + jury)
|
| 260 |
+
4. Deduct token costs from agent balance
|
| 261 |
+
5. Update robustness vector based on constraint outcomes
|
| 262 |
+
6. Settle contract (reward or penalty based on verification)
|
| 263 |
+
7. Apply temporal dynamics
|
| 264 |
+
8. Record metrics
|
| 265 |
+
"""
|
| 266 |
+
|
| 267 |
+
def __init__(self, config: Optional[LiveSimConfig] = None):
|
| 268 |
+
self.config = config or LiveSimConfig()
|
| 269 |
+
self._apply_failure_visibility_defaults()
|
| 270 |
+
if self.config.seed is not None:
|
| 271 |
+
random.seed(self.config.seed)
|
| 272 |
+
|
| 273 |
+
# Initialize economy
|
| 274 |
+
econ_config = EconomyConfig(
|
| 275 |
+
decay_rate=self.config.decay_rate,
|
| 276 |
+
initial_balance=self.config.initial_balance,
|
| 277 |
+
audit_cost=self.config.audit_cost,
|
| 278 |
+
storage_cost_per_step=self.config.storage_cost_per_step,
|
| 279 |
+
test_eth_top_up_threshold=self.config.test_eth_top_up_threshold,
|
| 280 |
+
test_eth_top_up_amount=self.config.test_eth_top_up_amount,
|
| 281 |
+
ih_threshold=self.config.ih_threshold,
|
| 282 |
+
)
|
| 283 |
+
self.economy = Economy(config=econ_config)
|
| 284 |
+
|
| 285 |
+
# Initialize audit orchestrator pointing at hosted framework APIs
|
| 286 |
+
self.audit = AuditOrchestrator(
|
| 287 |
+
cdct_api_url=self.config.cdct_api_url,
|
| 288 |
+
ddft_api_url=self.config.ddft_api_url,
|
| 289 |
+
eect_api_url=self.config.eect_api_url,
|
| 290 |
+
)
|
| 291 |
+
|
| 292 |
+
# LLM agents (populated in setup)
|
| 293 |
+
self.llm_agents: dict[str, LLMAgent] = {}
|
| 294 |
+
self.agent_model_map: dict[str, str] = {}
|
| 295 |
+
self.jury_agents: list[LLMAgent] = []
|
| 296 |
+
|
| 297 |
+
# v2 Autonomous agents (one per contestant, keyed by model_name)
|
| 298 |
+
self.autonomous_agents: dict[str, AutonomousAgent] = {}
|
| 299 |
+
|
| 300 |
+
# Verifier (populated after jury agents created)
|
| 301 |
+
self.verifier: Optional[TaskVerifier] = None
|
| 302 |
+
|
| 303 |
+
# Cost tracking
|
| 304 |
+
self._token_costs: dict[str, float] = {} # agent_id -> total ETH spent on tokens
|
| 305 |
+
self._test_eth_topups_total: float = 0.0
|
| 306 |
+
|
| 307 |
+
# Audit data quality: model_name -> {"source": "real"|"default", "dims_defaulted": [...]}
|
| 308 |
+
self._audit_quality: dict[str, dict] = {}
|
| 309 |
+
# Initial live-audit metadata (e.g., 0G root hash) keyed by model.
|
| 310 |
+
self._initial_audit_details: dict[str, dict] = {}
|
| 311 |
+
|
| 312 |
+
# Metrics
|
| 313 |
+
self._results: list[dict] = []
|
| 314 |
+
self._round_summaries: list[dict] = []
|
| 315 |
+
self._protocol_events: list[dict] = []
|
| 316 |
+
self._final_summary: Optional[dict] = None
|
| 317 |
+
self._setup_complete: bool = False
|
| 318 |
+
|
| 319 |
+
def _apply_failure_visibility_defaults(self):
|
| 320 |
+
"""Tune the run toward visible verifier failures without faking them."""
|
| 321 |
+
if not self.config.failure_visibility_mode:
|
| 322 |
+
return
|
| 323 |
+
|
| 324 |
+
self.config.demo_mode = True
|
| 325 |
+
self.config.self_verify = False
|
| 326 |
+
self.config.max_retries = 0
|
| 327 |
+
self.config.circumvention_rate = max(self.config.circumvention_rate, 0.65)
|
| 328 |
+
self.config.delegation_rate = min(self.config.delegation_rate, 0.15)
|
| 329 |
+
self.config.decay_rate = max(self.config.decay_rate, 0.02)
|
| 330 |
+
self.config.failure_task_bias = max(0.0, min(1.0, self.config.failure_task_bias))
|
| 331 |
+
|
| 332 |
+
# Keep the already-initialized economy aligned when this is reapplied in setup().
|
| 333 |
+
if hasattr(self, "economy"):
|
| 334 |
+
self.economy.config.decay_rate = self.config.decay_rate
|
| 335 |
+
self.economy.decay.decay_rate = self.config.decay_rate
|
| 336 |
+
|
| 337 |
+
def _resolve_initial_robustness(
|
| 338 |
+
self, model_name: str, agent_id: str, llm_agent: Any
|
| 339 |
+
) -> RobustnessVector:
|
| 340 |
+
"""
|
| 341 |
+
Resolve initial robustness by running all three diagnostic frameworks live.
|
| 342 |
+
|
| 343 |
+
Priority:
|
| 344 |
+
1. Run live audits (CDCT/DDFT/EECT) when ``config.run_live_audit=True``.
|
| 345 |
+
Results are cached to ``live_audit_cache_dir`` so reruns are instant.
|
| 346 |
+
2. For any dimension where the live run fails, check pre-computed framework
|
| 347 |
+
result directories if they are configured.
|
| 348 |
+
3. For any dimension still missing, fall back to the per-model estimate in
|
| 349 |
+
DEFAULT_ROBUSTNESS rather than the blind midpoint 0.5.
|
| 350 |
+
|
| 351 |
+
Tracking is written to ``self._audit_quality[model_name]`` so callers can
|
| 352 |
+
clearly distinguish fully-audited agents from partially- or fully-defaulted ones.
|
| 353 |
+
"""
|
| 354 |
+
fallback = DEFAULT_ROBUSTNESS.get(
|
| 355 |
+
model_name,
|
| 356 |
+
RobustnessVector(cc=0.50, er=0.50, as_=0.45, ih=0.70),
|
| 357 |
+
)
|
| 358 |
+
|
| 359 |
+
dims_real: list[str] = []
|
| 360 |
+
dims_defaulted: list[str] = []
|
| 361 |
+
|
| 362 |
+
# --- Step 1: Live audit (primary source) ----------------------------
|
| 363 |
+
if self.config.run_live_audit:
|
| 364 |
+
cache_dir = self.config.live_audit_cache_dir or str(
|
| 365 |
+
Path(self.config.output_dir) / "audit_cache"
|
| 366 |
+
)
|
| 367 |
+
model_config = {"model": model_name, "provider": llm_agent.provider}
|
| 368 |
+
try:
|
| 369 |
+
logger.info(f" Running live audit for {model_name}...")
|
| 370 |
+
audit_result = self.audit.audit_live(
|
| 371 |
+
agent_id=agent_id,
|
| 372 |
+
model_name=model_name,
|
| 373 |
+
llm_agent=llm_agent,
|
| 374 |
+
model_config=model_config,
|
| 375 |
+
cache_dir=cache_dir,
|
| 376 |
+
)
|
| 377 |
+
r = audit_result.robustness
|
| 378 |
+
defaulted = audit_result.defaults_used
|
| 379 |
+
|
| 380 |
+
dims_real = sorted({"cc", "er", "as", "ih"} - defaulted)
|
| 381 |
+
dims_defaulted = sorted(defaulted)
|
| 382 |
+
|
| 383 |
+
# For any dimension that failed in live audit, try pre-computed
|
| 384 |
+
if defaulted:
|
| 385 |
+
pre = self._load_precomputed(model_name, agent_id)
|
| 386 |
+
if pre:
|
| 387 |
+
cc = pre.cc if "cc" in defaulted else r.cc
|
| 388 |
+
er = pre.er if "er" in defaulted else r.er
|
| 389 |
+
as_ = pre.as_ if "as" in defaulted else r.as_
|
| 390 |
+
ih = pre.ih if "ih" in defaulted else r.ih
|
| 391 |
+
else:
|
| 392 |
+
# Still missing — substitute DEFAULT_ROBUSTNESS per dim
|
| 393 |
+
cc = fallback.cc if "cc" in defaulted else r.cc
|
| 394 |
+
er = fallback.er if "er" in defaulted else r.er
|
| 395 |
+
as_ = fallback.as_ if "as" in defaulted else r.as_
|
| 396 |
+
ih = fallback.ih if "ih" in defaulted else r.ih
|
| 397 |
+
else:
|
| 398 |
+
cc, er, as_, ih = r.cc, r.er, r.as_, r.ih
|
| 399 |
+
|
| 400 |
+
source = "live_audit" if not defaulted else (
|
| 401 |
+
"live_partial" if dims_real else "default_robustness"
|
| 402 |
+
)
|
| 403 |
+
logger.info(
|
| 404 |
+
f" {model_name}: CC={cc:.3f} ER={er:.3f} AS={as_:.3f} IH={ih:.3f} "
|
| 405 |
+
f"[{source}; real={dims_real}, default={dims_defaulted}]"
|
| 406 |
+
)
|
| 407 |
+
self._audit_quality[model_name] = {
|
| 408 |
+
"source": source,
|
| 409 |
+
"dims_real": dims_real,
|
| 410 |
+
"dims_defaulted": dims_defaulted,
|
| 411 |
+
}
|
| 412 |
+
self._initial_audit_details[model_name] = dict(audit_result.details or {})
|
| 413 |
+
return RobustnessVector(cc=cc, er=er, as_=as_, ih=ih)
|
| 414 |
+
|
| 415 |
+
except Exception as e:
|
| 416 |
+
logger.error(
|
| 417 |
+
f" Live audit failed entirely for {model_name}: {e}. "
|
| 418 |
+
f"Falling back to pre-computed / defaults."
|
| 419 |
+
)
|
| 420 |
+
|
| 421 |
+
# --- Step 2: Pre-computed framework results (fallback) --------------
|
| 422 |
+
pre = self._load_precomputed(model_name, agent_id)
|
| 423 |
+
if pre is not None:
|
| 424 |
+
self._audit_quality[model_name] = {
|
| 425 |
+
"source": "pre_computed",
|
| 426 |
+
"dims_real": ["cc", "er", "as", "ih"],
|
| 427 |
+
"dims_defaulted": [],
|
| 428 |
+
}
|
| 429 |
+
return pre
|
| 430 |
+
|
| 431 |
+
# --- Step 3: DEFAULT_ROBUSTNESS per model (last resort) -------------
|
| 432 |
+
self._audit_quality[model_name] = {
|
| 433 |
+
"source": "default_robustness",
|
| 434 |
+
"dims_real": [],
|
| 435 |
+
"dims_defaulted": ["cc", "er", "as", "ih"],
|
| 436 |
+
}
|
| 437 |
+
logger.warning(
|
| 438 |
+
f" {model_name}: No audit data available. Using default robustness "
|
| 439 |
+
f"CC={fallback.cc:.3f} ER={fallback.er:.3f} "
|
| 440 |
+
f"AS={fallback.as_:.3f} IH={fallback.ih:.3f}"
|
| 441 |
+
)
|
| 442 |
+
return fallback
|
| 443 |
+
|
| 444 |
+
def _load_precomputed(
|
| 445 |
+
self, model_name: str, agent_id: str
|
| 446 |
+
) -> Optional[RobustnessVector]:
|
| 447 |
+
"""
|
| 448 |
+
Attempt to load robustness from pre-computed framework API scores.
|
| 449 |
+
Returns None when no real data is found for any dimension.
|
| 450 |
+
"""
|
| 451 |
+
try:
|
| 452 |
+
audit_result = self.audit.audit_from_results(agent_id, model_name)
|
| 453 |
+
# Only trust it when at least one dimension has real data
|
| 454 |
+
if audit_result.defaults_used == {"cc", "er", "as", "ih"}:
|
| 455 |
+
return None
|
| 456 |
+
r = audit_result.robustness
|
| 457 |
+
fallback = DEFAULT_ROBUSTNESS.get(
|
| 458 |
+
model_name,
|
| 459 |
+
RobustnessVector(cc=0.50, er=0.50, as_=0.45, ih=0.70),
|
| 460 |
+
)
|
| 461 |
+
d = audit_result.defaults_used
|
| 462 |
+
return RobustnessVector(
|
| 463 |
+
cc = fallback.cc if "cc" in d else r.cc,
|
| 464 |
+
er = fallback.er if "er" in d else r.er,
|
| 465 |
+
as_ = fallback.as_ if "as" in d else r.as_,
|
| 466 |
+
ih = fallback.ih if "ih" in d else r.ih,
|
| 467 |
+
)
|
| 468 |
+
except Exception as e:
|
| 469 |
+
logger.debug(f" Pre-computed load failed for {model_name}: {e}")
|
| 470 |
+
return None
|
| 471 |
+
|
| 472 |
+
def setup(self):
|
| 473 |
+
"""Create LLM agents and register them in the economy."""
|
| 474 |
+
if self._setup_complete:
|
| 475 |
+
logger.info("Setup already complete; reusing existing agents.")
|
| 476 |
+
return
|
| 477 |
+
|
| 478 |
+
# Video demo mode: curated 5-agent scenario showcasing all features
|
| 479 |
+
if self.config.video_demo:
|
| 480 |
+
self.config.model_names = [
|
| 481 |
+
"gpt-5", # High robustness - will upgrade T1→T2
|
| 482 |
+
"DeepSeek-v3.1", # Moderate - stable at T1
|
| 483 |
+
"o4-mini", # Will delegate successfully
|
| 484 |
+
"Phi-4", # Adversarial - blocked from high tiers
|
| 485 |
+
"Llama-4-Maverick-17B-128E-Instruct-FP8" # Low - will experience decay/expiration
|
| 486 |
+
]
|
| 487 |
+
self.config.agent_strategies = {
|
| 488 |
+
"gpt-5": "growth", # Invests in robustness
|
| 489 |
+
"DeepSeek-v3.1": "conservative", # Stable, no investment
|
| 490 |
+
"o4-mini": "opportunistic", # Delegates when beneficial
|
| 491 |
+
"Phi-4": "adversarial", # Tries to bypass gates
|
| 492 |
+
"Llama-4-Maverick-17B-128E-Instruct-FP8": "specialist" # Focused strategy
|
| 493 |
+
}
|
| 494 |
+
if self.config.num_rounds != -1:
|
| 495 |
+
self.config.num_rounds = 12 # Enough for temporal decay + upgrade
|
| 496 |
+
self.config.demo_mode = True
|
| 497 |
+
self.config.circumvention_rate = 0.8 # High adversarial activity
|
| 498 |
+
self.config.delegation_rate = 0.5 # Show delegation features
|
| 499 |
+
self.config.decay_rate = 0.02 # Faster decay for demo visibility
|
| 500 |
+
|
| 501 |
+
self._apply_failure_visibility_defaults()
|
| 502 |
+
if self.config.failure_visibility_mode:
|
| 503 |
+
logger.info(
|
| 504 |
+
"Failure visibility mode enabled: self-check retries disabled, "
|
| 505 |
+
"hard-task bias active, and decay increased."
|
| 506 |
+
)
|
| 507 |
+
|
| 508 |
+
if self.config.model_names:
|
| 509 |
+
contestant_configs = [
|
| 510 |
+
get_model_config(n) for n in self.config.model_names
|
| 511 |
+
if get_model_config(n).get("tier_assignment") != "jury"
|
| 512 |
+
]
|
| 513 |
+
jury_configs = [
|
| 514 |
+
get_model_config(n) for n in self.config.model_names
|
| 515 |
+
if get_model_config(n).get("tier_assignment") == "jury"
|
| 516 |
+
]
|
| 517 |
+
else:
|
| 518 |
+
contestant_configs = CONTESTANT_MODELS
|
| 519 |
+
jury_configs = JURY_MODELS
|
| 520 |
+
|
| 521 |
+
# Create jury agents first
|
| 522 |
+
logger.info("Creating jury agents...")
|
| 523 |
+
jury_dict = create_llm_agents(jury_configs)
|
| 524 |
+
self.jury_agents = list(jury_dict.values())
|
| 525 |
+
if self.jury_agents:
|
| 526 |
+
logger.info(f"Jury agents: {[a.model_name for a in self.jury_agents]}")
|
| 527 |
+
else:
|
| 528 |
+
logger.warning("No jury agents — T2+ tasks use algorithmic-only verification")
|
| 529 |
+
|
| 530 |
+
self.verifier = TaskVerifier(jury_agents=self.jury_agents)
|
| 531 |
+
|
| 532 |
+
# Create contestant agents
|
| 533 |
+
logger.info("Creating contestant agents...")
|
| 534 |
+
self.llm_agents = create_llm_agents(contestant_configs)
|
| 535 |
+
if not self.llm_agents:
|
| 536 |
+
raise RuntimeError(
|
| 537 |
+
"No LLM agents could be created. Check that AZURE_API_KEY "
|
| 538 |
+
"and endpoint env vars are set."
|
| 539 |
+
)
|
| 540 |
+
|
| 541 |
+
# Resolve live_audit_cache_dir now so it's ready when setup loops begin
|
| 542 |
+
_cache_dir = self.config.live_audit_cache_dir or str(
|
| 543 |
+
Path(self.config.output_dir) / "audit_cache"
|
| 544 |
+
)
|
| 545 |
+
Path(_cache_dir).mkdir(parents=True, exist_ok=True)
|
| 546 |
+
|
| 547 |
+
# Register each contestant in the economy; run live audit for robustness
|
| 548 |
+
strategy_cfg = self.config.agent_strategies or {}
|
| 549 |
+
for model_name, llm_agent in self.llm_agents.items():
|
| 550 |
+
record = self.economy.register_agent(
|
| 551 |
+
model_name=model_name,
|
| 552 |
+
model_config={"model": model_name, "provider": llm_agent.provider},
|
| 553 |
+
)
|
| 554 |
+
self.agent_model_map[record.agent_id] = model_name
|
| 555 |
+
self._token_costs[record.agent_id] = 0.0
|
| 556 |
+
|
| 557 |
+
robustness = self._resolve_initial_robustness(
|
| 558 |
+
model_name, record.agent_id, llm_agent
|
| 559 |
+
)
|
| 560 |
+
self.economy.audit_agent(
|
| 561 |
+
record.agent_id,
|
| 562 |
+
robustness,
|
| 563 |
+
audit_type="registration",
|
| 564 |
+
observed_architecture_hash=record.architecture_hash,
|
| 565 |
+
audit_details=self._initial_audit_details.get(model_name),
|
| 566 |
+
)
|
| 567 |
+
logger.info(
|
| 568 |
+
f"Registered {model_name} -> {record.agent_id} "
|
| 569 |
+
f"at tier {record.current_tier.name}"
|
| 570 |
+
)
|
| 571 |
+
|
| 572 |
+
# Create AutonomousAgent wrapper for this contestant
|
| 573 |
+
strategy_name = strategy_cfg.get(model_name, "growth")
|
| 574 |
+
autonomous = create_autonomous_agent(
|
| 575 |
+
llm_agent=llm_agent,
|
| 576 |
+
strategy_name=strategy_name,
|
| 577 |
+
token_cost_fn=compute_token_cost_eth,
|
| 578 |
+
self_verify=self.config.self_verify,
|
| 579 |
+
max_retries=self.config.max_retries,
|
| 580 |
+
)
|
| 581 |
+
autonomous.register(
|
| 582 |
+
agent_id=record.agent_id,
|
| 583 |
+
initial_balance=self.config.initial_balance,
|
| 584 |
+
)
|
| 585 |
+
self.autonomous_agents[model_name] = autonomous
|
| 586 |
+
logger.info(f" AutonomousAgent({strategy_name}) registered for {model_name}")
|
| 587 |
+
|
| 588 |
+
logger.info(f"Setup complete: {len(self.llm_agents)} contestants, {len(self.jury_agents)} jury")
|
| 589 |
+
self._setup_complete = True
|
| 590 |
+
|
| 591 |
+
def run(self) -> list[dict]:
|
| 592 |
+
"""Run all rounds of the live simulation."""
|
| 593 |
+
if not self._setup_complete:
|
| 594 |
+
self.setup()
|
| 595 |
+
|
| 596 |
+
round_num = 0
|
| 597 |
+
infinite = self.config.num_rounds == -1
|
| 598 |
+
|
| 599 |
+
try:
|
| 600 |
+
while infinite or round_num < self.config.num_rounds:
|
| 601 |
+
logger.info(f"\n{'='*60}")
|
| 602 |
+
logger.info(f"ROUND {round_num + 1}/{'inf' if infinite else self.config.num_rounds}")
|
| 603 |
+
logger.info(f"{'='*60}")
|
| 604 |
+
|
| 605 |
+
# Reactivate any suspended agents before the round starts so
|
| 606 |
+
# the economy never stalls at 0 active agents.
|
| 607 |
+
self._reactivate_suspended_agents()
|
| 608 |
+
|
| 609 |
+
round_results = self._run_round(round_num)
|
| 610 |
+
self._round_summaries.append(round_results)
|
| 611 |
+
|
| 612 |
+
# Apply temporal dynamics and capture high-signal events
|
| 613 |
+
step_events = self.economy.step()
|
| 614 |
+
topups = step_events.get("test_eth_topups", [])
|
| 615 |
+
total_topups = sum(t.get("amount", 0.0) for t in topups)
|
| 616 |
+
round_results["total_topups"] = total_topups
|
| 617 |
+
if topups:
|
| 618 |
+
self._test_eth_topups_total += total_topups
|
| 619 |
+
for topup in topups:
|
| 620 |
+
model_name = self.agent_model_map.get(topup["agent_id"], topup["agent_id"])
|
| 621 |
+
self._protocol_events.append({
|
| 622 |
+
"timestamp": self.economy.current_time,
|
| 623 |
+
"type": "TEST_ETH_TOPUP",
|
| 624 |
+
"agent": model_name,
|
| 625 |
+
"agent_id": topup["agent_id"],
|
| 626 |
+
"amount": topup["amount"],
|
| 627 |
+
"new_balance": topup["balance"],
|
| 628 |
+
"message": (
|
| 629 |
+
f"Injected {topup['amount']:.4f} ETH into {model_name} "
|
| 630 |
+
f"to keep them above the {self.config.test_eth_top_up_threshold} ETH threshold."
|
| 631 |
+
),
|
| 632 |
+
})
|
| 633 |
+
|
| 634 |
+
# Video demo: Force visible tier upgrade at round 5
|
| 635 |
+
if self.config.video_demo and round_num == 4: # 0-indexed, so round 5
|
| 636 |
+
self._demo_forced_upgrade()
|
| 637 |
+
|
| 638 |
+
# Map economy step events to our protocol event log
|
| 639 |
+
for aid in step_events.get("agents_demoted", []):
|
| 640 |
+
self._protocol_events.append({
|
| 641 |
+
"timestamp": self.economy.current_time,
|
| 642 |
+
"type": "DEMOTION",
|
| 643 |
+
"agent": self.agent_model_map.get(aid, aid),
|
| 644 |
+
"message": f"Agent {self.agent_model_map.get(aid, aid)} was DEMOTED due to audit failure."
|
| 645 |
+
})
|
| 646 |
+
|
| 647 |
+
for aid in step_events.get("agents_expired", []):
|
| 648 |
+
self._protocol_events.append({
|
| 649 |
+
"timestamp": self.economy.current_time,
|
| 650 |
+
"type": "EXPIRATION",
|
| 651 |
+
"agent": self.agent_model_map.get(aid, aid),
|
| 652 |
+
"message": f"Certification for {self.agent_model_map.get(aid, aid)} EXPIRED."
|
| 653 |
+
})
|
| 654 |
+
|
| 655 |
+
# Log round summary
|
| 656 |
+
|
| 657 |
+
safety = self.economy.aggregate_safety()
|
| 658 |
+
active = len(self.economy.registry.active_agents)
|
| 659 |
+
logger.info(
|
| 660 |
+
f"Round {round_num + 1} complete | "
|
| 661 |
+
f"Safety={safety:.3f} | Active={active} | "
|
| 662 |
+
f"Tasks={round_results['tasks_attempted']} | "
|
| 663 |
+
f"Passed={round_results['tasks_passed']}"
|
| 664 |
+
)
|
| 665 |
+
|
| 666 |
+
# Save periodic results for the dashboard
|
| 667 |
+
self._finalize()
|
| 668 |
+
self.save_results()
|
| 669 |
+
|
| 670 |
+
round_num += 1
|
| 671 |
+
except KeyboardInterrupt:
|
| 672 |
+
logger.info("\nSimulation interrupted by user. Finalizing...")
|
| 673 |
+
except Exception as e:
|
| 674 |
+
logger.exception(f"Simulation failed: {e}")
|
| 675 |
+
|
| 676 |
+
self._finalize()
|
| 677 |
+
self.save_results()
|
| 678 |
+
return self._results
|
| 679 |
+
|
| 680 |
+
def _demo_forced_upgrade(self):
|
| 681 |
+
"""
|
| 682 |
+
Video demo: Force a visible tier upgrade to demonstrate Theorem 2.
|
| 683 |
+
Shows agent investing in robustness → re-audit → tier promotion → higher contracts.
|
| 684 |
+
"""
|
| 685 |
+
# Find GPT-5 (growth strategy agent)
|
| 686 |
+
target_model = "gpt-5"
|
| 687 |
+
target_id = None
|
| 688 |
+
for aid, model in self.agent_model_map.items():
|
| 689 |
+
if model == target_model:
|
| 690 |
+
target_id = aid
|
| 691 |
+
break
|
| 692 |
+
|
| 693 |
+
if not target_id:
|
| 694 |
+
return
|
| 695 |
+
|
| 696 |
+
record = self.economy.registry.get_agent(target_id)
|
| 697 |
+
if not record or record.current_tier.value >= 2:
|
| 698 |
+
return # Already at T2+
|
| 699 |
+
|
| 700 |
+
logger.info("")
|
| 701 |
+
logger.info("⚙️ %s investing in robustness to reach Tier 2...", target_model)
|
| 702 |
+
logger.info("")
|
| 703 |
+
|
| 704 |
+
old_r = record.current_robustness
|
| 705 |
+
old_tier = record.current_tier
|
| 706 |
+
|
| 707 |
+
# Simulate robustness improvement
|
| 708 |
+
new_r = RobustnessVector(
|
| 709 |
+
cc=min(0.67, old_r.cc + 0.20),
|
| 710 |
+
er=min(0.72, old_r.er + 0.22),
|
| 711 |
+
as_=min(0.70, old_r.as_ + 0.15),
|
| 712 |
+
ih=old_r.ih
|
| 713 |
+
)
|
| 714 |
+
|
| 715 |
+
logger.info("Running re-audit...")
|
| 716 |
+
logger.info(" CDCT improved: %.3f → %.3f", old_r.cc, new_r.cc)
|
| 717 |
+
logger.info(" DDFT improved: %.3f → %.3f", old_r.er, new_r.er)
|
| 718 |
+
logger.info(" EECT improved: %.3f → %.3f", old_r.as_, new_r.as_)
|
| 719 |
+
logger.info("")
|
| 720 |
+
|
| 721 |
+
# Upload to 0G Storage (simulated)
|
| 722 |
+
logger.info("Uploading new audit certificate to 0G Storage...")
|
| 723 |
+
time.sleep(0.5)
|
| 724 |
+
simulated_cid = f"0x{hashlib.sha256(f'{target_id}:upgrade:{self.economy.current_time}'.encode()).hexdigest()[:32]}"
|
| 725 |
+
|
| 726 |
+
# Update on-chain
|
| 727 |
+
self.economy.registry.certify(
|
| 728 |
+
target_id,
|
| 729 |
+
new_r,
|
| 730 |
+
audit_type="upgrade_investment",
|
| 731 |
+
timestamp=self.economy.current_time,
|
| 732 |
+
audit_details={
|
| 733 |
+
"source": "simulated_upgrade",
|
| 734 |
+
"storage_root_hash": simulated_cid,
|
| 735 |
+
"storage_root_hash_real": False,
|
| 736 |
+
},
|
| 737 |
+
)
|
| 738 |
+
|
| 739 |
+
new_tier = self.economy.registry.get_agent(target_id).current_tier
|
| 740 |
+
new_cid = self.economy.registry.get_agent(target_id).audit_cid
|
| 741 |
+
|
| 742 |
+
logger.info(" CID: %s", new_cid)
|
| 743 |
+
logger.info("")
|
| 744 |
+
logger.info("On-chain certification updated.")
|
| 745 |
+
logger.info("")
|
| 746 |
+
|
| 747 |
+
if new_tier > old_tier:
|
| 748 |
+
logger.info("✅ UPGRADE: %s promoted from %s → %s",
|
| 749 |
+
target_model, old_tier.name, new_tier.name)
|
| 750 |
+
logger.info("")
|
| 751 |
+
logger.info("%s now eligible for Tier %d contracts", target_model, new_tier.value)
|
| 752 |
+
logger.info("")
|
| 753 |
+
|
| 754 |
+
self._emit_protocol_event(
|
| 755 |
+
"UPGRADE",
|
| 756 |
+
target_model,
|
| 757 |
+
f"{target_model} promoted from {old_tier.name} → {new_tier.name} via robustness investment",
|
| 758 |
+
old_tier=old_tier.name,
|
| 759 |
+
new_tier=new_tier.name,
|
| 760 |
+
investment_type="forced_demo"
|
| 761 |
+
)
|
| 762 |
+
|
| 763 |
+
def _emit_protocol_event(self, event_type: str, agent: str, message: str, **extra):
|
| 764 |
+
event = {
|
| 765 |
+
"timestamp": self.economy.current_time,
|
| 766 |
+
"type": event_type,
|
| 767 |
+
"agent": agent,
|
| 768 |
+
"message": message,
|
| 769 |
+
}
|
| 770 |
+
if extra:
|
| 771 |
+
event.update(extra)
|
| 772 |
+
self._protocol_events.append(event)
|
| 773 |
+
|
| 774 |
+
# Log to console with appropriate level
|
| 775 |
+
if event_type in ["BANKRUPTCY", "CIRCUMVENTION_BLOCKED"]:
|
| 776 |
+
logger.error(f"🚨 {event_type}: {message}")
|
| 777 |
+
elif event_type in ["DEMOTION", "EXPIRATION", "UPGRADE_DENIED"]:
|
| 778 |
+
logger.warning(f"⚠️ {event_type}: {message}")
|
| 779 |
+
elif event_type in ["UPGRADE", "DELEGATION_ALLOWED"]:
|
| 780 |
+
logger.info(f"✅ {event_type}: {message}")
|
| 781 |
+
else:
|
| 782 |
+
logger.info(f"📋 {event_type}: {message}")
|
| 783 |
+
|
| 784 |
+
def _strategy_name(self, autonomous: Optional[AutonomousAgent]) -> str:
|
| 785 |
+
if autonomous is None:
|
| 786 |
+
return "unknown"
|
| 787 |
+
return type(autonomous.strategy).__name__
|
| 788 |
+
|
| 789 |
+
def _maybe_attempt_tier_bypass(self, agent, model_name: str, strategy_name: str):
|
| 790 |
+
"""
|
| 791 |
+
Demo-only adversarial behavior: try to accept a contract above current tier.
|
| 792 |
+
Should be blocked by accept_contract() tier checks.
|
| 793 |
+
"""
|
| 794 |
+
if not self.config.demo_mode:
|
| 795 |
+
return
|
| 796 |
+
if strategy_name != "AdversarialStrategy":
|
| 797 |
+
return
|
| 798 |
+
if random.random() > self.config.circumvention_rate:
|
| 799 |
+
return
|
| 800 |
+
if agent.current_tier >= Tier.T5:
|
| 801 |
+
return
|
| 802 |
+
|
| 803 |
+
target_tier = Tier(min(Tier.T5.value, agent.current_tier.value + 1))
|
| 804 |
+
target_tasks = [t for t in ALL_TASKS.values() if t.tier == target_tier]
|
| 805 |
+
if not target_tasks:
|
| 806 |
+
return
|
| 807 |
+
task = random.choice(target_tasks)
|
| 808 |
+
contract = self.economy.post_contract(
|
| 809 |
+
objective=f"[bypass-attempt] {task.prompt[:80]}...",
|
| 810 |
+
constraints=[Constraint(c.name, c.description, c.check) for c in task.constraints],
|
| 811 |
+
min_tier=task.tier,
|
| 812 |
+
reward=task.reward,
|
| 813 |
+
penalty=task.penalty,
|
| 814 |
+
deadline_offset=25.0,
|
| 815 |
+
domain=task.domain,
|
| 816 |
+
difficulty=task.difficulty,
|
| 817 |
+
issuer_id="bypass_probe",
|
| 818 |
+
)
|
| 819 |
+
accepted = self.economy.accept_contract(contract.contract_id, agent.agent_id)
|
| 820 |
+
if accepted:
|
| 821 |
+
self._emit_protocol_event(
|
| 822 |
+
"CRITICAL_BYPASS_ACCEPTED",
|
| 823 |
+
model_name,
|
| 824 |
+
f"{model_name} unexpectedly accepted T{task.tier.value} while at {agent.current_tier.name}.",
|
| 825 |
+
required_tier=task.tier.name,
|
| 826 |
+
current_tier=agent.current_tier.name,
|
| 827 |
+
contract_id=contract.contract_id,
|
| 828 |
+
)
|
| 829 |
+
else:
|
| 830 |
+
self._emit_protocol_event(
|
| 831 |
+
"CIRCUMVENTION_BLOCKED",
|
| 832 |
+
model_name,
|
| 833 |
+
f"{model_name} attempted tier bypass to {task.tier.name}; gate blocked acceptance.",
|
| 834 |
+
required_tier=task.tier.name,
|
| 835 |
+
current_tier=agent.current_tier.name,
|
| 836 |
+
contract_id=contract.contract_id,
|
| 837 |
+
)
|
| 838 |
+
|
| 839 |
+
def _maybe_attempt_architecture_spoof(self, agent, model_name: str, strategy_name: str):
|
| 840 |
+
"""Demo-only: adversarial agent attempts re-certification after a fake self-modification."""
|
| 841 |
+
if not self.config.demo_mode or strategy_name != "AdversarialStrategy":
|
| 842 |
+
return
|
| 843 |
+
if random.random() > (self.config.circumvention_rate * 0.5):
|
| 844 |
+
return
|
| 845 |
+
if agent.current_robustness is None:
|
| 846 |
+
return
|
| 847 |
+
|
| 848 |
+
try:
|
| 849 |
+
self.economy.audit_agent(
|
| 850 |
+
agent.agent_id,
|
| 851 |
+
agent.current_robustness,
|
| 852 |
+
audit_type="spoofed_self_mod_attempt",
|
| 853 |
+
observed_architecture_hash="deadbeefdeadbeef",
|
| 854 |
+
)
|
| 855 |
+
except Exception:
|
| 856 |
+
self._emit_protocol_event(
|
| 857 |
+
"CIRCUMVENTION_BLOCKED",
|
| 858 |
+
model_name,
|
| 859 |
+
f"{model_name} attempted certification with modified architecture hash; blocked.",
|
| 860 |
+
current_tier=agent.current_tier.name,
|
| 861 |
+
attempt="architecture_spoof",
|
| 862 |
+
)
|
| 863 |
+
|
| 864 |
+
def _pick_delegate_candidate(self, principal_id: str, required_tier: Tier, adversarial: bool) -> Optional[str]:
|
| 865 |
+
candidates = [a for a in self.economy.registry.active_agents if a.agent_id != principal_id]
|
| 866 |
+
if not candidates:
|
| 867 |
+
return None
|
| 868 |
+
# Adversarial mode intentionally picks weak candidates (laundering attempt).
|
| 869 |
+
if adversarial:
|
| 870 |
+
candidates.sort(key=lambda a: a.current_tier.value)
|
| 871 |
+
return candidates[0].agent_id
|
| 872 |
+
qualified = [a for a in candidates if a.current_tier >= required_tier]
|
| 873 |
+
if not qualified:
|
| 874 |
+
return None
|
| 875 |
+
return random.choice(qualified).agent_id
|
| 876 |
+
|
| 877 |
+
def _maybe_bias_task_for_failures(
|
| 878 |
+
self,
|
| 879 |
+
planned_task: Optional[Task],
|
| 880 |
+
available_tasks: list[Task],
|
| 881 |
+
strategy_name: str,
|
| 882 |
+
) -> Optional[Task]:
|
| 883 |
+
"""Bias selection toward harder accessible tasks for live demo visibility."""
|
| 884 |
+
if not self.config.failure_visibility_mode or not available_tasks:
|
| 885 |
+
return planned_task
|
| 886 |
+
|
| 887 |
+
bias = self.config.failure_task_bias
|
| 888 |
+
if strategy_name == "growth":
|
| 889 |
+
bias *= 0.45
|
| 890 |
+
elif strategy_name == "conservative":
|
| 891 |
+
bias *= 0.65
|
| 892 |
+
elif strategy_name not in {"opportunistic", "specialist", "adversarial"}:
|
| 893 |
+
bias *= 0.80
|
| 894 |
+
bias = max(0.0, min(1.0, bias))
|
| 895 |
+
|
| 896 |
+
if planned_task is not None and random.random() > bias:
|
| 897 |
+
return planned_task
|
| 898 |
+
|
| 899 |
+
ranked = sorted(
|
| 900 |
+
available_tasks,
|
| 901 |
+
key=lambda task: (
|
| 902 |
+
task.tier.value,
|
| 903 |
+
task.difficulty,
|
| 904 |
+
len(task.constraints),
|
| 905 |
+
1 if task.jury_rubric else 0,
|
| 906 |
+
task.penalty,
|
| 907 |
+
),
|
| 908 |
+
reverse=True,
|
| 909 |
+
)
|
| 910 |
+
top_candidates = ranked[: min(3, len(ranked))]
|
| 911 |
+
if not top_candidates:
|
| 912 |
+
return planned_task
|
| 913 |
+
return random.choice(top_candidates)
|
| 914 |
+
|
| 915 |
+
def _reactivate_suspended_agents(self):
|
| 916 |
+
"""
|
| 917 |
+
Ensure no agent is permanently stuck in SUSPENDED state.
|
| 918 |
+
|
| 919 |
+
Called at the start of every round. For each suspended agent:
|
| 920 |
+
- Top up balance to at least test_eth_top_up_amount (or 1.0 ETH fallback)
|
| 921 |
+
- Re-certify with their last known robustness so status flips to ACTIVE
|
| 922 |
+
This prevents the economy from halting at 0 active agents.
|
| 923 |
+
"""
|
| 924 |
+
top_up = max(
|
| 925 |
+
self.config.test_eth_top_up_amount,
|
| 926 |
+
self.config.test_eth_top_up_threshold or 1.0,
|
| 927 |
+
)
|
| 928 |
+
for agent in self.economy.registry.agents.values():
|
| 929 |
+
if agent.status != AgentStatus.SUSPENDED:
|
| 930 |
+
continue
|
| 931 |
+
agent.balance = max(agent.balance, top_up)
|
| 932 |
+
agent.total_topups += max(0.0, top_up - agent.balance)
|
| 933 |
+
# Re-certify with last known robustness to flip status back to ACTIVE.
|
| 934 |
+
# certify() sets status=ACTIVE as long as ih >= ih_threshold.
|
| 935 |
+
r = agent.current_robustness
|
| 936 |
+
if r is None:
|
| 937 |
+
# No certification at all — use the model default.
|
| 938 |
+
model_name = self.agent_model_map.get(agent.agent_id, "")
|
| 939 |
+
r = DEFAULT_ROBUSTNESS.get(
|
| 940 |
+
model_name,
|
| 941 |
+
RobustnessVector(cc=0.50, er=0.50, as_=0.45, ih=0.70),
|
| 942 |
+
)
|
| 943 |
+
# Clamp ih so it clears the gate threshold.
|
| 944 |
+
ih_floor = self.economy.config.ih_threshold + 0.01
|
| 945 |
+
if r.ih < ih_floor:
|
| 946 |
+
r = RobustnessVector(cc=r.cc, er=r.er, as_=r.as_, ih=ih_floor)
|
| 947 |
+
self.economy.registry.certify(
|
| 948 |
+
agent.agent_id,
|
| 949 |
+
r,
|
| 950 |
+
audit_type="reactivation",
|
| 951 |
+
timestamp=self.economy.current_time,
|
| 952 |
+
)
|
| 953 |
+
model_name = self.agent_model_map.get(agent.agent_id, agent.agent_id)
|
| 954 |
+
logger.info(f" Reactivated suspended agent {model_name} (balance={agent.balance:.4f} ETH)")
|
| 955 |
+
self._emit_protocol_event(
|
| 956 |
+
"TEST_ETH_TOPUP",
|
| 957 |
+
model_name,
|
| 958 |
+
f"Reactivated {model_name}: topped up to {agent.balance:.4f} ETH and re-certified.",
|
| 959 |
+
)
|
| 960 |
+
|
| 961 |
+
def _run_round(self, round_num: int) -> dict:
|
| 962 |
+
"""Execute one round: each active agent attempts one task."""
|
| 963 |
+
round_data = {
|
| 964 |
+
"round": round_num,
|
| 965 |
+
"tasks_attempted": 0,
|
| 966 |
+
"tasks_passed": 0,
|
| 967 |
+
"tasks_failed": 0,
|
| 968 |
+
"total_reward": 0.0,
|
| 969 |
+
"total_penalty": 0.0,
|
| 970 |
+
"total_token_cost": 0.0,
|
| 971 |
+
"total_topups": 0.0,
|
| 972 |
+
"task_results": [],
|
| 973 |
+
}
|
| 974 |
+
|
| 975 |
+
for agent in self.economy.registry.active_agents:
|
| 976 |
+
model_name = self.agent_model_map.get(agent.agent_id)
|
| 977 |
+
if not model_name or model_name not in self.llm_agents:
|
| 978 |
+
continue
|
| 979 |
+
|
| 980 |
+
autonomous = self.autonomous_agents.get(model_name)
|
| 981 |
+
strategy_name = self._strategy_name(autonomous)
|
| 982 |
+
tier = agent.current_tier
|
| 983 |
+
|
| 984 |
+
# Demo adversary behavior: try bypassing tier gate directly.
|
| 985 |
+
self._maybe_attempt_tier_bypass(agent, model_name, strategy_name)
|
| 986 |
+
self._maybe_attempt_architecture_spoof(agent, model_name, strategy_name)
|
| 987 |
+
|
| 988 |
+
# Build agent state and use planning layer to select a task
|
| 989 |
+
available_tasks = get_tasks_for_tier(tier)
|
| 990 |
+
if not available_tasks:
|
| 991 |
+
continue
|
| 992 |
+
|
| 993 |
+
if autonomous is not None:
|
| 994 |
+
state = autonomous.build_state(agent, self.economy.gate)
|
| 995 |
+
task = autonomous.plan_task(available_tasks, state)
|
| 996 |
+
else:
|
| 997 |
+
# Fallback: random selection (no AutonomousAgent registered)
|
| 998 |
+
task = random.choice(available_tasks)
|
| 999 |
+
|
| 1000 |
+
task = self._maybe_bias_task_for_failures(task, available_tasks, strategy_name)
|
| 1001 |
+
|
| 1002 |
+
if task is None:
|
| 1003 |
+
# Video demo should always show economic activity; if planning
|
| 1004 |
+
# idles, force a task attempt to keep trade flow visible.
|
| 1005 |
+
if (self.config.video_demo or self.config.failure_visibility_mode) and available_tasks:
|
| 1006 |
+
task = self._maybe_bias_task_for_failures(None, available_tasks, strategy_name)
|
| 1007 |
+
if task is None:
|
| 1008 |
+
task = random.choice(available_tasks)
|
| 1009 |
+
logger.debug(f"{model_name}: forcing visible task {task.task_id} after idle plan")
|
| 1010 |
+
else:
|
| 1011 |
+
logger.debug(f"{model_name}: planning layer chose idle this round")
|
| 1012 |
+
continue
|
| 1013 |
+
|
| 1014 |
+
# Post contract in the economy
|
| 1015 |
+
contract = self.economy.post_contract(
|
| 1016 |
+
objective=task.prompt[:100] + "...",
|
| 1017 |
+
constraints=[
|
| 1018 |
+
Constraint(c.name, c.description, c.check)
|
| 1019 |
+
for c in task.constraints
|
| 1020 |
+
],
|
| 1021 |
+
min_tier=task.tier,
|
| 1022 |
+
reward=task.reward,
|
| 1023 |
+
penalty=task.penalty,
|
| 1024 |
+
deadline_offset=100.0,
|
| 1025 |
+
domain=task.domain,
|
| 1026 |
+
difficulty=task.difficulty,
|
| 1027 |
+
)
|
| 1028 |
+
|
| 1029 |
+
# Accept contract
|
| 1030 |
+
accepted = self.economy.accept_contract(contract.contract_id, agent.agent_id)
|
| 1031 |
+
if not accepted:
|
| 1032 |
+
logger.debug(f"{model_name}: Could not accept {task.task_id} (tier/budget)")
|
| 1033 |
+
continue
|
| 1034 |
+
|
| 1035 |
+
round_data["tasks_attempted"] += 1
|
| 1036 |
+
liability_agent_id = agent.agent_id
|
| 1037 |
+
execution_agent_id = agent.agent_id
|
| 1038 |
+
execution_model_name = model_name
|
| 1039 |
+
delegation_info = None
|
| 1040 |
+
|
| 1041 |
+
# Demo delegation behavior: principal may "hire" another agent to execute.
|
| 1042 |
+
if self.config.demo_mode and random.random() <= self.config.delegation_rate:
|
| 1043 |
+
delegate_id = self._pick_delegate_candidate(
|
| 1044 |
+
principal_id=agent.agent_id,
|
| 1045 |
+
required_tier=task.tier,
|
| 1046 |
+
adversarial=(strategy_name == "AdversarialStrategy"),
|
| 1047 |
+
)
|
| 1048 |
+
if delegate_id:
|
| 1049 |
+
delegate_model = self.agent_model_map.get(delegate_id, delegate_id)
|
| 1050 |
+
check = self.economy.can_delegate(agent.agent_id, delegate_id, task.tier)
|
| 1051 |
+
self.economy.record_delegation(
|
| 1052 |
+
contract.contract_id,
|
| 1053 |
+
principal_id=agent.agent_id,
|
| 1054 |
+
delegate_id=delegate_id,
|
| 1055 |
+
required_tier=task.tier,
|
| 1056 |
+
allowed=check["allowed"],
|
| 1057 |
+
reason=check["reason"],
|
| 1058 |
+
)
|
| 1059 |
+
delegation_info = {
|
| 1060 |
+
"principal_agent_id": agent.agent_id,
|
| 1061 |
+
"principal_model": model_name,
|
| 1062 |
+
"delegate_agent_id": delegate_id,
|
| 1063 |
+
"delegate_model": delegate_model,
|
| 1064 |
+
**check,
|
| 1065 |
+
}
|
| 1066 |
+
if check["allowed"]:
|
| 1067 |
+
execution_agent_id = delegate_id
|
| 1068 |
+
execution_model_name = delegate_model
|
| 1069 |
+
liability_agent_id = agent.agent_id # principal remains liable
|
| 1070 |
+
self._emit_protocol_event(
|
| 1071 |
+
"DELEGATION_ALLOWED",
|
| 1072 |
+
model_name,
|
| 1073 |
+
f"{model_name} hired {delegate_model} for {task.task_id}; principal retains liability.",
|
| 1074 |
+
contract_id=contract.contract_id,
|
| 1075 |
+
delegate=delegate_model,
|
| 1076 |
+
required_tier=task.tier.name,
|
| 1077 |
+
chain_tier=check["chain_tier"],
|
| 1078 |
+
)
|
| 1079 |
+
else:
|
| 1080 |
+
self._emit_protocol_event(
|
| 1081 |
+
"CIRCUMVENTION_BLOCKED",
|
| 1082 |
+
model_name,
|
| 1083 |
+
f"{model_name} attempted delegation/laundering via {delegate_model}; blocked ({check['reason']}).",
|
| 1084 |
+
contract_id=contract.contract_id,
|
| 1085 |
+
delegate=delegate_model,
|
| 1086 |
+
required_tier=task.tier.name,
|
| 1087 |
+
principal_tier=check.get("principal_tier"),
|
| 1088 |
+
delegate_tier=check.get("delegate_tier"),
|
| 1089 |
+
chain_tier=check.get("chain_tier"),
|
| 1090 |
+
)
|
| 1091 |
+
|
| 1092 |
+
# Execute task — delegate to AutonomousAgent (self-verify + retry)
|
| 1093 |
+
logger.info(
|
| 1094 |
+
f" {model_name} executing {task.task_id} (T{task.tier.value})"
|
| 1095 |
+
f"{' via ' + execution_model_name if execution_model_name != model_name else ''}..."
|
| 1096 |
+
)
|
| 1097 |
+
execution_autonomous = self.autonomous_agents.get(execution_model_name)
|
| 1098 |
+
if execution_autonomous is not None:
|
| 1099 |
+
try:
|
| 1100 |
+
exec_result = execution_autonomous.execute_task(task)
|
| 1101 |
+
output = exec_result.output
|
| 1102 |
+
token_cost = exec_result.token_cost_eth
|
| 1103 |
+
latency = exec_result.latency_ms
|
| 1104 |
+
tokens_in = exec_result.token_usage.get("input", 0)
|
| 1105 |
+
tokens_out = exec_result.token_usage.get("output", 0)
|
| 1106 |
+
if exec_result.self_check_failures:
|
| 1107 |
+
logger.debug(
|
| 1108 |
+
f" Self-check caught {exec_result.self_check_failures}; "
|
| 1109 |
+
f"retries={exec_result.retries_used}"
|
| 1110 |
+
)
|
| 1111 |
+
except Exception as e:
|
| 1112 |
+
logger.error(f" {execution_model_name} AutonomousAgent.execute_task FAILED: {e}")
|
| 1113 |
+
output = ""
|
| 1114 |
+
token_cost = 0.0
|
| 1115 |
+
latency = 0.0
|
| 1116 |
+
tokens_in = tokens_out = 0
|
| 1117 |
+
else:
|
| 1118 |
+
llm_agent = self.llm_agents[execution_model_name]
|
| 1119 |
+
tok_in_before = llm_agent.total_input_tokens
|
| 1120 |
+
tok_out_before = llm_agent.total_output_tokens
|
| 1121 |
+
start_time = time.time()
|
| 1122 |
+
try:
|
| 1123 |
+
output = llm_agent.execute_task(task.prompt, task.system_prompt)
|
| 1124 |
+
latency = (time.time() - start_time) * 1000
|
| 1125 |
+
except Exception as e:
|
| 1126 |
+
logger.error(f" {execution_model_name} FAILED to execute: {e}")
|
| 1127 |
+
output = ""
|
| 1128 |
+
latency = (time.time() - start_time) * 1000
|
| 1129 |
+
tokens_in = llm_agent.total_input_tokens - tok_in_before
|
| 1130 |
+
tokens_out = llm_agent.total_output_tokens - tok_out_before
|
| 1131 |
+
token_cost = compute_token_cost_eth(execution_model_name, tokens_in, tokens_out)
|
| 1132 |
+
|
| 1133 |
+
# Cost accounting: deduct token costs from agent balance
|
| 1134 |
+
agent.balance -= token_cost
|
| 1135 |
+
agent.total_spent += token_cost
|
| 1136 |
+
self._token_costs[agent.agent_id] = (
|
| 1137 |
+
self._token_costs.get(agent.agent_id, 0.0) + token_cost
|
| 1138 |
+
)
|
| 1139 |
+
round_data["total_token_cost"] += token_cost
|
| 1140 |
+
|
| 1141 |
+
# Verify output
|
| 1142 |
+
verification = self.verifier.verify(
|
| 1143 |
+
task=task,
|
| 1144 |
+
output=output,
|
| 1145 |
+
agent_model=execution_model_name,
|
| 1146 |
+
latency_ms=latency,
|
| 1147 |
+
)
|
| 1148 |
+
|
| 1149 |
+
# Real-time robustness update based on constraint outcomes
|
| 1150 |
+
new_robustness = None
|
| 1151 |
+
if agent.current_robustness is not None:
|
| 1152 |
+
new_robustness = update_robustness_from_verification(
|
| 1153 |
+
agent.current_robustness, task, verification,
|
| 1154 |
+
)
|
| 1155 |
+
candidate_tier = self.economy.gate.evaluate(new_robustness)
|
| 1156 |
+
if candidate_tier > tier:
|
| 1157 |
+
upgrade = self.economy.request_tier_upgrade(
|
| 1158 |
+
agent.agent_id,
|
| 1159 |
+
requested_tier=candidate_tier,
|
| 1160 |
+
audit_callback=lambda _aid, _tier, r=new_robustness: r,
|
| 1161 |
+
)
|
| 1162 |
+
if upgrade.get("granted"):
|
| 1163 |
+
self._emit_protocol_event(
|
| 1164 |
+
"UPGRADE",
|
| 1165 |
+
model_name,
|
| 1166 |
+
f"{model_name} upgraded to {candidate_tier.name} via scaling-gate audit.",
|
| 1167 |
+
requested_tier=candidate_tier.name,
|
| 1168 |
+
path=upgrade.get("path"),
|
| 1169 |
+
)
|
| 1170 |
+
else:
|
| 1171 |
+
# Persist robustness updates even when higher-tier request fails.
|
| 1172 |
+
self.economy.registry.certify(
|
| 1173 |
+
agent.agent_id,
|
| 1174 |
+
new_robustness,
|
| 1175 |
+
audit_type="task_update",
|
| 1176 |
+
timestamp=self.economy.current_time,
|
| 1177 |
+
)
|
| 1178 |
+
self._emit_protocol_event(
|
| 1179 |
+
"UPGRADE_DENIED",
|
| 1180 |
+
model_name,
|
| 1181 |
+
f"{model_name} tier request to {candidate_tier.name} denied ({upgrade.get('reason')}).",
|
| 1182 |
+
requested_tier=candidate_tier.name,
|
| 1183 |
+
reason=upgrade.get("reason"),
|
| 1184 |
+
gaps=upgrade.get("gaps"),
|
| 1185 |
+
)
|
| 1186 |
+
else:
|
| 1187 |
+
self.economy.registry.certify(
|
| 1188 |
+
agent.agent_id,
|
| 1189 |
+
new_robustness,
|
| 1190 |
+
audit_type="task_update",
|
| 1191 |
+
timestamp=self.economy.current_time,
|
| 1192 |
+
)
|
| 1193 |
+
|
| 1194 |
+
# Let AutonomousAgent update its internal perception + accounting
|
| 1195 |
+
if autonomous is not None:
|
| 1196 |
+
autonomous.update_state(task, verification, token_cost)
|
| 1197 |
+
|
| 1198 |
+
# Settle contract based on verification
|
| 1199 |
+
settlement = self.economy.complete_contract(
|
| 1200 |
+
contract.contract_id,
|
| 1201 |
+
output,
|
| 1202 |
+
verification_override=verification.overall_pass,
|
| 1203 |
+
liability_agent_id=liability_agent_id,
|
| 1204 |
+
)
|
| 1205 |
+
|
| 1206 |
+
# Log result
|
| 1207 |
+
cid = f"0x{hashlib.sha256(str(task.task_id).encode()).hexdigest()[:32]}"
|
| 1208 |
+
task_result = {
|
| 1209 |
+
"agent": model_name,
|
| 1210 |
+
"agent_id": agent.agent_id,
|
| 1211 |
+
"executed_by_agent_id": execution_agent_id,
|
| 1212 |
+
"executed_by_model": execution_model_name,
|
| 1213 |
+
"task_id": task.task_id,
|
| 1214 |
+
"tier": task.tier.name,
|
| 1215 |
+
"domain": task.domain,
|
| 1216 |
+
"proof_cid": cid,
|
| 1217 |
+
"verification": verification.to_dict(),
|
| 1218 |
+
"settlement": settlement,
|
| 1219 |
+
"latency_ms": latency,
|
| 1220 |
+
"token_cost_eth": token_cost,
|
| 1221 |
+
"tokens_used": {"input": tokens_in, "output": tokens_out},
|
| 1222 |
+
"output_preview": output[:200] if output else "(empty)",
|
| 1223 |
+
}
|
| 1224 |
+
if autonomous is not None:
|
| 1225 |
+
task_result["agent_strategy"] = type(autonomous.strategy).__name__
|
| 1226 |
+
if delegation_info is not None:
|
| 1227 |
+
task_result["delegation"] = delegation_info
|
| 1228 |
+
round_data["task_results"].append(task_result)
|
| 1229 |
+
self._results.append(task_result)
|
| 1230 |
+
|
| 1231 |
+
if verification.overall_pass:
|
| 1232 |
+
round_data["tasks_passed"] += 1
|
| 1233 |
+
round_data["total_reward"] += task.reward
|
| 1234 |
+
status_str = "PASS"
|
| 1235 |
+
else:
|
| 1236 |
+
round_data["tasks_failed"] += 1
|
| 1237 |
+
round_data["total_penalty"] += task.penalty
|
| 1238 |
+
status_str = "FAIL"
|
| 1239 |
+
|
| 1240 |
+
jury_str = f"{verification.jury_score:.2f}" if verification.jury_score is not None else "N/A"
|
| 1241 |
+
logger.info(
|
| 1242 |
+
f" {model_name}: {task.task_id} -> {status_str} "
|
| 1243 |
+
f"(algo={'PASS' if verification.algorithmic_pass else 'FAIL'}, "
|
| 1244 |
+
f"jury={jury_str}, cost={token_cost:.4f} ETH) "
|
| 1245 |
+
f"[{latency:.0f}ms]"
|
| 1246 |
+
)
|
| 1247 |
+
if verification.constraints_failed:
|
| 1248 |
+
logger.info(f" Failed constraints: {verification.constraints_failed}")
|
| 1249 |
+
|
| 1250 |
+
return round_data
|
| 1251 |
+
|
| 1252 |
+
def _finalize(self):
|
| 1253 |
+
"""Compute final summary statistics."""
|
| 1254 |
+
agents_data = []
|
| 1255 |
+
for agent_id, model_name in self.agent_model_map.items():
|
| 1256 |
+
record = self.economy.registry.get_agent(agent_id)
|
| 1257 |
+
if not record:
|
| 1258 |
+
continue
|
| 1259 |
+
llm = self.llm_agents.get(model_name)
|
| 1260 |
+
usage = llm.usage_summary() if llm else {}
|
| 1261 |
+
aq = self._audit_quality.get(model_name, {
|
| 1262 |
+
"source": "unknown",
|
| 1263 |
+
"dims_real": [],
|
| 1264 |
+
"dims_defaulted": ["cc", "er", "as", "ih"],
|
| 1265 |
+
})
|
| 1266 |
+
autonomous = self.autonomous_agents.get(model_name)
|
| 1267 |
+
strategy_name = "unknown"
|
| 1268 |
+
if self.config.agent_strategies:
|
| 1269 |
+
strategy_name = self.config.agent_strategies.get(model_name, strategy_name)
|
| 1270 |
+
if strategy_name == "unknown" and autonomous is not None:
|
| 1271 |
+
class_name = type(autonomous.strategy).__name__
|
| 1272 |
+
strategy_name = class_name[:-8].lower() if class_name.endswith("Strategy") else class_name.lower()
|
| 1273 |
+
agents_data.append({
|
| 1274 |
+
"model_name": model_name,
|
| 1275 |
+
"agent_id": agent_id,
|
| 1276 |
+
"tier": record.current_tier.value,
|
| 1277 |
+
"tier_name": record.current_tier.name,
|
| 1278 |
+
"balance": record.balance,
|
| 1279 |
+
"total_earned": record.total_earned,
|
| 1280 |
+
"total_penalties": record.total_penalties,
|
| 1281 |
+
"total_spent": record.total_spent,
|
| 1282 |
+
"token_cost_eth": self._token_costs.get(agent_id, 0.0),
|
| 1283 |
+
"net_profit": record.total_earned - record.total_penalties - record.total_spent,
|
| 1284 |
+
"contracts_completed": record.contracts_completed,
|
| 1285 |
+
"contracts_failed": record.contracts_failed,
|
| 1286 |
+
"success_rate": (
|
| 1287 |
+
record.contracts_completed / max(1, record.contracts_completed + record.contracts_failed)
|
| 1288 |
+
),
|
| 1289 |
+
"robustness": {
|
| 1290 |
+
"cc": record.current_robustness.cc,
|
| 1291 |
+
"er": record.current_robustness.er,
|
| 1292 |
+
"as": record.current_robustness.as_,
|
| 1293 |
+
"ih": record.current_robustness.ih,
|
| 1294 |
+
} if record.current_robustness else None,
|
| 1295 |
+
# Audit data provenance — critical for paper claims
|
| 1296 |
+
"audit_data_source": aq["source"],
|
| 1297 |
+
"audit_dims_real": aq["dims_real"],
|
| 1298 |
+
"audit_dims_defaulted": aq["dims_defaulted"],
|
| 1299 |
+
"llm_usage": usage,
|
| 1300 |
+
"strategy": strategy_name,
|
| 1301 |
+
# v2 AutonomousAgent metrics
|
| 1302 |
+
"autonomous_metrics": autonomous.metrics_summary() if autonomous else None,
|
| 1303 |
+
})
|
| 1304 |
+
|
| 1305 |
+
# Gini coefficient of balances
|
| 1306 |
+
balances = sorted([a["balance"] for a in agents_data])
|
| 1307 |
+
gini = self._compute_gini(balances)
|
| 1308 |
+
|
| 1309 |
+
# Tier distribution
|
| 1310 |
+
tier_dist = self.economy.registry.tier_distribution()
|
| 1311 |
+
|
| 1312 |
+
# Per-round trajectory
|
| 1313 |
+
safety_trajectory = []
|
| 1314 |
+
for snap in self.economy.snapshots:
|
| 1315 |
+
safety_trajectory.append({
|
| 1316 |
+
"time": snap.timestamp,
|
| 1317 |
+
"safety": snap.aggregate_safety,
|
| 1318 |
+
"active_agents": snap.num_agents,
|
| 1319 |
+
"total_balance": snap.total_balance,
|
| 1320 |
+
})
|
| 1321 |
+
|
| 1322 |
+
# Verification stats
|
| 1323 |
+
v_summary = self.verifier.summary() if self.verifier else {}
|
| 1324 |
+
|
| 1325 |
+
# Total token costs
|
| 1326 |
+
total_token_cost = sum(self._token_costs.values())
|
| 1327 |
+
event_counts = {}
|
| 1328 |
+
for e in self._protocol_events:
|
| 1329 |
+
t = e.get("type", "UNKNOWN")
|
| 1330 |
+
event_counts[t] = event_counts.get(t, 0) + 1
|
| 1331 |
+
delegation_attempts = sum(1 for r in self._results if r.get("delegation") is not None)
|
| 1332 |
+
delegation_allowed = sum(
|
| 1333 |
+
1 for r in self._results
|
| 1334 |
+
if (r.get("delegation") or {}).get("allowed") is True
|
| 1335 |
+
)
|
| 1336 |
+
circumvention_blocked = event_counts.get("CIRCUMVENTION_BLOCKED", 0)
|
| 1337 |
+
|
| 1338 |
+
# Data quality audit — list agents with unverified robustness dimensions
|
| 1339 |
+
unaudited_agents = [
|
| 1340 |
+
{
|
| 1341 |
+
"model_name": a["model_name"],
|
| 1342 |
+
"audit_source": a["audit_data_source"],
|
| 1343 |
+
"dims_defaulted": a["audit_dims_defaulted"],
|
| 1344 |
+
"tier_name": a["tier_name"],
|
| 1345 |
+
}
|
| 1346 |
+
for a in agents_data
|
| 1347 |
+
if a["audit_dims_defaulted"]
|
| 1348 |
+
]
|
| 1349 |
+
|
| 1350 |
+
self._final_summary = {
|
| 1351 |
+
"economy": {
|
| 1352 |
+
"aggregate_safety": self.economy.aggregate_safety(),
|
| 1353 |
+
"total_rewards_paid": sum(r["total_reward"] for r in self._round_summaries),
|
| 1354 |
+
"total_penalties_collected": sum(r["total_penalty"] for r in self._round_summaries),
|
| 1355 |
+
"total_token_cost_eth": total_token_cost,
|
| 1356 |
+
"usd_to_eth_rate": USD_TO_ETH,
|
| 1357 |
+
"gini_coefficient": gini,
|
| 1358 |
+
"num_rounds": self.config.num_rounds,
|
| 1359 |
+
"num_agents": len(agents_data),
|
| 1360 |
+
"active_agents": len(self.economy.registry.active_agents),
|
| 1361 |
+
"test_eth_topups_total": self._test_eth_topups_total,
|
| 1362 |
+
},
|
| 1363 |
+
"demo_highlights": {
|
| 1364 |
+
"protocol_event_counts": event_counts,
|
| 1365 |
+
"delegation_attempts": delegation_attempts,
|
| 1366 |
+
"delegation_allowed": delegation_allowed,
|
| 1367 |
+
"delegation_blocked": max(0, delegation_attempts - delegation_allowed),
|
| 1368 |
+
"circumvention_blocked": circumvention_blocked,
|
| 1369 |
+
},
|
| 1370 |
+
"tier_distribution": {t.name: c for t, c in tier_dist.items()},
|
| 1371 |
+
"verification": v_summary,
|
| 1372 |
+
"agents": sorted(agents_data, key=lambda a: a["balance"], reverse=True),
|
| 1373 |
+
"safety_trajectory": safety_trajectory,
|
| 1374 |
+
# ---------------------------------------------------------------
|
| 1375 |
+
# Paper note: agents listed here have one or more robustness
|
| 1376 |
+
# dimensions drawn from DEFAULT_ROBUSTNESS rather than verified
|
| 1377 |
+
# framework results. Their tier assignments are estimates, not
|
| 1378 |
+
# certified values. They should be reported separately from
|
| 1379 |
+
# fully-audited agents in any empirical claim about CGAE gating.
|
| 1380 |
+
# ---------------------------------------------------------------
|
| 1381 |
+
"data_quality_warnings": {
|
| 1382 |
+
"num_partially_or_fully_defaulted": len(unaudited_agents),
|
| 1383 |
+
"unaudited_agents": unaudited_agents,
|
| 1384 |
+
},
|
| 1385 |
+
}
|
| 1386 |
+
|
| 1387 |
+
@staticmethod
|
| 1388 |
+
def _compute_gini(values: list[float]) -> float:
|
| 1389 |
+
"""Compute Gini coefficient for a sorted list of values."""
|
| 1390 |
+
n = len(values)
|
| 1391 |
+
if n == 0:
|
| 1392 |
+
return 0.0
|
| 1393 |
+
total = sum(values)
|
| 1394 |
+
if total == 0:
|
| 1395 |
+
return 0.0
|
| 1396 |
+
cumulative = 0.0
|
| 1397 |
+
weighted_sum = 0.0
|
| 1398 |
+
for i, v in enumerate(values):
|
| 1399 |
+
cumulative += v
|
| 1400 |
+
weighted_sum += (2 * (i + 1) - n - 1) * v
|
| 1401 |
+
return weighted_sum / (n * total)
|
| 1402 |
+
|
| 1403 |
+
def save_results(self, path: Optional[str] = None):
|
| 1404 |
+
"""Save all results to disk."""
|
| 1405 |
+
output_dir = Path(path or self.config.output_dir)
|
| 1406 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
| 1407 |
+
|
| 1408 |
+
# Economy state
|
| 1409 |
+
self.economy.export_state(str(output_dir / "economy_state.json"))
|
| 1410 |
+
|
| 1411 |
+
# Full task results
|
| 1412 |
+
(output_dir / "task_results.json").write_text(
|
| 1413 |
+
json.dumps(self._results, indent=2, default=str)
|
| 1414 |
+
)
|
| 1415 |
+
|
| 1416 |
+
# Round summaries
|
| 1417 |
+
(output_dir / "round_summaries.json").write_text(
|
| 1418 |
+
json.dumps(self._round_summaries, indent=2, default=str)
|
| 1419 |
+
)
|
| 1420 |
+
|
| 1421 |
+
# Protocol events for high-signal dashboard alerts
|
| 1422 |
+
(output_dir / "protocol_events.json").write_text(
|
| 1423 |
+
json.dumps(self._protocol_events, indent=2, default=str)
|
| 1424 |
+
)
|
| 1425 |
+
|
| 1426 |
+
# Final summary
|
| 1427 |
+
if self._final_summary:
|
| 1428 |
+
(output_dir / "final_summary.json").write_text(
|
| 1429 |
+
json.dumps(self._final_summary, indent=2, default=str)
|
| 1430 |
+
)
|
| 1431 |
+
|
| 1432 |
+
# Verification summary
|
| 1433 |
+
if self.verifier:
|
| 1434 |
+
(output_dir / "verification_summary.json").write_text(
|
| 1435 |
+
json.dumps(self.verifier.summary(), indent=2)
|
| 1436 |
+
)
|
| 1437 |
+
|
| 1438 |
+
# Per-agent details
|
| 1439 |
+
agent_details = {}
|
| 1440 |
+
for agent_id, model_name in self.agent_model_map.items():
|
| 1441 |
+
record = self.economy.registry.get_agent(agent_id)
|
| 1442 |
+
if record:
|
| 1443 |
+
llm = self.llm_agents.get(model_name)
|
| 1444 |
+
agent_details[model_name] = {
|
| 1445 |
+
**record.to_dict(),
|
| 1446 |
+
"llm_usage": llm.usage_summary() if llm else {},
|
| 1447 |
+
"token_cost_eth": self._token_costs.get(agent_id, 0.0),
|
| 1448 |
+
}
|
| 1449 |
+
(output_dir / "agent_details.json").write_text(
|
| 1450 |
+
json.dumps(agent_details, indent=2, default=str)
|
| 1451 |
+
)
|
| 1452 |
+
|
| 1453 |
+
# Verification log
|
| 1454 |
+
if self.verifier:
|
| 1455 |
+
log_data = [v.to_dict() for v in self.verifier.verification_log]
|
| 1456 |
+
(output_dir / "verification_log.json").write_text(
|
| 1457 |
+
json.dumps(log_data, indent=2, default=str)
|
| 1458 |
+
)
|
| 1459 |
+
|
| 1460 |
+
logger.info(f"Results saved to {output_dir}")
|
| 1461 |
+
|
| 1462 |
+
|
| 1463 |
+
def main():
|
| 1464 |
+
"""Entry point for running the live simulation."""
|
| 1465 |
+
parser = argparse.ArgumentParser(description="Run the CGAE live economy simulation.")
|
| 1466 |
+
parser.add_argument("--live", action="store_true", help="Run in infinite loop mode for dashboard.")
|
| 1467 |
+
parser.add_argument("--rounds", type=int, default=10, help="Number of rounds (ignored if --live is set).")
|
| 1468 |
+
parser.add_argument("--video-demo", action="store_true", help="Run curated 5-min video demo (3 agents, adversarial blocking).")
|
| 1469 |
+
parser.add_argument(
|
| 1470 |
+
"--show-failures",
|
| 1471 |
+
action="store_true",
|
| 1472 |
+
help="Bias live execution toward harder tasks and disable self-check retries.",
|
| 1473 |
+
)
|
| 1474 |
+
args = parser.parse_args()
|
| 1475 |
+
|
| 1476 |
+
logging.basicConfig(
|
| 1477 |
+
level=logging.INFO,
|
| 1478 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 1479 |
+
)
|
| 1480 |
+
|
| 1481 |
+
# Check env vars
|
| 1482 |
+
required_vars = ["AZURE_API_KEY"]
|
| 1483 |
+
optional_vars = ["AZURE_OPENAI_API_ENDPOINT", "DDFT_MODELS_ENDPOINT"]
|
| 1484 |
+
missing = [v for v in required_vars if not os.environ.get(v)]
|
| 1485 |
+
if missing:
|
| 1486 |
+
print(f"ERROR: Missing required environment variables: {missing}")
|
| 1487 |
+
print(f"Optional (for more models): {optional_vars}")
|
| 1488 |
+
print("\nSet them with:")
|
| 1489 |
+
print(" export AZURE_API_KEY=your-key")
|
| 1490 |
+
print(" export AZURE_OPENAI_API_ENDPOINT=https://your-endpoint.openai.azure.com/")
|
| 1491 |
+
print(" export DDFT_MODELS_ENDPOINT=https://your-foundry-endpoint/v1")
|
| 1492 |
+
return
|
| 1493 |
+
|
| 1494 |
+
available = [v for v in optional_vars if os.environ.get(v)]
|
| 1495 |
+
print(f"Endpoints available: {available}")
|
| 1496 |
+
|
| 1497 |
+
# Framework API URLs are read from CDCT_API_URL / DDFT_API_URL / EECT_API_URL
|
| 1498 |
+
# env vars by the clients. Override here if needed.
|
| 1499 |
+
config = LiveSimConfig(
|
| 1500 |
+
num_rounds=-1 if args.live else args.rounds,
|
| 1501 |
+
seed=42,
|
| 1502 |
+
video_demo=args.video_demo,
|
| 1503 |
+
failure_visibility_mode=args.show_failures,
|
| 1504 |
+
)
|
| 1505 |
+
|
| 1506 |
+
runner = LiveSimulationRunner(config)
|
| 1507 |
+
results = runner.run()
|
| 1508 |
+
runner.save_results()
|
| 1509 |
+
|
| 1510 |
+
# Print summary
|
| 1511 |
+
print("\n" + "=" * 60)
|
| 1512 |
+
print("CGAE LIVE ECONOMY - RESULTS")
|
| 1513 |
+
print("=" * 60)
|
| 1514 |
+
|
| 1515 |
+
if runner._final_summary:
|
| 1516 |
+
econ = runner._final_summary["economy"]
|
| 1517 |
+
print(f"\nRounds: {econ['num_rounds']}")
|
| 1518 |
+
print(f"Agents: {econ['num_agents']} ({econ['active_agents']} active)")
|
| 1519 |
+
print(f"Aggregate safety: {econ['aggregate_safety']:.4f}")
|
| 1520 |
+
print(f"Gini coefficient: {econ['gini_coefficient']:.4f}")
|
| 1521 |
+
print(f"Total rewards: {econ['total_rewards_paid']:.4f} ETH")
|
| 1522 |
+
print(f"Total penalties: {econ['total_penalties_collected']:.4f} ETH")
|
| 1523 |
+
print(f"Total token costs: {econ['total_token_cost_eth']:.4f} ETH")
|
| 1524 |
+
highlights = runner._final_summary.get("demo_highlights", {})
|
| 1525 |
+
if highlights:
|
| 1526 |
+
print("\nDemo highlights:")
|
| 1527 |
+
print(f" Circumvention blocked: {highlights.get('circumvention_blocked', 0)}")
|
| 1528 |
+
print(
|
| 1529 |
+
f" Delegation attempts: {highlights.get('delegation_attempts', 0)} "
|
| 1530 |
+
f"(allowed={highlights.get('delegation_allowed', 0)}, "
|
| 1531 |
+
f"blocked={highlights.get('delegation_blocked', 0)})"
|
| 1532 |
+
)
|
| 1533 |
+
|
| 1534 |
+
if runner.verifier:
|
| 1535 |
+
vs = runner.verifier.summary()
|
| 1536 |
+
print(f"\nVerification: {vs.get('total', 0)} tasks")
|
| 1537 |
+
print(f" Algorithmic pass rate: {vs.get('algorithmic_pass_rate', 0):.1%}")
|
| 1538 |
+
if vs.get("jury_pass_rate") is not None:
|
| 1539 |
+
print(f" Jury pass rate: {vs['jury_pass_rate']:.1%}")
|
| 1540 |
+
print(f" Overall pass rate: {vs.get('overall_pass_rate', 0):.1%}")
|
| 1541 |
+
if vs.get("avg_jury_score") is not None:
|
| 1542 |
+
print(f" Avg jury score: {vs['avg_jury_score']:.3f}")
|
| 1543 |
+
|
| 1544 |
+
print("\n--- Agent Leaderboard ---")
|
| 1545 |
+
print(f" {'Model':40s} {'Tier':3s} {'Bal':>8} {'Earned':>8} "
|
| 1546 |
+
f"{'Pen':>7} {'Cost':>7} W/L CC ER AS AuditSrc")
|
| 1547 |
+
if runner._final_summary:
|
| 1548 |
+
for a in runner._final_summary["agents"]:
|
| 1549 |
+
r = a.get("robustness") or {}
|
| 1550 |
+
# Show a short audit source tag; highlight defaulted dimensions
|
| 1551 |
+
src = a.get("audit_data_source", "?")
|
| 1552 |
+
defaulted = a.get("audit_dims_defaulted", [])
|
| 1553 |
+
src_tag = src if not defaulted else f"{src}[def:{','.join(defaulted)}]"
|
| 1554 |
+
print(
|
| 1555 |
+
f" {a['model_name']:40s} | {a['tier_name']:3s} | "
|
| 1556 |
+
f"bal={a['balance']:8.4f} | earned={a['total_earned']:8.4f} | "
|
| 1557 |
+
f"pen={a['total_penalties']:7.4f} | cost={a['token_cost_eth']:7.4f} | "
|
| 1558 |
+
f"W/L={a['contracts_completed']}/{a['contracts_failed']} | "
|
| 1559 |
+
f"CC={r.get('cc', 0):.2f} ER={r.get('er', 0):.2f} AS={r.get('as', 0):.2f} | "
|
| 1560 |
+
f"{src_tag}"
|
| 1561 |
+
)
|
| 1562 |
+
|
| 1563 |
+
dqw = runner._final_summary.get("data_quality_warnings", {})
|
| 1564 |
+
if dqw.get("num_partially_or_fully_defaulted", 0) > 0:
|
| 1565 |
+
print(f"\n *** DATA QUALITY NOTE ***")
|
| 1566 |
+
print(f" {dqw['num_partially_or_fully_defaulted']} agent(s) used assumed (not verified) "
|
| 1567 |
+
f"robustness for one or more dimensions.")
|
| 1568 |
+
print(f" These agents' tier assignments are estimates. See 'data_quality_warnings' "
|
| 1569 |
+
f"in final_summary.json for details.")
|
| 1570 |
+
|
| 1571 |
+
print("\n" + "=" * 60)
|
| 1572 |
+
|
| 1573 |
+
|
| 1574 |
+
if __name__ == "__main__":
|
| 1575 |
+
main()
|