"""
fingerprint.py — Capability fingerprinting from execution traces.
Extracts a structured profile of what an agent CAN do based on what it HAS done:
- Domains used (coding, research, data, etc.)
- Tool motifs (common tool sequences)
- Reasoning patterns (plan→execute, trial→error→fix)
- Output schemas observed
- Failure modes (what breaks, how often)
- Latency/cost profile
"""
from __future__ import annotations
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from typing import Any
from purpose_agent.trace import Trace
@dataclass
class CapabilityFingerprint:
    """Structured capability profile derived from traces.

    Every field has a default, so an empty fingerprint can be created with no
    arguments and populated afterwards.
    """

    domains: dict[str, int] = field(default_factory=dict)  # domain → count
    tool_motifs: list[tuple[str, ...]] = field(default_factory=list)  # common tool sequences
    action_patterns: dict[str, int] = field(default_factory=dict)  # pattern → count
    failure_modes: dict[str, int] = field(default_factory=dict)  # error_type → count
    avg_steps_per_task: float = 0.0
    avg_duration_s: float = 0.0
    total_traces: int = 0
    success_rate: float = 0.0
    tool_usage: dict[str, int] = field(default_factory=dict)  # tool → usage count

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict snapshot of the fingerprint.

        Motifs are capped at the first 10 and action patterns at the first 20
        entries; averages are rounded to 1 decimal place and the success rate
        to 3. Note that the average step count is exported under the key
        ``"avg_steps"``, not ``"avg_steps_per_task"``.

        Returns:
            A new dict whose mapping values are copies, so mutating the
            result does not modify this fingerprint.
        """
        return {
            "domains": dict(self.domains),
            "tool_motifs": [list(m) for m in self.tool_motifs[:10]],
            "action_patterns": dict(list(self.action_patterns.items())[:20]),
            "failure_modes": dict(self.failure_modes),
            "avg_steps": round(self.avg_steps_per_task, 1),
            "avg_duration_s": round(self.avg_duration_s, 1),
            "total_traces": self.total_traces,
            "success_rate": round(self.success_rate, 3),
            "tool_usage": dict(self.tool_usage),
        }

    def summary(self) -> str:
        """Return a five-line human-readable summary.

        Shows the 5 most frequent domains, the 5 most used tools and the
        3 most frequent failure modes, each ranked by descending count.
        """

        def ranked(counts: dict[str, int], limit: int) -> list[tuple[str, int]]:
            # sorted() is stable, so equal counts keep their insertion order.
            return sorted(counts.items(), key=lambda item: -item[1])[:limit]

        domain_text = ", ".join(f"{d}({c})" for d, c in ranked(self.domains, 5))
        tool_text = ", ".join(f"{t}({c})" for t, c in ranked(self.tool_usage, 5))
        # Rank failures by count like the other sections instead of showing
        # whichever three happened to be inserted first.
        top_failures = dict(ranked(self.failure_modes, 3))
        return (
            f"Fingerprint: {self.total_traces} traces, {self.success_rate:.0%} success\n"
            f" Domains: {domain_text}\n"
            f" Tools: {tool_text}\n"
            f" Avg steps: {self.avg_steps_per_task:.1f}, Avg time: {self.avg_duration_s:.1f}s\n"
            f" Failures: {top_failures}"
        )
# Domain → set of lowercase keywords that mark a purpose as belonging to it.
# Key order is significant: it is the order in which domains are checked.
_DOMAIN_KEYWORDS: dict[str, set[str]] = dict(
    coding={
        "code", "function", "python", "debug",
        "script", "class", "def", "import",
    },
    research={
        "research", "paper", "find",
        "search", "study", "analyze",
    },
    data={
        "data", "csv", "database",
        "sql", "statistics", "chart",
    },
    writing={
        "write", "blog", "article",
        "essay", "content", "draft",
    },
    operations={
        "deploy", "monitor", "server",
        "docker", "pipeline",
    },
    security={
        "security", "vulnerability", "cve",
        "audit", "threat",
    },
)
def fingerprint_traces(traces: list[Trace]) -> CapabilityFingerprint:
    """
    Analyze a collection of traces and produce a capability fingerprint.

    Reads ``duration_s``, ``step_count``, ``purpose`` and ``events`` from each
    trace, and ``kind`` / ``data`` from each event.

    Args:
        traces: Traces to aggregate. An empty list yields a fingerprint whose
            fields are all at their defaults (``total_traces`` is 0).

    Returns:
        A CapabilityFingerprint holding the 10 most common domains, 20 most
        used tools, 20 most common actions, 10 most common failure modes,
        per-trace averages for steps and duration, the fraction of traces
        that succeeded, and the 10 most frequent tool bigrams/trigrams.

    Usage:
        traces = Trace.load_many("./traces/")
        fp = fingerprint_traces(traces)
        print(fp.summary())
    """
    fp = CapabilityFingerprint(total_traces=len(traces))
    if not traces:
        return fp

    # Characters trimmed from both ends of each purpose word so that
    # "script." or "(code)" still match their keyword.
    edge_punctuation = ".,;:!?\"'()[]{}<>"

    total_steps = 0
    total_duration = 0.0
    successes = 0
    tool_sequences: list[list[str]] = []
    action_counter: Counter = Counter()
    failure_counter: Counter = Counter()
    tool_counter: Counter = Counter()
    domain_counter: Counter = Counter()

    for trace in traces:
        total_duration += trace.duration_s
        total_steps += trace.step_count

        # Domain classification from purpose: a trace counts once toward
        # every domain with which it shares at least one keyword.
        purpose_words = {
            word.strip(edge_punctuation)
            for word in (trace.purpose or "").lower().split()
        }
        for domain, keywords in _DOMAIN_KEYWORDS.items():
            if purpose_words & keywords:
                domain_counter[domain] += 1

        current_tool_seq: list[str] = []
        trace_succeeded = False
        for event in trace.events:
            kind = event.kind
            data = event.data or {}  # tolerate events that carry no payload

            # Tool usage
            if kind in ("tool_started", "tool.started"):
                tool_name = data.get("name", data.get("tool", "unknown"))
                tool_counter[tool_name] += 1
                current_tool_seq.append(tool_name)

            # Actions
            if kind in ("action", "agent.progress"):
                action_name = data.get("name", data.get("action", ""))
                if action_name:
                    action_counter[action_name] += 1

            # Failures: any event whose kind mentions "error".
            if "error" in kind:
                raw_error = data.get("error_type") or data.get("error") or "unknown"
                # The payload may be an exception or other non-string value;
                # coerce before truncating so the slice cannot raise.
                failure_counter[str(raw_error)[:50]] += 1

            # Success detection
            if kind in ("run.finished", "run_finished"):
                phi = data.get("phi", 0)
                # phi >= 7 is treated as success; the scale of phi is not
                # visible from this function. Non-numeric phi is ignored.
                if data.get("success") or (
                    isinstance(phi, (int, float)) and phi >= 7
                ):
                    trace_succeeded = True

        # Count each trace at most once so success_rate stays within [0, 1]
        # even when a trace contains several finish events.
        if trace_succeeded:
            successes += 1
        if current_tool_seq:
            tool_sequences.append(current_tool_seq)

    # Compile fingerprint
    trace_count = len(traces)  # non-zero: the empty case returned above
    fp.domains = dict(domain_counter.most_common(10))
    fp.tool_usage = dict(tool_counter.most_common(20))
    fp.action_patterns = dict(action_counter.most_common(20))
    fp.failure_modes = dict(failure_counter.most_common(10))
    fp.avg_steps_per_task = total_steps / trace_count
    fp.avg_duration_s = total_duration / trace_count
    fp.success_rate = successes / trace_count

    # Extract tool motifs: consecutive pairs and triples of tool calls
    # within a single trace, ranked by how often they occur overall.
    motif_counter: Counter = Counter()
    for seq in tool_sequences:
        motif_counter.update(zip(seq, seq[1:]))
        motif_counter.update(zip(seq, seq[1:], seq[2:]))
    fp.tool_motifs = [motif for motif, _ in motif_counter.most_common(10)]
    return fp
|