# Rohan03's picture
# Sprint 9A: fingerprint.py — capability fingerprint from execution traces
# 130c63c verified
"""
fingerprint.py — Capability fingerprinting from execution traces.
Extracts a structured profile of what an agent CAN do based on what it HAS done:
- Domains used (coding, research, data, etc.)
- Tool motifs (common tool sequences)
- Reasoning patterns (plan→execute, trial→error→fix)
- Output schemas observed
- Failure modes (what breaks, how often)
- Latency/cost profile
"""
from __future__ import annotations
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from typing import Any
from purpose_agent.trace import Trace
@dataclass
class CapabilityFingerprint:
    """Aggregate capability profile built from a collection of traces.

    Every field has a default, so an empty fingerprint can be created with
    no arguments and populated afterwards.
    """

    # domain → count
    domains: dict[str, int] = field(default_factory=dict)
    # common tool sequences
    tool_motifs: list[tuple[str, ...]] = field(default_factory=list)
    # pattern → count
    action_patterns: dict[str, int] = field(default_factory=dict)
    # error_type → count
    failure_modes: dict[str, int] = field(default_factory=dict)
    avg_steps_per_task: float = 0.0
    avg_duration_s: float = 0.0
    total_traces: int = 0
    success_rate: float = 0.0
    # tool → usage count
    tool_usage: dict[str, int] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the fingerprint as a plain dictionary.

        At most the first 10 tool motifs (converted from tuples to lists)
        and the first 20 action patterns (in the mapping's own order) are
        included. Averages are rounded to one decimal place and the success
        rate to three. Note that the average step count is exposed under
        the key ``"avg_steps"``.
        """
        motif_lists = [list(motif) for motif in self.tool_motifs[:10]]
        leading_patterns = list(self.action_patterns.items())[:20]

        payload: dict[str, Any] = {}
        payload["domains"] = self.domains
        payload["tool_motifs"] = motif_lists
        payload["action_patterns"] = dict(leading_patterns)
        payload["failure_modes"] = self.failure_modes
        payload["avg_steps"] = round(self.avg_steps_per_task, 1)
        payload["avg_duration_s"] = round(self.avg_duration_s, 1)
        payload["total_traces"] = self.total_traces
        payload["success_rate"] = round(self.success_rate, 3)
        payload["tool_usage"] = self.tool_usage
        return payload

    def summary(self) -> str:
        """Return a five-line, human-readable overview of the fingerprint.

        Domains and tools are listed as ``name(count)``, highest count
        first, five entries at most. Failures show the first three entries
        of ``failure_modes`` in the mapping's own order.
        """

        def _top_five(counts: dict[str, int]) -> str:
            # Highest count first; the sort is stable, so ties keep the
            # mapping's insertion order.
            ranked = sorted(counts.items(), key=lambda pair: -pair[1])
            return ", ".join(f"{name}({count})" for name, count in ranked[:5])

        first_failures = dict(list(self.failure_modes.items())[:3])
        lines = [
            f"Fingerprint: {self.total_traces} traces, {self.success_rate:.0%} success",
            f" Domains: {_top_five(self.domains)}",
            f" Tools: {_top_five(self.tool_usage)}",
            f" Avg steps: {self.avg_steps_per_task:.1f}, Avg time: {self.avg_duration_s:.1f}s",
            f" Failures: {first_failures}",
        ]
        return "\n".join(lines)
# Keyword vocabulary used for domain classification: maps each domain name to
# the set of lowercase words that indicate it.
_DOMAIN_KEYWORDS = {
    domain: set(words.split())
    for domain, words in (
        ("coding", "code function python debug script class def import"),
        ("research", "research paper find search study analyze"),
        ("data", "data csv database sql statistics chart"),
        ("writing", "write blog article essay content draft"),
        ("operations", "deploy monitor server docker pipeline"),
        ("security", "security vulnerability cve audit threat"),
    )
}
def fingerprint_traces(traces: list[Trace]) -> CapabilityFingerprint:
    """
    Analyze a collection of traces and produce a capability fingerprint.

    For every trace this reads ``duration_s``, ``step_count``, ``purpose``
    and ``events``; for every event it reads ``kind`` and ``data`` (used as
    a mapping via ``.get``).

    Aggregation rules:
    - Domains: a trace counts once for each domain whose keyword set shares
      a word with its lowercased purpose. Surrounding punctuation is
      stripped from words, so "code." still matches "code".
    - Tools: each "tool_started" / "tool.started" event increments the
      tool's usage count and extends the trace's tool sequence.
    - Actions: "action" / "agent.progress" events with a non-empty name.
    - Failures: any event whose kind contains "error", keyed by its error
      label (stringified and truncated to 50 characters).
    - Success: a trace is successful when any "run.finished" /
      "run_finished" event has a truthy "success" or a "phi" of at least 7.
      Each trace is counted at most once, so the rate never exceeds 1.0.
    - Motifs: the 10 most frequent contiguous tool 2-grams and 3-grams.

    Args:
        traces: Traces to analyze; may be empty.

    Returns:
        A populated CapabilityFingerprint. For empty input only
        ``total_traces`` (0) is set and all other fields keep defaults.

    Usage:
        traces = Trace.load_many("./traces/")
        fp = fingerprint_traces(traces)
        print(fp.summary())
    """
    import string  # local import: only needed for punctuation stripping

    fp = CapabilityFingerprint(total_traces=len(traces))
    if not traces:
        return fp

    total_steps = 0
    total_duration = 0.0
    successes = 0
    tool_sequences: list[list[str]] = []
    action_counter: Counter = Counter()
    failure_counter: Counter = Counter()
    tool_counter: Counter = Counter()
    domain_counter: Counter = Counter()

    for trace in traces:
        total_duration += trace.duration_s
        total_steps += trace.step_count

        # Domain classification from purpose. Stripping punctuation only
        # ever adds matches: keywords contain none, so every word that
        # matched before stripping still matches afterwards.
        purpose_words = {
            word.strip(string.punctuation)
            for word in (trace.purpose or "").lower().split()
        }
        for domain, keywords in _DOMAIN_KEYWORDS.items():
            if purpose_words & keywords:
                domain_counter[domain] += 1

        current_tool_seq: list[str] = []
        trace_succeeded = False
        for event in trace.events:
            kind = event.kind
            data = event.data

            # Tool usage
            if kind in ("tool_started", "tool.started"):
                tool_name = data.get("name", data.get("tool", "unknown"))
                tool_counter[tool_name] += 1
                current_tool_seq.append(tool_name)

            # Actions
            if kind in ("action", "agent.progress"):
                action_name = data.get("name", data.get("action", ""))
                if action_name:
                    action_counter[action_name] += 1

            # Failures: the label may be None or a non-string (e.g. an
            # exception object), so normalise before slicing and hashing.
            if "error" in kind:
                raw_error = data.get("error_type", data.get("error", "unknown"))
                if raw_error is None:
                    raw_error = "unknown"
                failure_counter[str(raw_error)[:50]] += 1

            # Success detection: remember it per trace instead of counting
            # per event, so repeated finish events cannot inflate the rate.
            if kind in ("run.finished", "run_finished"):
                phi = data.get("phi") or 0  # tolerate an explicit None
                if data.get("success") or phi >= 7:
                    trace_succeeded = True

        if trace_succeeded:
            successes += 1
        if current_tool_seq:
            tool_sequences.append(current_tool_seq)

    # Compile fingerprint (traces is non-empty here, so division is safe).
    trace_count = len(traces)
    fp.domains = dict(domain_counter.most_common(10))
    fp.tool_usage = dict(tool_counter.most_common(20))
    fp.action_patterns = dict(action_counter.most_common(20))
    fp.failure_modes = dict(failure_counter.most_common(10))
    fp.avg_steps_per_task = total_steps / trace_count
    fp.avg_duration_s = total_duration / trace_count
    fp.success_rate = successes / trace_count

    # Extract tool motifs: contiguous 2-grams, then 3-grams, per sequence.
    motif_counter: Counter = Counter()
    for seq in tool_sequences:
        for size in (2, 3):
            for start in range(len(seq) - size + 1):
                motif_counter[tuple(seq[start:start + size])] += 1
    fp.tool_motifs = [motif for motif, _ in motif_counter.most_common(10)]
    return fp