""" fingerprint.py — Capability fingerprinting from execution traces. Extracts a structured profile of what an agent CAN do based on what it HAS done: - Domains used (coding, research, data, etc.) - Tool motifs (common tool sequences) - Reasoning patterns (plan→execute, trial→error→fix) - Output schemas observed - Failure modes (what breaks, how often) - Latency/cost profile """ from __future__ import annotations from collections import Counter, defaultdict from dataclasses import dataclass, field from typing import Any from purpose_agent.trace import Trace @dataclass class CapabilityFingerprint: """Structured capability profile derived from traces.""" domains: dict[str, int] = field(default_factory=dict) # domain → count tool_motifs: list[tuple[str, ...]] = field(default_factory=list) # common tool sequences action_patterns: dict[str, int] = field(default_factory=dict) # pattern → count failure_modes: dict[str, int] = field(default_factory=dict) # error_type → count avg_steps_per_task: float = 0.0 avg_duration_s: float = 0.0 total_traces: int = 0 success_rate: float = 0.0 tool_usage: dict[str, int] = field(default_factory=dict) # tool → usage count def to_dict(self) -> dict[str, Any]: return { "domains": self.domains, "tool_motifs": [list(m) for m in self.tool_motifs[:10]], "action_patterns": dict(list(self.action_patterns.items())[:20]), "failure_modes": self.failure_modes, "avg_steps": round(self.avg_steps_per_task, 1), "avg_duration_s": round(self.avg_duration_s, 1), "total_traces": self.total_traces, "success_rate": round(self.success_rate, 3), "tool_usage": self.tool_usage, } def summary(self) -> str: top_domains = sorted(self.domains.items(), key=lambda x: -x[1])[:5] top_tools = sorted(self.tool_usage.items(), key=lambda x: -x[1])[:5] return ( f"Fingerprint: {self.total_traces} traces, {self.success_rate:.0%} success\n" f" Domains: {', '.join(f'{d}({c})' for d,c in top_domains)}\n" f" Tools: {', '.join(f'{t}({c})' for t,c in top_tools)}\n" f" Avg steps: {self.avg_steps_per_task:.1f}, Avg time: {self.avg_duration_s:.1f}s\n" f" Failures: {dict(list(self.failure_modes.items())[:3])}" ) # Domain classification keywords _DOMAIN_KEYWORDS = { "coding": {"code", "function", "python", "debug", "script", "class", "def", "import"}, "research": {"research", "paper", "find", "search", "study", "analyze"}, "data": {"data", "csv", "database", "sql", "statistics", "chart"}, "writing": {"write", "blog", "article", "essay", "content", "draft"}, "operations": {"deploy", "monitor", "server", "docker", "pipeline"}, "security": {"security", "vulnerability", "cve", "audit", "threat"}, } def fingerprint_traces(traces: list[Trace]) -> CapabilityFingerprint: """ Analyze a collection of traces and produce a capability fingerprint. Usage: traces = Trace.load_many("./traces/") fp = fingerprint_traces(traces) print(fp.summary()) """ fp = CapabilityFingerprint(total_traces=len(traces)) if not traces: return fp total_steps = 0 total_duration = 0.0 successes = 0 tool_sequences: list[list[str]] = [] action_counter: Counter = Counter() failure_counter: Counter = Counter() tool_counter: Counter = Counter() domain_counter: Counter = Counter() for trace in traces: # Duration total_duration += trace.duration_s total_steps += trace.step_count # Domain classification from purpose purpose_words = set(trace.purpose.lower().split()) for domain, keywords in _DOMAIN_KEYWORDS.items(): if purpose_words & keywords: domain_counter[domain] += 1 # Analyze events current_tool_seq = [] for event in trace.events: kind = event.kind # Tool usage if kind == "tool_started" or kind == "tool.started": tool_name = event.data.get("name", event.data.get("tool", "unknown")) tool_counter[tool_name] += 1 current_tool_seq.append(tool_name) # Actions if kind == "action" or kind == "agent.progress": action_name = event.data.get("name", event.data.get("action", "")) if action_name: action_counter[action_name] += 1 # Failures if "error" in kind: error_type = event.data.get("error_type", event.data.get("error", "unknown"))[:50] failure_counter[error_type] += 1 # Success detection if kind in ("run.finished", "run_finished"): if event.data.get("success") or event.data.get("phi", 0) >= 7: successes += 1 if current_tool_seq: tool_sequences.append(current_tool_seq) # Compile fingerprint fp.domains = dict(domain_counter.most_common(10)) fp.tool_usage = dict(tool_counter.most_common(20)) fp.action_patterns = dict(action_counter.most_common(20)) fp.failure_modes = dict(failure_counter.most_common(10)) fp.avg_steps_per_task = total_steps / len(traces) if traces else 0 fp.avg_duration_s = total_duration / len(traces) if traces else 0 fp.success_rate = successes / len(traces) if traces else 0 # Extract tool motifs (common subsequences) motif_counter: Counter = Counter() for seq in tool_sequences: for i in range(len(seq) - 1): motif_counter[tuple(seq[i:i+2])] += 1 for i in range(len(seq) - 2): motif_counter[tuple(seq[i:i+3])] += 1 fp.tool_motifs = [m for m, _ in motif_counter.most_common(10)] return fp