Rohan03 commited on
Commit
f9d84be
·
verified ·
1 Parent(s): 854d13a

v0.2.0: Add purpose_agent/observability.py

Browse files
Files changed (1) hide show
  1. purpose_agent/observability.py +304 -0
purpose_agent/observability.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Observability — Callback hooks, cost tracking, and span tracing.
3
+
4
+ Provides production-grade visibility into agent runs:
5
+ - Token & cost tracking per step, per task, per agent
6
+ - Callback hooks for custom integrations (LangSmith, Arize, custom dashboards)
7
+ - Structured event logging
8
+ - Performance profiling
9
+
10
+ Lightweight — no external dependencies. Integrates with OpenTelemetry-compatible
11
+ systems via the callback interface.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import logging
18
+ import time
19
+ from dataclasses import dataclass, field
20
+ from enum import Enum
21
+ from typing import Any, Callable, Protocol
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ # ---------------------------------------------------------------------------
27
+ # Cost Tracking
28
+ # ---------------------------------------------------------------------------
29
+
30
+ # Approximate cost per 1M tokens (input) for common models
31
+ MODEL_COSTS_PER_1M_INPUT = {
32
+ # Cloud LLMs
33
+ "gpt-4o": 2.50,
34
+ "gpt-4o-mini": 0.15,
35
+ "claude-3-5-sonnet": 3.00,
36
+ "claude-3-5-haiku": 0.80,
37
+ # Cloud SLMs via inference providers
38
+ "qwen/qwen3-32b": 0.20,
39
+ "qwen/qwen3-8b": 0.05,
40
+ "meta-llama/llama-3.1-8b-instruct": 0.05,
41
+ # Local models (electricity cost estimate per 1M tokens)
42
+ "local-gpu": 0.01, # ~$0.01/1M tokens on consumer GPU
43
+ "local-cpu": 0.005, # ~$0.005/1M tokens on CPU
44
+ "ollama": 0.005, # Estimate for local Ollama
45
+ }
46
+
47
+
48
+ @dataclass
49
+ class TokenUsage:
50
+ """Token usage for a single LLM call."""
51
+ prompt_tokens: int = 0
52
+ completion_tokens: int = 0
53
+ total_tokens: int = 0
54
+ model: str = ""
55
+ estimated_cost_usd: float = 0.0
56
+ timestamp: float = field(default_factory=time.time)
57
+
58
+
59
+ @dataclass
60
+ class CostTracker:
61
+ """
62
+ Tracks token usage and estimated costs across all LLM calls.
63
+
64
+ Usage:
65
+ tracker = CostTracker(model_name="qwen3:1.7b", cost_per_1m=0.005)
66
+ tracker.record(prompt_tokens=500, completion_tokens=200)
67
+ print(tracker.summary())
68
+ """
69
+ model_name: str = "unknown"
70
+ cost_per_1m_input: float = 0.01
71
+ cost_per_1m_output: float = 0.02
72
+ calls: list[TokenUsage] = field(default_factory=list)
73
+
74
+ def record(
75
+ self,
76
+ prompt_tokens: int = 0,
77
+ completion_tokens: int = 0,
78
+ model: str | None = None,
79
+ ) -> TokenUsage:
80
+ """Record a single LLM call."""
81
+ total = prompt_tokens + completion_tokens
82
+ cost = (
83
+ prompt_tokens * self.cost_per_1m_input / 1_000_000
84
+ + completion_tokens * self.cost_per_1m_output / 1_000_000
85
+ )
86
+
87
+ usage = TokenUsage(
88
+ prompt_tokens=prompt_tokens,
89
+ completion_tokens=completion_tokens,
90
+ total_tokens=total,
91
+ model=model or self.model_name,
92
+ estimated_cost_usd=cost,
93
+ )
94
+ self.calls.append(usage)
95
+ return usage
96
+
97
+ @property
98
+ def total_tokens(self) -> int:
99
+ return sum(c.total_tokens for c in self.calls)
100
+
101
+ @property
102
+ def total_cost_usd(self) -> float:
103
+ return sum(c.estimated_cost_usd for c in self.calls)
104
+
105
+ @property
106
+ def total_calls(self) -> int:
107
+ return len(self.calls)
108
+
109
+ def summary(self) -> dict[str, Any]:
110
+ return {
111
+ "model": self.model_name,
112
+ "total_calls": self.total_calls,
113
+ "total_tokens": self.total_tokens,
114
+ "prompt_tokens": sum(c.prompt_tokens for c in self.calls),
115
+ "completion_tokens": sum(c.completion_tokens for c in self.calls),
116
+ "estimated_cost_usd": round(self.total_cost_usd, 6),
117
+ }
118
+
119
+ def reset(self):
120
+ self.calls.clear()
121
+
122
+
123
+ # ---------------------------------------------------------------------------
124
+ # Callback System
125
+ # ---------------------------------------------------------------------------
126
+
127
+ class EventType(Enum):
128
+ """Events emitted during agent execution."""
129
+ TASK_START = "task_start"
130
+ TASK_END = "task_end"
131
+ STEP_START = "step_start"
132
+ STEP_END = "step_end"
133
+ ACTION_DECIDED = "action_decided"
134
+ TOOL_CALLED = "tool_called"
135
+ TOOL_RESULT = "tool_result"
136
+ STATE_EVALUATED = "state_evaluated"
137
+ LLM_CALL_START = "llm_call_start"
138
+ LLM_CALL_END = "llm_call_end"
139
+ HEURISTIC_LEARNED = "heuristic_learned"
140
+ MEMORY_UPDATED = "memory_updated"
141
+ OPTIMIZATION_START = "optimization_start"
142
+ OPTIMIZATION_END = "optimization_end"
143
+ ERROR = "error"
144
+ CHECKPOINT = "checkpoint"
145
+ HUMAN_INPUT_REQUESTED = "human_input_requested"
146
+ HUMAN_INPUT_RECEIVED = "human_input_received"
147
+
148
+
149
+ @dataclass
150
+ class AgentEvent:
151
+ """A structured event emitted during agent execution."""
152
+ event_type: EventType
153
+ data: dict[str, Any] = field(default_factory=dict)
154
+ step: int = 0
155
+ task_id: str = ""
156
+ agent_id: str = ""
157
+ timestamp: float = field(default_factory=time.time)
158
+ duration_s: float = 0.0
159
+
160
+ def to_dict(self) -> dict[str, Any]:
161
+ return {
162
+ "event": self.event_type.value,
163
+ "data": self.data,
164
+ "step": self.step,
165
+ "task_id": self.task_id,
166
+ "agent_id": self.agent_id,
167
+ "timestamp": self.timestamp,
168
+ "duration_s": self.duration_s,
169
+ }
170
+
171
+ def to_json(self) -> str:
172
+ return json.dumps(self.to_dict(), default=str)
173
+
174
+
175
+ class AgentCallback(Protocol):
176
+ """Protocol for agent callbacks. Implement this to integrate with external systems."""
177
+
178
+ def on_event(self, event: AgentEvent) -> None:
179
+ """Called when an event occurs during agent execution."""
180
+ ...
181
+
182
+
183
+ class LoggingCallback:
184
+ """Simple callback that logs all events."""
185
+
186
+ def __init__(self, level: int = logging.INFO):
187
+ self.level = level
188
+ self.events: list[AgentEvent] = []
189
+
190
+ def on_event(self, event: AgentEvent) -> None:
191
+ self.events.append(event)
192
+ logger.log(
193
+ self.level,
194
+ f"[{event.event_type.value}] step={event.step} "
195
+ f"task={event.task_id} {json.dumps(event.data, default=str)[:200]}",
196
+ )
197
+
198
+
199
+ class JSONFileCallback:
200
+ """Callback that writes events to a JSON Lines file."""
201
+
202
+ def __init__(self, path: str):
203
+ self.path = path
204
+
205
+ def on_event(self, event: AgentEvent) -> None:
206
+ with open(self.path, "a") as f:
207
+ f.write(event.to_json() + "\n")
208
+
209
+
210
+ class MetricsCollector:
211
+ """
212
+ Callback that collects aggregate metrics for analysis.
213
+
214
+ Usage:
215
+ collector = MetricsCollector()
216
+ # ... run tasks with collector as callback ...
217
+ print(collector.summary())
218
+ """
219
+
220
+ def __init__(self):
221
+ self.tasks: list[dict] = []
222
+ self.steps: list[dict] = []
223
+ self.llm_calls: list[dict] = []
224
+ self.errors: list[dict] = []
225
+ self._current_task: dict = {}
226
+ self._step_start: float = 0
227
+
228
+ def on_event(self, event: AgentEvent) -> None:
229
+ if event.event_type == EventType.TASK_START:
230
+ self._current_task = {"task_id": event.task_id, "start": event.timestamp, "steps": 0}
231
+ elif event.event_type == EventType.TASK_END:
232
+ self._current_task["end"] = event.timestamp
233
+ self._current_task["duration_s"] = event.timestamp - self._current_task.get("start", event.timestamp)
234
+ self._current_task.update(event.data)
235
+ self.tasks.append(self._current_task)
236
+ elif event.event_type == EventType.STEP_START:
237
+ self._step_start = event.timestamp
238
+ if self._current_task:
239
+ self._current_task["steps"] = self._current_task.get("steps", 0) + 1
240
+ elif event.event_type == EventType.STATE_EVALUATED:
241
+ self.steps.append({
242
+ "step": event.step,
243
+ "task_id": event.task_id,
244
+ "duration_s": event.timestamp - self._step_start if self._step_start else 0,
245
+ **event.data,
246
+ })
247
+ elif event.event_type == EventType.LLM_CALL_END:
248
+ self.llm_calls.append(event.data)
249
+ elif event.event_type == EventType.ERROR:
250
+ self.errors.append({"step": event.step, **event.data})
251
+
252
+ def summary(self) -> dict[str, Any]:
253
+ if not self.tasks:
254
+ return {"tasks": 0}
255
+
256
+ success_count = sum(1 for t in self.tasks if t.get("success_rate", 0) > 0.5)
257
+ total_steps = sum(t.get("steps", 0) for t in self.tasks)
258
+ total_duration = sum(t.get("duration_s", 0) for t in self.tasks)
259
+ avg_phi_deltas = [s.get("delta", 0) for s in self.steps if "delta" in s]
260
+
261
+ return {
262
+ "total_tasks": len(self.tasks),
263
+ "successful_tasks": success_count,
264
+ "success_rate": success_count / len(self.tasks) if self.tasks else 0,
265
+ "total_steps": total_steps,
266
+ "avg_steps_per_task": total_steps / len(self.tasks),
267
+ "total_duration_s": round(total_duration, 2),
268
+ "avg_duration_per_task_s": round(total_duration / len(self.tasks), 2),
269
+ "total_llm_calls": len(self.llm_calls),
270
+ "total_errors": len(self.errors),
271
+ "avg_phi_delta": round(sum(avg_phi_deltas) / len(avg_phi_deltas), 3) if avg_phi_deltas else 0,
272
+ }
273
+
274
+
275
+ # ---------------------------------------------------------------------------
276
+ # Callback Manager — dispatches events to multiple callbacks
277
+ # ---------------------------------------------------------------------------
278
+
279
+ class CallbackManager:
280
+ """
281
+ Manages multiple callbacks and dispatches events to all of them.
282
+
283
+ Usage:
284
+ mgr = CallbackManager()
285
+ mgr.add(LoggingCallback())
286
+ mgr.add(MetricsCollector())
287
+ mgr.add(JSONFileCallback("events.jsonl"))
288
+
289
+ mgr.emit(AgentEvent(EventType.TASK_START, data={"purpose": "..."}))
290
+ """
291
+
292
+ def __init__(self, callbacks: list | None = None):
293
+ self.callbacks: list = callbacks or []
294
+
295
+ def add(self, callback) -> "CallbackManager":
296
+ self.callbacks.append(callback)
297
+ return self
298
+
299
+ def emit(self, event: AgentEvent) -> None:
300
+ for cb in self.callbacks:
301
+ try:
302
+ cb.on_event(event)
303
+ except Exception as e:
304
+ logger.warning(f"Callback {type(cb).__name__} failed: {e}")