# purpose_agent/prompt_optimizer.py
# (origin: Rohan03/purpose-agent, commit 6dd7984, "V2 merge")
"""
prompt_optimizer.py — DSPy-style automatic prompt optimization.
From DSPy (arxiv:2310.03714):
Instead of hand-crafting prompts, define signatures (input → output)
and let the optimizer bootstrap effective demonstrations automatically.
Adaptation for Purpose Agent:
1. Define a Signature: e.g., "state, action, purpose → phi_score, reasoning"
2. Collect demonstration traces from successful runs
3. The optimizer selects the best N demonstrations by trial scoring
4. These demonstrations are injected into the prompt as few-shot examples
5. Periodically re-optimize as more traces become available
No weight updates — improvement comes from better few-shot examples
in the prompt, selected via a metric (accuracy on held-out examples).
"""
from __future__ import annotations
import json
import logging
import random
from dataclasses import dataclass, field
from typing import Any, Callable
from purpose_agent.llm_backend import LLMBackend, ChatMessage
from purpose_agent.trace import Trace
logger = logging.getLogger(__name__)
@dataclass
class Signature:
    """
    DSPy-style signature: declares what a prompt should do.
    Example:
        sig = Signature(
            name="state_evaluator",
            inputs=["state_before", "action", "state_after", "purpose"],
            outputs=["phi_score", "reasoning", "evidence"],
            instruction="Evaluate the state transition and score progress toward the purpose.",
        )
    """
    # Identifier for this signature (appears in optimizer log messages
    # and is passed to metric_fn as metadata).
    name: str
    # Field names the prompt receives as input.
    inputs: list[str]
    # Field names the prompt is expected to produce.
    outputs: list[str]
    # Natural-language task description rendered as the "## Task" section.
    instruction: str = ""
@dataclass
class Demonstration:
    """A single input→output example for few-shot prompting."""
    # Input field name -> stringified value (may be "" when absent in the trace).
    inputs: dict[str, str]
    # Output field name -> stringified value.
    outputs: dict[str, str]
    score: float = 0.0  # how good this demo is at improving task performance
class PromptOptimizer:
    """
    Automatically optimizes prompts by bootstrapping demonstrations.

    The DSPy approach adapted for Purpose Agent:
      1. Collect candidate demonstrations from traces
      2. Score each candidate by running it as a few-shot example and
         measuring output quality
      3. Select the top-K demonstrations
      4. Return an optimized prompt with the best demonstrations

    Usage:
        optimizer = PromptOptimizer(llm=model)

        # Define what the prompt should do
        sig = Signature(
            name="actor",
            inputs=["state", "purpose"],
            outputs=["thought", "action"],
            instruction="Decide the best next action.",
        )

        # Collect demonstrations from traces
        demos = optimizer.extract_demonstrations(traces, sig)

        # Optimize: find the best subset
        best = optimizer.optimize(sig, demos, metric_fn=my_metric, k=3)

        # Get the optimized prompt
        prompt = optimizer.compile_prompt(sig, best)
    """

    def __init__(self, llm: LLMBackend | None = None):
        """
        Args:
            llm: Optional LLM backend.
        """
        # NOTE(review): self.llm is not used by any method in this class —
        # presumably reserved for metric evaluation; confirm before removing.
        self.llm = llm

    def extract_demonstrations(
        self,
        traces: list[Trace],
        signature: Signature,
        max_demos: int = 50,
    ) -> list[Demonstration]:
        """
        Extract candidate demonstrations from traces.

        Scans every event of every trace and keeps events whose data
        contains at least one of the signature's output fields.  Input
        fields missing from the event default to "" so the demo can
        still be rendered.

        Args:
            traces: Traces to mine for examples.
            signature: Declares which input/output fields to look for.
            max_demos: Hard cap on the number of candidates returned.

        Returns:
            At most ``max_demos`` candidate demonstrations.
        """
        demos: list[Demonstration] = []
        # Flatten traces into one event stream so the max_demos cap stops
        # the scan everywhere (a per-trace break could previously exceed it).
        for event in (ev for tr in traces for ev in tr.events):
            data = event.data
            # Keep any event that produced at least one output field;
            # inputs are best-effort.
            if not any(f in data for f in signature.outputs):
                continue
            demos.append(
                Demonstration(
                    inputs={f: str(data.get(f, "")) for f in signature.inputs},
                    outputs={f: str(data.get(f, "")) for f in signature.outputs},
                )
            )
            if len(demos) >= max_demos:
                break
        logger.info(
            "PromptOptimizer: Extracted %d candidate demonstrations for '%s'",
            len(demos),
            signature.name,
        )
        return demos

    def optimize(
        self,
        signature: Signature,
        candidates: list[Demonstration],
        metric_fn: Callable[[str, dict], float] | None = None,
        k: int = 3,
        trials: int = 10,
    ) -> list[Demonstration]:
        """
        Select the best K demonstrations by trial-and-error.

        If ``metric_fn`` is provided it is called as
        ``metric_fn(prompt, {"signature": name})`` and must return a float
        (higher is better).  Otherwise a diversity heuristic is used
        (varied examples > similar ones).

        Args:
            signature: The task the prompt is optimized for.
            candidates: Candidate demonstrations to choose from.
            metric_fn: Optional scorer for a compiled prompt.
            k: Number of demonstrations to select.
            trials: Number of random subsets to evaluate.

        Returns:
            The best subset found; all candidates if there are <= k.
        """
        if len(candidates) <= k:
            return candidates
        if metric_fn is None:
            # Diversity-based selection: pick demos with different output patterns
            return self._diverse_select(candidates, k)
        # Trial-based optimization: sample subsets and score them.
        best_subset = candidates[:k]
        best_score = -float("inf")
        for trial in range(trials):
            # len(candidates) > k is guaranteed by the guard above.
            subset = random.sample(candidates, k)
            prompt = self.compile_prompt(signature, subset)
            try:
                score = metric_fn(prompt, {"signature": signature.name})
            except Exception:
                # A failing metric must not abort optimization; score the
                # trial as worthless and continue.
                logger.debug(
                    "PromptOptimizer: metric_fn raised on trial %d",
                    trial + 1,
                    exc_info=True,
                )
                score = 0.0
            if score > best_score:
                best_score = score
                best_subset = subset
                logger.debug(
                    "PromptOptimizer: Trial %d new best score=%.3f",
                    trial + 1,
                    score,
                )
        # Record the winning score on the selected demos for later inspection.
        for demo in best_subset:
            demo.score = best_score
        logger.info(
            "PromptOptimizer: Selected %d demos (best_score=%.3f)",
            len(best_subset),
            best_score,
        )
        return best_subset

    def compile_prompt(
        self,
        signature: Signature,
        demonstrations: list[Demonstration],
        *,
        max_examples: int = 5,
        max_field_len: int = 150,
    ) -> str:
        """
        Compile a signature + demonstrations into a ready-to-use prompt.

        Args:
            signature: Task declaration (instruction + I/O fields).
            demonstrations: Few-shot examples to render.
            max_examples: Render at most this many demonstrations.
            max_field_len: Truncate each field value to this many characters.

        Returns:
            The optimized system prompt string.
        """
        sections = []
        # Instruction
        if signature.instruction:
            sections.append(f"## Task\n{signature.instruction}")
        # Input/output format
        input_desc = ", ".join(signature.inputs)
        output_desc = ", ".join(signature.outputs)
        sections.append(f"## Format\nGiven: {input_desc}\nProduce: {output_desc}")
        # Demonstrations: skip empty fields, truncate long values to keep
        # the prompt compact.
        if demonstrations:
            sections.append("## Examples")
            for i, demo in enumerate(demonstrations[:max_examples], 1):
                lines = [f"### Example {i}"]
                for field_name, value in demo.inputs.items():
                    if value:
                        lines.append(f" {field_name}: {value[:max_field_len]}")
                lines.append(" →")
                for field_name, value in demo.outputs.items():
                    if value:
                        lines.append(f" {field_name}: {value[:max_field_len]}")
                sections.append("\n".join(lines))
        return "\n\n".join(sections)

    def _diverse_select(
        self, candidates: list[Demonstration], k: int
    ) -> list[Demonstration]:
        """
        Select up to ``k`` demonstrations, preferring distinct output patterns.

        Demos whose (sorted, truncated) output fingerprint is new are taken
        first; if fewer than ``k`` unique patterns exist, remaining slots are
        filled with any unused candidates in order.
        """
        seen_outputs: set[str] = set()
        selected: list[Demonstration] = []
        for demo in candidates:
            # Cheap fingerprint of the outputs; truncation bounds key size.
            key = str(sorted(demo.outputs.values()))[:50]
            if key not in seen_outputs:
                seen_outputs.add(key)
                selected.append(demo)
                if len(selected) >= k:
                    break
        # Fill remaining with any unused candidates
        if len(selected) < k:
            for demo in candidates:
                if demo not in selected:
                    selected.append(demo)
                    if len(selected) >= k:
                        break
        return selected