""" prompt_optimizer.py — DSPy-style automatic prompt optimization. From DSPy (arxiv:2310.03714): Instead of hand-crafting prompts, define signatures (input → output) and let the optimizer bootstrap effective demonstrations automatically. Adaptation for Purpose Agent: 1. Define a Signature: e.g., "state, action, purpose → phi_score, reasoning" 2. Collect demonstration traces from successful runs 3. The optimizer selects the best N demonstrations by trial scoring 4. These demonstrations are injected into the prompt as few-shot examples 5. Periodically re-optimize as more traces become available No weight updates — improvement comes from better few-shot examples in the prompt, selected via a metric (accuracy on held-out examples). """ from __future__ import annotations import json import logging import random from dataclasses import dataclass, field from typing import Any, Callable from purpose_agent.llm_backend import LLMBackend, ChatMessage from purpose_agent.trace import Trace logger = logging.getLogger(__name__) @dataclass class Signature: """ DSPy-style signature: declares what a prompt should do. Example: sig = Signature( name="state_evaluator", inputs=["state_before", "action", "state_after", "purpose"], outputs=["phi_score", "reasoning", "evidence"], instruction="Evaluate the state transition and score progress toward the purpose.", ) """ name: str inputs: list[str] outputs: list[str] instruction: str = "" @dataclass class Demonstration: """A single input→output example for few-shot prompting.""" inputs: dict[str, str] outputs: dict[str, str] score: float = 0.0 # how good this demo is at improving task performance class PromptOptimizer: """ Automatically optimizes prompts by bootstrapping demonstrations. The DSPy approach adapted for Purpose Agent: 1. Collect candidate demonstrations from traces 2. Score each candidate by running it as a few-shot example and measuring output quality 3. Select the top-K demonstrations 4. Return an optimized prompt with the best demonstrations Usage: optimizer = PromptOptimizer(llm=model) # Define what the prompt should do sig = Signature( name="actor", inputs=["state", "purpose"], outputs=["thought", "action"], instruction="Decide the best next action.", ) # Collect demonstrations from traces demos = optimizer.extract_demonstrations(traces, sig) # Optimize: find the best subset best = optimizer.optimize(sig, demos, metric_fn=my_metric, k=3) # Get the optimized prompt prompt = optimizer.compile_prompt(sig, best) """ def __init__(self, llm: LLMBackend | None = None): self.llm = llm def extract_demonstrations( self, traces: list[Trace], signature: Signature, max_demos: int = 50, ) -> list[Demonstration]: """ Extract candidate demonstrations from traces. Looks for trace events that match the signature's input/output fields. """ demos = [] for trace in traces: for event in trace.events: data = event.data # Check if this event has the right fields has_inputs = all(f in data or f in (event.kind,) for f in signature.inputs) has_outputs = any(f in data for f in signature.outputs) if has_outputs: inputs = {f: str(data.get(f, "")) for f in signature.inputs} outputs = {f: str(data.get(f, "")) for f in signature.outputs} demos.append(Demonstration(inputs=inputs, outputs=outputs)) if len(demos) >= max_demos: break logger.info(f"PromptOptimizer: Extracted {len(demos)} candidate demonstrations for '{signature.name}'") return demos def optimize( self, signature: Signature, candidates: list[Demonstration], metric_fn: Callable[[str, dict], float] | None = None, k: int = 3, trials: int = 10, ) -> list[Demonstration]: """ Select the best K demonstrations by trial-and-error. If metric_fn is provided, uses it to score each candidate set. Otherwise, uses a diversity heuristic (varied examples > similar ones). """ if len(candidates) <= k: return candidates if metric_fn is None: # Diversity-based selection: pick demos with different output patterns return self._diverse_select(candidates, k) # Trial-based optimization: sample subsets and score them best_subset = candidates[:k] best_score = -float("inf") for trial in range(trials): subset = random.sample(candidates, min(k, len(candidates))) prompt = self.compile_prompt(signature, subset) # Score this prompt configuration try: score = metric_fn(prompt, {"signature": signature.name}) except Exception: score = 0.0 if score > best_score: best_score = score best_subset = subset logger.debug(f"PromptOptimizer: Trial {trial+1} new best score={score:.3f}") # Record scores on selected demos for demo in best_subset: demo.score = best_score logger.info(f"PromptOptimizer: Selected {len(best_subset)} demos (best_score={best_score:.3f})") return best_subset def compile_prompt( self, signature: Signature, demonstrations: list[Demonstration], ) -> str: """ Compile a signature + demonstrations into a ready-to-use prompt. Returns the optimized system prompt string. """ sections = [] # Instruction if signature.instruction: sections.append(f"## Task\n{signature.instruction}") # Input/output format input_desc = ", ".join(signature.inputs) output_desc = ", ".join(signature.outputs) sections.append(f"## Format\nGiven: {input_desc}\nProduce: {output_desc}") # Demonstrations if demonstrations: sections.append("## Examples") for i, demo in enumerate(demonstrations[:5], 1): lines = [f"### Example {i}"] for k, v in demo.inputs.items(): if v: lines.append(f" {k}: {v[:150]}") lines.append(" →") for k, v in demo.outputs.items(): if v: lines.append(f" {k}: {v[:150]}") sections.append("\n".join(lines)) return "\n\n".join(sections) def _diverse_select( self, candidates: list[Demonstration], k: int ) -> list[Demonstration]: """Select diverse demonstrations by output variety.""" seen_outputs: set[str] = set() selected: list[Demonstration] = [] for demo in candidates: key = str(sorted(demo.outputs.values()))[:50] if key not in seen_outputs: seen_outputs.add(key) selected.append(demo) if len(selected) >= k: break # Fill remaining with any unused candidates if len(selected) < k: for demo in candidates: if demo not in selected: selected.append(demo) if len(selected) >= k: break return selected