"""
prompt_optimizer.py — DSPy-style automatic prompt optimization.

From DSPy (arxiv:2310.03714):
Instead of hand-crafting prompts, define signatures (input → output)
and let the optimizer bootstrap effective demonstrations automatically.

Adaptation for Purpose Agent:
1. Define a Signature: e.g., "state, action, purpose → phi_score, reasoning"
2. Collect demonstration traces from successful runs
3. The optimizer selects the best N demonstrations by trial scoring
4. These demonstrations are injected into the prompt as few-shot examples
5. Periodically re-optimize as more traces become available

No weight updates — improvement comes from better few-shot examples
in the prompt, selected via a metric (accuracy on held-out examples).
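
A hypothetical re-optimization pass (step 5) might look like the sketch
below, assuming `optimizer` and `sig` were created as in PromptOptimizer's
Usage example; `trace_store` and `held_out_accuracy` are illustrative names,
not part of this module:

    traces = trace_store.load_recent(limit=200)
    demos = optimizer.extract_demonstrations(traces, sig)
    best = optimizer.optimize(sig, demos, metric_fn=held_out_accuracy, k=3)
    system_prompt = optimizer.compile_prompt(sig, best)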
"""
from __future__ import annotations

import json
import logging
import random
from dataclasses import dataclass, field
from typing import Any, Callable

from purpose_agent.llm_backend import LLMBackend, ChatMessage
from purpose_agent.trace import Trace

logger = logging.getLogger(__name__)


@dataclass
class Signature:
    """
    DSPy-style signature: declares what a prompt should do.

    Example:
        sig = Signature(
            name="state_evaluator",
            inputs=["state_before", "action", "state_after", "purpose"],
            outputs=["phi_score", "reasoning", "evidence"],
            instruction="Evaluate the state transition and score progress toward the purpose.",
        )
    """

    name: str
    inputs: list[str]
    outputs: list[str]
    instruction: str = ""


@dataclass
class Demonstration:
    """A single input→output example for few-shot prompting."""

    inputs: dict[str, str]
    outputs: dict[str, str]
    score: float = 0.0  # how good this demo is at improving task performance


class PromptOptimizer:
    """
    Automatically optimizes prompts by bootstrapping demonstrations.

    The DSPy approach adapted for Purpose Agent:
    1. Collect candidate demonstrations from traces
    2. Score each candidate by running it as a few-shot example and measuring output quality
    3. Select the top-K demonstrations
    4. Return an optimized prompt with the best demonstrations

    Usage:
        optimizer = PromptOptimizer(llm=model)

        # Define what the prompt should do
        sig = Signature(
            name="actor",
            inputs=["state", "purpose"],
            outputs=["thought", "action"],
            instruction="Decide the best next action.",
        )

        # Collect demonstrations from traces
        demos = optimizer.extract_demonstrations(traces, sig)

        # Optimize: find the best subset
        best = optimizer.optimize(sig, demos, metric_fn=my_metric, k=3)

        # Get the optimized prompt
        prompt = optimizer.compile_prompt(sig, best)
    """

    def __init__(self, llm: LLMBackend | None = None):
        self.llm = llm

    def extract_demonstrations(
        self,
        traces: list[Trace],
        signature: Signature,
        max_demos: int = 50,
    ) -> list[Demonstration]:
        """
        Extract candidate demonstrations from traces.

        Looks for trace events that match the signature's input/output fields.
        """
        demos = []
        for trace in traces:
            for event in trace.events:
                data = event.data
                # Check if this event has the right fields
                has_inputs = all(f in data or f in (event.kind,) for f in signature.inputs)
                has_outputs = any(f in data for f in signature.outputs)
                if has_inputs and has_outputs:
                    inputs = {f: str(data.get(f, "")) for f in signature.inputs}
                    outputs = {f: str(data.get(f, "")) for f in signature.outputs}
                    demos.append(Demonstration(inputs=inputs, outputs=outputs))
                if len(demos) >= max_demos:
                    break
            if len(demos) >= max_demos:
                break
        logger.info(f"PromptOptimizer: Extracted {len(demos)} candidate demonstrations for '{signature.name}'")
        return demos

    def optimize(
        self,
        signature: Signature,
        candidates: list[Demonstration],
        metric_fn: Callable[[str, dict], float] | None = None,
        k: int = 3,
        trials: int = 10,
    ) -> list[Demonstration]:
        """
        Select the best K demonstrations by trial-and-error.

        If metric_fn is provided, uses it to score each candidate set.
        Otherwise, uses a diversity heuristic (varied examples > similar ones).
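
        Example metric_fn (a sketch; `held_out` and `run_with_prompt` are
        hypothetical helpers, not part of this module):

            def held_out_accuracy(prompt: str, context: dict) -> float:
                hits = 0
                for example in held_out:  # held-out (inputs, expected) pairs
                    predicted = run_with_prompt(prompt, example.inputs)
                    hits += int(predicted == example.expected)
                return hits / max(len(held_out), 1)

            best = optimizer.optimize(sig, demos, metric_fn=held_out_accuracy, k=3)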
"""
if len(candidates) <= k:
return candidates
if metric_fn is None:
# Diversity-based selection: pick demos with different output patterns
return self._diverse_select(candidates, k)
# Trial-based optimization: sample subsets and score them
best_subset = candidates[:k]
best_score = -float("inf")
for trial in range(trials):
subset = random.sample(candidates, min(k, len(candidates)))
prompt = self.compile_prompt(signature, subset)
# Score this prompt configuration
try:
score = metric_fn(prompt, {"signature": signature.name})
except Exception:
score = 0.0
if score > best_score:
best_score = score
best_subset = subset
logger.debug(f"PromptOptimizer: Trial {trial+1} new best score={score:.3f}")
# Record scores on selected demos
for demo in best_subset:
demo.score = best_score
logger.info(f"PromptOptimizer: Selected {len(best_subset)} demos (best_score={best_score:.3f})")
return best_subset

    def compile_prompt(
        self,
        signature: Signature,
        demonstrations: list[Demonstration],
    ) -> str:
        """
        Compile a signature + demonstrations into a ready-to-use prompt.

        Returns the optimized system prompt string.
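
        The compiled prompt roughly takes this shape (illustrative, based on
        the sections assembled below):

            ## Task
            Decide the best next action.

            ## Format
            Given: state, purpose
            Produce: thought, action

            ## Examples
            ### Example 1
             state: ...
             →
             thought: ...
             action: ...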
"""
sections = []
# Instruction
if signature.instruction:
sections.append(f"## Task\n{signature.instruction}")
# Input/output format
input_desc = ", ".join(signature.inputs)
output_desc = ", ".join(signature.outputs)
sections.append(f"## Format\nGiven: {input_desc}\nProduce: {output_desc}")
# Demonstrations
if demonstrations:
sections.append("## Examples")
for i, demo in enumerate(demonstrations[:5], 1):
lines = [f"### Example {i}"]
for k, v in demo.inputs.items():
if v:
lines.append(f" {k}: {v[:150]}")
lines.append(" →")
for k, v in demo.outputs.items():
if v:
lines.append(f" {k}: {v[:150]}")
sections.append("\n".join(lines))
return "\n\n".join(sections)

    def _diverse_select(
        self, candidates: list[Demonstration], k: int
    ) -> list[Demonstration]:
        """Select diverse demonstrations by output variety."""
        seen_outputs: set[str] = set()
        selected: list[Demonstration] = []
        for demo in candidates:
            key = str(sorted(demo.outputs.values()))[:50]
            if key not in seen_outputs:
                seen_outputs.add(key)
                selected.append(demo)
            if len(selected) >= k:
                break

        # Fill remaining with any unused candidates
        if len(selected) < k:
            for demo in candidates:
                if demo not in selected:
                    selected.append(demo)
                    if len(selected) >= k:
                        break

        return selected