# purpose_agent/prompt_optimizer.py
# (origin: Rohan03/purpose-agent, commit 6dd7984, "V2 merge")
"""
prompt_optimizer.py — DSPy-style automatic prompt optimization.
From DSPy (arxiv:2310.03714):
Instead of hand-crafting prompts, define signatures (input → output)
and let the optimizer bootstrap effective demonstrations automatically.
Adaptation for Purpose Agent:
1. Define a Signature: e.g., "state, action, purpose → phi_score, reasoning"
2. Collect demonstration traces from successful runs
3. The optimizer selects the best N demonstrations by trial scoring
4. These demonstrations are injected into the prompt as few-shot examples
5. Periodically re-optimize as more traces become available
No weight updates — improvement comes from better few-shot examples
in the prompt, selected via a metric (accuracy on held-out examples).
"""
from __future__ import annotations
import json
import logging
import random
from dataclasses import dataclass, field
from typing import Any, Callable
from purpose_agent.llm_backend import LLMBackend, ChatMessage
from purpose_agent.trace import Trace
logger = logging.getLogger(__name__)
@dataclass
class Signature:
    """
    DSPy-style signature: declares what a prompt should do.
    Example:
        sig = Signature(
            name="state_evaluator",
            inputs=["state_before", "action", "state_after", "purpose"],
            outputs=["phi_score", "reasoning", "evidence"],
            instruction="Evaluate the state transition and score progress toward the purpose.",
        )
    """
    # Identifier for this signature (appears in optimizer log messages
    # and is passed to metric_fn as metadata).
    name: str
    # Field names the prompt receives as input.
    inputs: list[str]
    # Field names the prompt is expected to produce.
    outputs: list[str]
    # Natural-language task description rendered as the "## Task" section.
    instruction: str = ""
@dataclass
class Demonstration:
    """A single input→output example for few-shot prompting."""
    # Input field name -> stringified value (may be "" when absent in the trace).
    inputs: dict[str, str]
    # Output field name -> stringified value.
    outputs: dict[str, str]
    score: float = 0.0  # how good this demo is at improving task performance
class PromptOptimizer:
    """
    Automatically optimizes prompts by bootstrapping demonstrations.

    The DSPy approach adapted for Purpose Agent:
      1. Collect candidate demonstrations from traces
      2. Score each candidate by running it as a few-shot example and
         measuring output quality
      3. Select the top-K demonstrations
      4. Return an optimized prompt with the best demonstrations

    Usage:
        optimizer = PromptOptimizer(llm=model)

        # Define what the prompt should do
        sig = Signature(
            name="actor",
            inputs=["state", "purpose"],
            outputs=["thought", "action"],
            instruction="Decide the best next action.",
        )

        # Collect demonstrations from traces
        demos = optimizer.extract_demonstrations(traces, sig)

        # Optimize: find the best subset
        best = optimizer.optimize(sig, demos, metric_fn=my_metric, k=3)

        # Get the optimized prompt
        prompt = optimizer.compile_prompt(sig, best)
    """

    def __init__(self, llm: LLMBackend | None = None):
        """
        Args:
            llm: Optional LLM backend.
        """
        # NOTE(review): self.llm is not used by any method in this class —
        # presumably reserved for metric evaluation; confirm before removing.
        self.llm = llm

    def extract_demonstrations(
        self,
        traces: list[Trace],
        signature: Signature,
        max_demos: int = 50,
    ) -> list[Demonstration]:
        """
        Extract candidate demonstrations from traces.

        Scans every event of every trace and keeps events whose data
        contains at least one of the signature's output fields.  Input
        fields missing from the event default to "" so the demo can
        still be rendered.

        Args:
            traces: Traces to mine for examples.
            signature: Declares which input/output fields to look for.
            max_demos: Hard cap on the number of candidates returned.

        Returns:
            At most ``max_demos`` candidate demonstrations.
        """
        demos: list[Demonstration] = []
        # Flatten traces into one event stream so the max_demos cap stops
        # the scan everywhere (a per-trace break could previously exceed it).
        for event in (ev for tr in traces for ev in tr.events):
            data = event.data
            # Keep any event that produced at least one output field;
            # inputs are best-effort.
            if not any(f in data for f in signature.outputs):
                continue
            demos.append(
                Demonstration(
                    inputs={f: str(data.get(f, "")) for f in signature.inputs},
                    outputs={f: str(data.get(f, "")) for f in signature.outputs},
                )
            )
            if len(demos) >= max_demos:
                break
        logger.info(
            "PromptOptimizer: Extracted %d candidate demonstrations for '%s'",
            len(demos),
            signature.name,
        )
        return demos

    def optimize(
        self,
        signature: Signature,
        candidates: list[Demonstration],
        metric_fn: Callable[[str, dict], float] | None = None,
        k: int = 3,
        trials: int = 10,
    ) -> list[Demonstration]:
        """
        Select the best K demonstrations by trial-and-error.

        If ``metric_fn`` is provided it is called as
        ``metric_fn(prompt, {"signature": name})`` and must return a float
        (higher is better).  Otherwise a diversity heuristic is used
        (varied examples > similar ones).

        Args:
            signature: The task the prompt is optimized for.
            candidates: Candidate demonstrations to choose from.
            metric_fn: Optional scorer for a compiled prompt.
            k: Number of demonstrations to select.
            trials: Number of random subsets to evaluate.

        Returns:
            The best subset found; all candidates if there are <= k.
        """
        if len(candidates) <= k:
            return candidates
        if metric_fn is None:
            # Diversity-based selection: pick demos with different output patterns
            return self._diverse_select(candidates, k)
        # Trial-based optimization: sample subsets and score them.
        best_subset = candidates[:k]
        best_score = -float("inf")
        for trial in range(trials):
            # len(candidates) > k is guaranteed by the guard above.
            subset = random.sample(candidates, k)
            prompt = self.compile_prompt(signature, subset)
            try:
                score = metric_fn(prompt, {"signature": signature.name})
            except Exception:
                # A failing metric must not abort optimization; score the
                # trial as worthless and continue.
                logger.debug(
                    "PromptOptimizer: metric_fn raised on trial %d",
                    trial + 1,
                    exc_info=True,
                )
                score = 0.0
            if score > best_score:
                best_score = score
                best_subset = subset
                logger.debug(
                    "PromptOptimizer: Trial %d new best score=%.3f",
                    trial + 1,
                    score,
                )
        # Record the winning score on the selected demos for later inspection.
        for demo in best_subset:
            demo.score = best_score
        logger.info(
            "PromptOptimizer: Selected %d demos (best_score=%.3f)",
            len(best_subset),
            best_score,
        )
        return best_subset

    def compile_prompt(
        self,
        signature: Signature,
        demonstrations: list[Demonstration],
        *,
        max_examples: int = 5,
        max_field_len: int = 150,
    ) -> str:
        """
        Compile a signature + demonstrations into a ready-to-use prompt.

        Args:
            signature: Task declaration (instruction + I/O fields).
            demonstrations: Few-shot examples to render.
            max_examples: Render at most this many demonstrations.
            max_field_len: Truncate each field value to this many characters.

        Returns:
            The optimized system prompt string.
        """
        sections = []
        # Instruction
        if signature.instruction:
            sections.append(f"## Task\n{signature.instruction}")
        # Input/output format
        input_desc = ", ".join(signature.inputs)
        output_desc = ", ".join(signature.outputs)
        sections.append(f"## Format\nGiven: {input_desc}\nProduce: {output_desc}")
        # Demonstrations: skip empty fields, truncate long values to keep
        # the prompt compact.
        if demonstrations:
            sections.append("## Examples")
            for i, demo in enumerate(demonstrations[:max_examples], 1):
                lines = [f"### Example {i}"]
                for field_name, value in demo.inputs.items():
                    if value:
                        lines.append(f" {field_name}: {value[:max_field_len]}")
                lines.append(" →")
                for field_name, value in demo.outputs.items():
                    if value:
                        lines.append(f" {field_name}: {value[:max_field_len]}")
                sections.append("\n".join(lines))
        return "\n\n".join(sections)

    def _diverse_select(
        self, candidates: list[Demonstration], k: int
    ) -> list[Demonstration]:
        """
        Select up to ``k`` demonstrations, preferring distinct output patterns.

        Demos whose (sorted, truncated) output fingerprint is new are taken
        first; if fewer than ``k`` unique patterns exist, remaining slots are
        filled with any unused candidates in order.
        """
        seen_outputs: set[str] = set()
        selected: list[Demonstration] = []
        for demo in candidates:
            # Cheap fingerprint of the outputs; truncation bounds key size.
            key = str(sorted(demo.outputs.values()))[:50]
            if key not in seen_outputs:
                seen_outputs.add(key)
                selected.append(demo)
                if len(selected) >= k:
                    break
        # Fill remaining with any unused candidates
        if len(selected) < k:
            for demo in candidates:
                if demo not in selected:
                    selected.append(demo)
                    if len(selected) >= k:
                        break
        return selected