File size: 6,108 Bytes
34bd448 36d2671 34bd448 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | #!/usr/bin/env python3
"""
Track D Tests β Optimization Pipeline.
T9.1 Coding traces fingerprint correctly
T9.2 Tool motif extraction works
T9.3 Dataset builder filters by success
T9.4 Dataset builder removes poisoned examples
T9.5 Prompt pack respects token budget
T9.6 Prompt pack includes highest-fitness skills first
T10.1 Shadow eval: better candidate passes
T10.2 Shadow eval: worse candidate fails
T10.3 Optimizer: improving β continue
T10.4 Optimizer: plateau β optimize prompts
T10.5 Optimizer: degrading β rollback
T10.6 Distillation plan: no GPU β prompt_pack
T10.7 Distillation plan: large dataset + GPU β distill
"""
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
PASS = FAIL = 0
def check(name, cond, detail=""):
global PASS, FAIL
PASS += int(cond); FAIL += int(not cond)
print(f" {'β' if cond else 'β'} {name}" + (f": {detail}" if detail and not cond else ""))
# βββ Sprint 9: Fingerprint + Dataset + PromptPack βββ
print("Sprint 9: Fingerprint")
from purpose_agent.trace import Trace
from purpose_agent.optimization.fingerprint import fingerprint_traces, CapabilityFingerprint
# Create mock traces
traces = []
for i in range(10):
t = Trace(purpose=f"Write a Python function for task {i}")
t.emit("action", step=1, name="submit_code", tool="python_exec", thought="Thinking about task")
t.emit("tool.started", step=1, name="python_exec")
t.emit("score", step=1, phi_after=8.0 if i > 3 else 4.0)
t.emit("run.finished", step=2, success=i > 3, phi=8.0 if i > 3 else 4.0)
t.finalize()
traces.append(t)
fp = fingerprint_traces(traces)
check("T9.1 Domains detected", "coding" in fp.domains, str(fp.domains))
check("T9.1 Total traces counted", fp.total_traces == 10)
check("T9.2 Tool usage tracked", "python_exec" in fp.tool_usage, str(fp.tool_usage))
check("T9.2 Tool motifs found", len(fp.tool_motifs) >= 0) # May be empty with simple traces
print("\nSprint 9: Dataset")
from purpose_agent.optimization.dataset import TraceDatasetBuilder
builder = TraceDatasetBuilder(min_phi=6.0)
dataset = builder.build(traces)
check("T9.3 Filters by success", dataset.size > 0 and dataset.size < 10, f"size={dataset.size}")
check("T9.3 Has train split", len(dataset.train) > 0)
check("T9.4 Rejected count tracked", builder.rejected_count >= 0)
print("\nSprint 9: Prompt Pack")
from purpose_agent.optimization.prompt_pack import PromptPackBuilder, PromptPack
from purpose_agent.skills.schema import SkillCard
from purpose_agent.memory_homeostasis import MemoryBudget
skills = [
SkillCard(name="test_first", trigger="When coding", procedure=["Write tests", "Implement", "Run"], fitness_score=0.9),
SkillCard(name="edge_cases", trigger="When handling input", procedure=["Check null", "Check empty", "Check negative"], fitness_score=0.7),
SkillCard(name="low_fitness", trigger="When stuck", procedure=["Try random things"], fitness_score=0.2),
]
ppb = PromptPackBuilder(budget=MemoryBudget(max_injected_tokens=200))
pack = ppb.build(skills=skills, instructions=["Always validate input"])
check("T9.5 Token budget respected", pack.token_estimate <= 200, f"tokens={pack.token_estimate}")
check("T9.6 Highest fitness first", pack.skills[0]["fitness"] >= pack.skills[-1]["fitness"] if len(pack.skills) > 1 else True)
prompt = pack.to_system_prompt()
check("T9.6 Prompt has instructions", "validate input" in prompt)
# βββ Sprint 10: Shadow Eval + Optimizer + Distillation βββ
print("\nSprint 10: Shadow Eval")
from purpose_agent.optimization.shadow_eval import ShadowEvaluator
evaluator = ShadowEvaluator(threshold=0.95)
# Better candidate passes
r1 = evaluator.compare(baseline_scores=[7.0, 8.0, 7.5], candidate_scores=[8.0, 8.5, 8.0])
check("T10.1 Better candidate passes", r1.passed, r1.detail)
# Worse candidate fails
r2 = evaluator.compare(baseline_scores=[8.0, 8.5, 9.0], candidate_scores=[5.0, 4.0, 5.5])
check("T10.2 Worse candidate fails", not r2.passed, r2.detail)
check("T10.2 Should rollback", evaluator.should_rollback(r2))
print("\nSprint 10: Optimizer")
from purpose_agent.optimization.optimizer import AgenticOptimizer, OptimizationState
opt = AgenticOptimizer(min_samples=3, plateau_threshold=0.05, degradation_threshold=-0.1)
# Improving
for s in [5.0, 6.0, 7.0, 8.0, 9.0]:
opt.record_score(s)
rec = opt.recommend()
check("T10.3 Improving β continue", rec.action == "continue", rec.reason)
# Plateau
opt2 = AgenticOptimizer(min_samples=3)
for s in [7.0, 7.1, 7.0, 6.9, 7.0, 7.1, 7.0, 6.9, 7.0, 7.1]:
opt2.record_score(s)
rec2 = opt2.recommend()
check("T10.4 Plateau β optimize", "optimize" in rec2.action or rec2.state == OptimizationState.PLATEAU,
f"action={rec2.action}")
# Degrading
opt3 = AgenticOptimizer(min_samples=3)
for s in [9.0, 8.5, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]:
opt3.record_score(s)
rec3 = opt3.recommend()
check("T10.5 Degrading β rollback", rec3.action == "rollback", f"action={rec3.action}")
print("\nSprint 10: Distillation Plan")
from purpose_agent.optimization.lora_plan import plan_distillation, DistillationPlan
# No GPU
plan1 = plan_distillation(fingerprint={}, dataset_size=500, has_gpu=False)
check("T10.6 No GPU β prompt_pack", plan1.mode == "prompt_pack", plan1.mode)
check("T10.6 No GPU required", not plan1.requires_gpu)
# Large dataset + GPU
plan2 = plan_distillation(fingerprint={}, dataset_size=5000, current_model="llama-70b",
target_model="qwen-1.5b", has_gpu=True)
check("T10.7 Large+GPU β distill", plan2.mode == "distill", plan2.mode)
check("T10.7 Has rollback model", plan2.rollback_model == "llama-70b")
check("T10.7 Has acceptance score", plan2.acceptance_score > 0)
# Small dataset
plan3 = plan_distillation(fingerprint={}, dataset_size=5, has_gpu=True)
check("T10.x Tiny dataset β none", plan3.mode == "none")
# βββ REPORT βββ
print(f"\n{'='*50}")
print(f" Track D Tests: {PASS} pass, {FAIL} fail")
print(f" {'ALL PASS β' if FAIL == 0 else f'{FAIL} FAILURES'}")
print(f"{'='*50}")
sys.exit(0 if FAIL == 0 else 1)
|