#!/usr/bin/env python3
"""
Track D Tests — Optimization Pipeline.

T9.1  Coding traces fingerprint correctly
T9.2  Tool motif extraction works
T9.3  Dataset builder filters by success
T9.4  Dataset builder removes poisoned examples
T9.5  Prompt pack respects token budget
T9.6  Prompt pack includes highest-fitness skills first
T10.1 Shadow eval: better candidate passes
T10.2 Shadow eval: worse candidate fails
T10.3 Optimizer: improving → continue
T10.4 Optimizer: plateau → optimize prompts
T10.5 Optimizer: degrading → rollback
T10.6 Distillation plan: no GPU → prompt_pack
T10.7 Distillation plan: large dataset + GPU → distill
"""
import sys, os

# Make the package importable when the script is run from its own directory.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

PASS = FAIL = 0


def check(name, cond, detail=""):
    """Record one pass/fail result and print a ✓/✗ line.

    *detail* is shown only when the check fails. *cond* is coerced to
    bool first so truthy non-numeric values (e.g. a result object's
    attribute that happens to be a string) cannot crash ``int()``.
    """
    global PASS, FAIL
    ok = bool(cond)
    PASS += int(ok)
    FAIL += int(not ok)
    print(f" {'✓' if ok else '✗'} {name}" + (f": {detail}" if detail and not ok else ""))


# ═══ Sprint 9: Fingerprint + Dataset + PromptPack ═══
print("Sprint 9: Fingerprint")
from purpose_agent.trace import Trace
from purpose_agent.optimization.fingerprint import fingerprint_traces, CapabilityFingerprint

# Create mock traces: 10 coding tasks, the last 6 (i > 3) successful with phi=8.0.
traces = []
for i in range(10):
    t = Trace(purpose=f"Write a Python function for task {i}")
    t.emit("action", step=1, name="submit_code", tool="python_exec", thought="Thinking about task")
    t.emit("tool.started", step=1, name="python_exec")
    t.emit("score", step=1, phi_after=8.0 if i > 3 else 4.0)
    t.emit("run.finished", step=2, success=i > 3, phi=8.0 if i > 3 else 4.0)
    t.finalize()
    traces.append(t)

fp = fingerprint_traces(traces)
check("T9.1 Domains detected", "coding" in fp.domains, str(fp.domains))
check("T9.1 Total traces counted", fp.total_traces == 10)
check("T9.2 Tool usage tracked", "python_exec" in fp.tool_usage, str(fp.tool_usage))
check("T9.2 Tool motifs found", len(fp.tool_motifs) >= 0)  # May be empty with simple traces

print("\nSprint 9: Dataset")
from purpose_agent.optimization.dataset import TraceDatasetBuilder

# min_phi=6.0 should admit only the 6 successful traces (phi=8.0).
builder = TraceDatasetBuilder(min_phi=6.0)
dataset = builder.build(traces)
check("T9.3 Filters by success", dataset.size > 0 and dataset.size < 10, f"size={dataset.size}")
check("T9.3 Has train split", len(dataset.train) > 0)
check("T9.4 Rejected count tracked", builder.rejected_count >= 0)

print("\nSprint 9: Prompt Pack")
from purpose_agent.optimization.prompt_pack import PromptPackBuilder, PromptPack
from purpose_agent.skills.schema import SkillCard
from purpose_agent.memory_homeostasis import MemoryBudget

# Three skills with descending fitness; the pack should favor the fittest.
skills = [
    SkillCard(name="test_first", trigger="When coding", procedure=["Write tests", "Implement", "Run"], fitness_score=0.9),
    SkillCard(name="edge_cases", trigger="When handling input", procedure=["Check null", "Check empty", "Check negative"], fitness_score=0.7),
    SkillCard(name="low_fitness", trigger="When stuck", procedure=["Try random things"], fitness_score=0.2),
]
ppb = PromptPackBuilder(budget=MemoryBudget(max_injected_tokens=200))
pack = ppb.build(skills=skills, instructions=["Always validate input"])
check("T9.5 Token budget respected", pack.token_estimate <= 200, f"tokens={pack.token_estimate}")
check("T9.6 Highest fitness first", pack.skills[0]["fitness"] >= pack.skills[-1]["fitness"] if len(pack.skills) > 1 else True)
prompt = pack.to_system_prompt()
check("T9.6 Prompt has instructions", "validate input" in prompt)

# ═══ Sprint 10: Shadow Eval + Optimizer + Distillation ═══
print("\nSprint 10: Shadow Eval")
from purpose_agent.optimization.shadow_eval import ShadowEvaluator

evaluator = ShadowEvaluator(threshold=0.95)

# Better candidate passes
r1 = evaluator.compare(baseline_scores=[7.0, 8.0, 7.5], candidate_scores=[8.0, 8.5, 8.0])
check("T10.1 Better candidate passes", r1.passed, r1.detail)

# Worse candidate fails
r2 = evaluator.compare(baseline_scores=[8.0, 8.5, 9.0], candidate_scores=[5.0, 4.0, 5.5])
check("T10.2 Worse candidate fails", not r2.passed, r2.detail)
check("T10.2 Should rollback", evaluator.should_rollback(r2))

print("\nSprint 10: Optimizer")
from purpose_agent.optimization.optimizer import AgenticOptimizer, OptimizationState


def _recommend_after(score_series, **optimizer_kwargs):
    """Feed *score_series* into a fresh AgenticOptimizer and return its recommendation."""
    optimizer = AgenticOptimizer(**optimizer_kwargs)
    for value in score_series:
        optimizer.record_score(value)
    return optimizer.recommend()


# Monotonically improving scores → keep the current configuration.
rec = _recommend_after(
    [5.0, 6.0, 7.0, 8.0, 9.0],
    min_samples=3, plateau_threshold=0.05, degradation_threshold=-0.1,
)
check("T10.3 Improving → continue", rec.action == "continue", rec.reason)

# Scores oscillating around a flat mean → prompt optimization (or PLATEAU state).
rec2 = _recommend_after([7.0, 7.1, 7.0, 6.9, 7.0, 7.1, 7.0, 6.9, 7.0, 7.1], min_samples=3)
check("T10.4 Plateau → optimize", "optimize" in rec2.action or rec2.state == OptimizationState.PLATEAU, f"action={rec2.action}")

# Steadily falling scores → rollback.
rec3 = _recommend_after([9.0, 8.5, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0], min_samples=3)
check("T10.5 Degrading → rollback", rec3.action == "rollback", f"action={rec3.action}")

print("\nSprint 10: Distillation Plan")
from purpose_agent.optimization.lora_plan import plan_distillation, DistillationPlan

# Without a GPU the planner must fall back to a prompt pack.
plan1 = plan_distillation(fingerprint={}, dataset_size=500, has_gpu=False)
check("T10.6 No GPU → prompt_pack", plan1.mode == "prompt_pack", plan1.mode)
check("T10.6 No GPU required", not plan1.requires_gpu)

# Large dataset + GPU → full distillation, keeping the source model for rollback.
plan2 = plan_distillation(fingerprint={}, dataset_size=5000, current_model="llama-70b", target_model="qwen-1.5b", has_gpu=True)
check("T10.7 Large+GPU → distill", plan2.mode == "distill", plan2.mode)
check("T10.7 Has rollback model", plan2.rollback_model == "llama-70b")
check("T10.7 Has acceptance score", plan2.acceptance_score > 0)

# Too few examples → no plan at all.
plan3 = plan_distillation(fingerprint={}, dataset_size=5, has_gpu=True)
check("T10.x Tiny dataset → none", plan3.mode == "none")

# ═══ REPORT ═══
banner = "=" * 50
print(f"\n{banner}")
print(f" Track D Tests: {PASS} pass, {FAIL} fail")
print(f" {'ALL PASS ✓' if FAIL == 0 else f'{FAIL} FAILURES'}")
print(banner)
sys.exit(0 if FAIL == 0 else 1)