"""
Track D Tests — Optimization Pipeline.

T9.1 Coding traces fingerprint correctly
T9.2 Tool motif extraction works
T9.3 Dataset builder filters by success
T9.4 Dataset builder removes poisoned examples
T9.5 Prompt pack respects token budget
T9.6 Prompt pack includes highest-fitness skills first
T10.1 Shadow eval: better candidate passes
T10.2 Shadow eval: worse candidate fails
T10.3 Optimizer: improving → continue
T10.4 Optimizer: plateau → optimize prompts
T10.5 Optimizer: degrading → rollback
T10.6 Distillation plan: no GPU → prompt_pack
T10.7 Distillation plan: large dataset + GPU → distill
"""
| import sys, os |
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) |
|
|
PASS = FAIL = 0


def check(name, cond, detail=""):
    """Record one test outcome and print a one-line verdict.

    Increments the module-level PASS/FAIL counters. *detail* is printed
    only when the check fails, to keep passing output terse.
    """
    global PASS, FAIL
    PASS += int(cond)
    FAIL += int(not cond)
    # Fixed mojibake: the original printed the same garbled glyph ("β")
    # for both outcomes, making pass and fail visually identical.
    marker = "✓" if cond else "✗"
    suffix = f": {detail}" if detail and not cond else ""
    print(f" {marker} {name}{suffix}")
|
|
| |
| print("Sprint 9: Fingerprint") |
| from purpose_agent.trace import Trace |
| from purpose_agent.optimization.fingerprint import fingerprint_traces, CapabilityFingerprint |
|
|
| |
| traces = [] |
| for i in range(10): |
| t = Trace(purpose=f"Write a Python function for task {i}") |
| t.emit("action", step=1, name="submit_code", tool="python_exec", thought="Thinking about task") |
| t.emit("tool.started", step=1, name="python_exec") |
| t.emit("score", step=1, phi_after=8.0 if i > 3 else 4.0) |
| t.emit("run.finished", step=2, success=i > 3, phi=8.0 if i > 3 else 4.0) |
| t.finalize() |
| traces.append(t) |
|
|
| fp = fingerprint_traces(traces) |
| check("T9.1 Domains detected", "coding" in fp.domains, str(fp.domains)) |
| check("T9.1 Total traces counted", fp.total_traces == 10) |
| check("T9.2 Tool usage tracked", "python_exec" in fp.tool_usage, str(fp.tool_usage)) |
| check("T9.2 Tool motifs found", len(fp.tool_motifs) >= 0) |
|
|
| print("\nSprint 9: Dataset") |
| from purpose_agent.optimization.dataset import TraceDatasetBuilder |
|
|
| builder = TraceDatasetBuilder(min_phi=6.0) |
| dataset = builder.build(traces) |
| check("T9.3 Filters by success", dataset.size > 0 and dataset.size < 10, f"size={dataset.size}") |
| check("T9.3 Has train split", len(dataset.train) > 0) |
| check("T9.4 Rejected count tracked", builder.rejected_count >= 0) |
|
|
| print("\nSprint 9: Prompt Pack") |
| from purpose_agent.optimization.prompt_pack import PromptPackBuilder, PromptPack |
| from purpose_agent.skills.schema import SkillCard |
| from purpose_agent.memory_homeostasis import MemoryBudget |
|
|
| skills = [ |
| SkillCard(name="test_first", trigger="When coding", procedure=["Write tests", "Implement", "Run"], fitness_score=0.9), |
| SkillCard(name="edge_cases", trigger="When handling input", procedure=["Check null", "Check empty", "Check negative"], fitness_score=0.7), |
| SkillCard(name="low_fitness", trigger="When stuck", procedure=["Try random things"], fitness_score=0.2), |
| ] |
|
|
| ppb = PromptPackBuilder(budget=MemoryBudget(max_injected_tokens=200)) |
| pack = ppb.build(skills=skills, instructions=["Always validate input"]) |
| check("T9.5 Token budget respected", pack.token_estimate <= 200, f"tokens={pack.token_estimate}") |
| check("T9.6 Highest fitness first", pack.skills[0]["fitness"] >= pack.skills[-1]["fitness"] if len(pack.skills) > 1 else True) |
| prompt = pack.to_system_prompt() |
| check("T9.6 Prompt has instructions", "validate input" in prompt) |
|
|
| |
| print("\nSprint 10: Shadow Eval") |
| from purpose_agent.optimization.shadow_eval import ShadowEvaluator |
|
|
| evaluator = ShadowEvaluator(threshold=0.95) |
|
|
| |
| r1 = evaluator.compare(baseline_scores=[7.0, 8.0, 7.5], candidate_scores=[8.0, 8.5, 8.0]) |
| check("T10.1 Better candidate passes", r1.passed, r1.detail) |
|
|
| |
| r2 = evaluator.compare(baseline_scores=[8.0, 8.5, 9.0], candidate_scores=[5.0, 4.0, 5.5]) |
| check("T10.2 Worse candidate fails", not r2.passed, r2.detail) |
| check("T10.2 Should rollback", evaluator.should_rollback(r2)) |
|
|
| print("\nSprint 10: Optimizer") |
| from purpose_agent.optimization.optimizer import AgenticOptimizer, OptimizationState |
|
|
| opt = AgenticOptimizer(min_samples=3, plateau_threshold=0.05, degradation_threshold=-0.1) |
|
|
| |
| for s in [5.0, 6.0, 7.0, 8.0, 9.0]: |
| opt.record_score(s) |
| rec = opt.recommend() |
| check("T10.3 Improving β continue", rec.action == "continue", rec.reason) |
|
|
| |
| opt2 = AgenticOptimizer(min_samples=3) |
| for s in [7.0, 7.1, 7.0, 6.9, 7.0, 7.1, 7.0, 6.9, 7.0, 7.1]: |
| opt2.record_score(s) |
| rec2 = opt2.recommend() |
| check("T10.4 Plateau β optimize", "optimize" in rec2.action or rec2.state == OptimizationState.PLATEAU, |
| f"action={rec2.action}") |
|
|
| |
| opt3 = AgenticOptimizer(min_samples=3) |
| for s in [9.0, 8.5, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]: |
| opt3.record_score(s) |
| rec3 = opt3.recommend() |
| check("T10.5 Degrading β rollback", rec3.action == "rollback", f"action={rec3.action}") |
|
|
| print("\nSprint 10: Distillation Plan") |
| from purpose_agent.optimization.lora_plan import plan_distillation, DistillationPlan |
|
|
| |
| plan1 = plan_distillation(fingerprint={}, dataset_size=500, has_gpu=False) |
| check("T10.6 No GPU β prompt_pack", plan1.mode == "prompt_pack", plan1.mode) |
| check("T10.6 No GPU required", not plan1.requires_gpu) |
|
|
| |
| plan2 = plan_distillation(fingerprint={}, dataset_size=5000, current_model="llama-70b", |
| target_model="qwen-1.5b", has_gpu=True) |
| check("T10.7 Large+GPU β distill", plan2.mode == "distill", plan2.mode) |
| check("T10.7 Has rollback model", plan2.rollback_model == "llama-70b") |
| check("T10.7 Has acceptance score", plan2.acceptance_score > 0) |
|
|
| |
| plan3 = plan_distillation(fingerprint={}, dataset_size=5, has_gpu=True) |
| check("T10.x Tiny dataset β none", plan3.mode == "none") |
|
|
| |
# Print the final tally and exit non-zero on any failure so CI fails.
# Fixed mojibake in the banner: "ALL PASS β" restored to "ALL PASS ✓".
print(f"\n{'='*50}")
print(f" Track D Tests: {PASS} pass, {FAIL} fail")
print(f" {'ALL PASS ✓' if FAIL == 0 else f'{FAIL} FAILURES'}")
print(f"{'='*50}")
sys.exit(0 if FAIL == 0 else 1)
|
|