Track D tests: fingerprint, dataset, prompt_pack, shadow_eval, optimizer, distillation
Browse files- tests/test_track_d.py +142 -0
tests/test_track_d.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Track D Tests — Optimization Pipeline.
|
| 4 |
+
|
| 5 |
+
T9.1 Coding traces fingerprint correctly
|
| 6 |
+
T9.2 Tool motif extraction works
|
| 7 |
+
T9.3 Dataset builder filters by success
|
| 8 |
+
T9.4 Dataset builder removes poisoned examples
|
| 9 |
+
T9.5 Prompt pack respects token budget
|
| 10 |
+
T9.6 Prompt pack includes highest-fitness skills first
|
| 11 |
+
T10.1 Shadow eval: better candidate passes
|
| 12 |
+
T10.2 Shadow eval: worse candidate fails
|
| 13 |
+
T10.3 Optimizer: improving → continue
|
| 14 |
+
T10.4 Optimizer: plateau → optimize prompts
|
| 15 |
+
T10.5 Optimizer: degrading → rollback
|
| 16 |
+
T10.6 Distillation plan: no GPU → prompt_pack
|
| 17 |
+
T10.7 Distillation plan: large dataset + GPU → distill
|
| 18 |
+
"""
|
| 19 |
+
import sys, os
|
| 20 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
| 21 |
+
|
| 22 |
+
# Module-level tallies updated by check() and read by the final report.
PASS = FAIL = 0


def check(name, cond, detail=""):
    """Record one test outcome and print a one-line report for it.

    Args:
        name: Label printed next to the pass/fail mark.
        cond: Outcome of the check; any object is accepted and is
            interpreted by its truthiness.
        detail: Extra context appended to the line, only when the check
            fails and the detail is non-empty.

    Side effects:
        Increments the module-level ``PASS`` or ``FAIL`` counter by exactly
        one and writes one line to stdout.
    """
    global PASS, FAIL
    # Normalize first: int(cond) would add 3 for cond=3 and raise for None.
    passed = bool(cond)
    if passed:
        PASS += 1
    else:
        FAIL += 1
    # Pass and fail must use different glyphs or the report is unreadable.
    mark = "✓" if passed else "✗"
    suffix = f": {detail}" if detail and not passed else ""
    print(f" {mark} {name}{suffix}")
|
| 27 |
+
|
| 28 |
+
# ═══ Sprint 9: Fingerprint + Dataset + PromptPack ═══
|
| 29 |
+
print("Sprint 9: Fingerprint")
|
| 30 |
+
from purpose_agent.trace import Trace
|
| 31 |
+
from purpose_agent.optimization.fingerprint import fingerprint_traces, CapabilityFingerprint
|
| 32 |
+
|
| 33 |
+
# Create mock traces
|
| 34 |
+
traces = []
|
| 35 |
+
for i in range(10):
|
| 36 |
+
t = Trace(purpose=f"Write a Python function for task {i}")
|
| 37 |
+
t.emit("action", step=1, name="submit_code", tool="python_exec")
|
| 38 |
+
t.emit("tool.started", step=1, name="python_exec")
|
| 39 |
+
t.emit("score", step=1, phi_after=8.0 if i > 3 else 4.0)
|
| 40 |
+
t.emit("run.finished", step=2, success=i > 3, phi=8.0 if i > 3 else 4.0)
|
| 41 |
+
t.finalize()
|
| 42 |
+
traces.append(t)
|
| 43 |
+
|
| 44 |
+
fp = fingerprint_traces(traces)
|
| 45 |
+
check("T9.1 Domains detected", "coding" in fp.domains, str(fp.domains))
|
| 46 |
+
check("T9.1 Total traces counted", fp.total_traces == 10)
|
| 47 |
+
check("T9.2 Tool usage tracked", "python_exec" in fp.tool_usage, str(fp.tool_usage))
|
| 48 |
+
# NOTE(review): len(...) >= 0 is true for any sized object, so this check only
# proves that fp.tool_motifs exists and supports len(); it cannot fail on a
# wrong motif result. TODO: assert concrete motifs once the traces above emit
# multi-tool sequences.
check("T9.2 Tool motifs found", len(fp.tool_motifs) >= 0) # May be empty with simple traces
|
| 49 |
+
|
| 50 |
+
print("\nSprint 9: Dataset")
|
| 51 |
+
from purpose_agent.optimization.dataset import TraceDatasetBuilder
|
| 52 |
+
|
| 53 |
+
builder = TraceDatasetBuilder(min_phi=6.0)
|
| 54 |
+
dataset = builder.build(traces)
|
| 55 |
+
check("T9.3 Filters by success", dataset.size > 0 and dataset.size < 10, f"size={dataset.size}")
|
| 56 |
+
check("T9.3 Has train split", len(dataset.train) > 0)
|
| 57 |
+
# NOTE(review): `>= 0` presumably holds for any count, and none of the traces
# built earlier in this script is visibly marked as poisoned, so T9.4
# ("removes poisoned examples") does not appear to be exercised here.
# TODO: feed a poisoned trace and assert builder.rejected_count > 0.
check("T9.4 Rejected count tracked", builder.rejected_count >= 0)
|
| 58 |
+
|
| 59 |
+
print("\nSprint 9: Prompt Pack")
|
| 60 |
+
from purpose_agent.optimization.prompt_pack import PromptPackBuilder, PromptPack
|
| 61 |
+
from purpose_agent.skills.schema import SkillCard
|
| 62 |
+
from purpose_agent.memory_homeostasis import MemoryBudget
|
| 63 |
+
|
| 64 |
+
skills = [
|
| 65 |
+
SkillCard(name="test_first", trigger="When coding", procedure=["Write tests", "Implement", "Run"], fitness_score=0.9),
|
| 66 |
+
SkillCard(name="edge_cases", trigger="When handling input", procedure=["Check null", "Check empty", "Check negative"], fitness_score=0.7),
|
| 67 |
+
SkillCard(name="low_fitness", trigger="When stuck", procedure=["Try random things"], fitness_score=0.2),
|
| 68 |
+
]
|
| 69 |
+
|
| 70 |
+
ppb = PromptPackBuilder(budget=MemoryBudget(max_injected_tokens=200))
|
| 71 |
+
pack = ppb.build(skills=skills, instructions=["Always validate input"])
|
| 72 |
+
check("T9.5 Token budget respected", pack.token_estimate <= 200, f"tokens={pack.token_estimate}")
|
| 73 |
+
# Fitness must be non-increasing across the whole pack. Comparing only the
# first and last entries would let a mis-ordered middle element slip through.
# A pack with zero or one skill still passes, as before.
_pack_fitness = [skill["fitness"] for skill in pack.skills]
check("T9.6 Highest fitness first",
      all(a >= b for a, b in zip(_pack_fitness, _pack_fitness[1:])),
      f"fitness={_pack_fitness}")
|
| 74 |
+
prompt = pack.to_system_prompt()
|
| 75 |
+
check("T9.6 Prompt has instructions", "validate input" in prompt)
|
| 76 |
+
|
| 77 |
+
# ═══ Sprint 10: Shadow Eval + Optimizer + Distillation ═══
|
| 78 |
+
print("\nSprint 10: Shadow Eval")
|
| 79 |
+
from purpose_agent.optimization.shadow_eval import ShadowEvaluator
|
| 80 |
+
|
| 81 |
+
evaluator = ShadowEvaluator(threshold=0.95)
|
| 82 |
+
|
| 83 |
+
# Better candidate passes
|
| 84 |
+
r1 = evaluator.compare(baseline_scores=[7.0, 8.0, 7.5], candidate_scores=[8.0, 8.5, 8.0])
|
| 85 |
+
check("T10.1 Better candidate passes", r1.passed, r1.detail)
|
| 86 |
+
|
| 87 |
+
# Worse candidate fails
|
| 88 |
+
r2 = evaluator.compare(baseline_scores=[8.0, 8.5, 9.0], candidate_scores=[5.0, 4.0, 5.5])
|
| 89 |
+
check("T10.2 Worse candidate fails", not r2.passed, r2.detail)
|
| 90 |
+
check("T10.2 Should rollback", evaluator.should_rollback(r2))
|
| 91 |
+
|
| 92 |
+
print("\nSprint 10: Optimizer")
|
| 93 |
+
from purpose_agent.optimization.optimizer import AgenticOptimizer, OptimizationState
|
| 94 |
+
|
| 95 |
+
opt = AgenticOptimizer(min_samples=3, plateau_threshold=0.05, degradation_threshold=-0.1)
|
| 96 |
+
|
| 97 |
+
# Improving
|
| 98 |
+
for s in [5.0, 6.0, 7.0, 8.0, 9.0]:
|
| 99 |
+
opt.record_score(s)
|
| 100 |
+
rec = opt.recommend()
|
| 101 |
+
# The optimizer was fed a rising score series; it is expected to recommend "continue".
check("T10.3 Improving → continue", rec.action == "continue", rec.reason)
|
| 102 |
+
|
| 103 |
+
# Plateau
|
| 104 |
+
opt2 = AgenticOptimizer(min_samples=3)
|
| 105 |
+
for s in [7.0, 7.1, 7.0, 6.9, 7.0, 7.1, 7.0, 6.9, 7.0, 7.1]:
|
| 106 |
+
opt2.record_score(s)
|
| 107 |
+
rec2 = opt2.recommend()
|
| 108 |
+
# Either an "optimize..." action or an explicit PLATEAU state is accepted.
check("T10.4 Plateau → optimize",
      "optimize" in rec2.action or rec2.state == OptimizationState.PLATEAU,
      f"action={rec2.action}")
|
| 110 |
+
|
| 111 |
+
# Degrading
|
| 112 |
+
opt3 = AgenticOptimizer(min_samples=3)
|
| 113 |
+
for s in [9.0, 8.5, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]:
|
| 114 |
+
opt3.record_score(s)
|
| 115 |
+
rec3 = opt3.recommend()
|
| 116 |
+
# The optimizer was fed a falling score series; it is expected to recommend "rollback".
check("T10.5 Degrading → rollback", rec3.action == "rollback", f"action={rec3.action}")
|
| 117 |
+
|
| 118 |
+
print("\nSprint 10: Distillation Plan")
|
| 119 |
+
from purpose_agent.optimization.lora_plan import plan_distillation, DistillationPlan
|
| 120 |
+
|
| 121 |
+
# No GPU
|
| 122 |
+
plan1 = plan_distillation(fingerprint={}, dataset_size=500, has_gpu=False)
|
| 123 |
+
# plan1 was requested with has_gpu=False; the expected mode is "prompt_pack".
check("T10.6 No GPU → prompt_pack", plan1.mode == "prompt_pack", plan1.mode)
|
| 124 |
+
check("T10.6 No GPU required", not plan1.requires_gpu)
|
| 125 |
+
|
| 126 |
+
# Large dataset + GPU
|
| 127 |
+
plan2 = plan_distillation(fingerprint={}, dataset_size=5000, current_model="llama-70b",
|
| 128 |
+
target_model="qwen-1.5b", has_gpu=True)
|
| 129 |
+
# plan2 was requested with dataset_size=5000 and has_gpu=True; the expected mode is "distill".
check("T10.7 Large+GPU → distill", plan2.mode == "distill", plan2.mode)
|
| 130 |
+
check("T10.7 Has rollback model", plan2.rollback_model == "llama-70b")
|
| 131 |
+
check("T10.7 Has acceptance score", plan2.acceptance_score > 0)
|
| 132 |
+
|
| 133 |
+
# Small dataset
|
| 134 |
+
plan3 = plan_distillation(fingerprint={}, dataset_size=5, has_gpu=True)
|
| 135 |
+
# plan3 was requested with dataset_size=5; the expected mode is "none".
# The actual mode is passed as detail, matching the other plan checks.
check("T10.x Tiny dataset → none", plan3.mode == "none", plan3.mode)
|
| 136 |
+
|
| 137 |
+
# ═══ REPORT ═══
|
| 138 |
+
print(f"\n{'='*50}")
|
| 139 |
+
print(f" Track D Tests: {PASS} pass, {FAIL} fail")
|
| 140 |
+
# Summary line: success banner when nothing failed, otherwise the failure count.
print(f" {'ALL PASS ✓' if FAIL == 0 else f'{FAIL} FAILURES'}")
|
| 141 |
+
print(f"{'='*50}")
|
| 142 |
+
sys.exit(0 if FAIL == 0 else 1)
|