| |
| """ |
| First-Principles Engineering Tests. |
| |
| T-FP1: State-delta is O(1) — constant tokens regardless of state size |
| T-FP2: State-delta captures actual changes correctly |
| T-FP3: Falsification critic generates assertions from code |
| T-FP4: Falsification score is COMPUTED (0 hallucinations) |
| T-FP5: Falsification score = passed/total * 10 |
| T-FP6: PEP 578 sandbox policy creation |
| T-FP7: Path allowlist/blocklist logic correct |
| """ |
| import sys, os |
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) |
|
|
# Running totals of passed / failed checks, updated by check().
PASS = FAIL = 0


def check(name, cond, detail=""):
    """Record one test outcome and print a one-line report.

    Args:
        name: Label printed for the check.
        cond: Outcome of the check. Any value is accepted and interpreted
            by truthiness, so exactly one counter moves by exactly one.
        detail: Optional extra text; printed only when the check fails.

    Side effects:
        Increments the module-level PASS or FAIL counter and writes one
        line to stdout.
    """
    global PASS, FAIL
    # Normalise first: int(cond) would add 2 for cond=2 and raise for cond=[].
    passed = bool(cond)
    PASS += int(passed)
    FAIL += int(not passed)
    # Distinct glyphs so a failing line is distinguishable from a passing one.
    mark = "✓" if passed else "✗"
    suffix = f": {detail}" if detail and not passed else ""
    print(f" {mark} {name}{suffix}")
|
|
from purpose_agent.types import State

# ---- State-delta critic: T-FP1 (delta size) and T-FP2 (delta content) ----
print("βββ State-Delta Critic (Markovian, O(1)) βββ")
from purpose_agent.state_delta import compute_state_delta, format_critic_input, StateDelta

# T-FP1: a 100-key state with long string values gains a single key; the
# delta's token estimate must stay under 100.
big_state = State(data={f"key_{i}": f"value_{i}" * 100 for i in range(100)})
big_state2 = State(data={**big_state.data, "new_key": "new_value"})

delta = compute_state_delta(big_state, big_state2)
check("T-FP1 Delta is O(1)", delta.token_estimate < 100,
      f"tokens={delta.token_estimate} (state was ~100KB)")

# T-FP1: a 1000-key state with exactly one changed value; the delta's token
# estimate must stay under 50.
huge = State(data={f"k{i}": i for i in range(1000)})
huge2 = State(data={**huge.data, "k500": 999})
d2 = compute_state_delta(huge, huge2)
check("T-FP1 1000-key state β tiny delta", d2.token_estimate < 50,
      f"tokens={d2.token_estimate}")

# T-FP2: between s1 and s2, "output" is added, "score" and "attempts"
# change, and "status" stays the same.
s1 = State(data={"score": 3, "status": "running", "attempts": 1})
s2 = State(data={"score": 7, "status": "running", "attempts": 2, "output": "done"})
d3 = compute_state_delta(s1, s2)
check("T-FP2 Detects added keys", "output" in d3.added_keys)
check("T-FP2 Detects changed keys", "score" in d3.changed_keys)
# .get() so a missing "score" entry is recorded as a failed check instead of
# raising KeyError and aborting the run before the summary is printed.
# NOTE(review): assumes changed_keys is a dict-like mapping — confirm.
check("T-FP2 Changed values correct", d3.changed_keys.get("score") == (3, 7))
check("T-FP2 Unchanged keys ignored", "status" not in d3.changed_keys)

# T-FP2: comparing a state with itself must give an empty delta.
s3 = State(data={"x": 1})
d4 = compute_state_delta(s3, s3)
check("T-FP2 No change = empty delta", d4.is_empty)

# T-FP2: the formatted critic input must be non-empty and fit the budget.
# The budget check approximates tokens as len(text) // 4.
token_budget = 300
formatted = format_critic_input("Write fibonacci", "submit_code", "I wrote the code", d3,
                                max_tokens=token_budget)
check("T-FP2 Formatted output exists", len(formatted) > 0)
check("T-FP2 Under token budget", len(formatted) // 4 <= token_budget)
|
| |
# ---- Falsification critic: T-FP3, T-FP4, T-FP5 ----
print("\nβββ Falsification Critic (Popper's Method) βββ")
from purpose_agent.falsification_critic import FalsificationCritic, FalsificationResult
from purpose_agent import MockLLMBackend

# Mock backend with a scripted reply registered under "TEST ADVERSARY":
# three assertions about fib().
mock = MockLLMBackend()
mock.register_handler(
    "TEST ADVERSARY",
    "\n".join([
        "assert fib(0) == 0",
        "assert fib(1) == 1",
        "assert fib(-1) == 0",
    ]),
)

critic = FalsificationCritic(llm=mock, timeout_s=5.0)

# Candidate implementation that satisfies all three scripted assertions.
good_code = "\n".join([
    "def fib(n):",
    " if n <= 0: return 0",
    " if n == 1: return 1",
    " a,b=0,1",
    " for _ in range(2,n+1): a,b=b,a+b",
    " return b",
])
result = critic.evaluate(good_code, purpose="fibonacci")

assertion_count = len(result.generated_assertions)
check("T-FP3 Generates assertions", assertion_count > 0, f"got {assertion_count}")
check("T-FP4 Score is computed (not hallucinated)", isinstance(result.score, float))
check("T-FP5 Good code scores high", result.score >= 6.0, f"score={result.score}")

# The reported score must match passed/total*10 within 0.1; max(..., 1)
# guards the division when no assertions were counted.
expected_score = result.assertions_passed / max(result.assertions_total, 1) * 10
check("T-FP5 Score = passed/total*10", abs(result.score - expected_score) < 0.1)

# Candidate that returns n + 1, so none of the scripted assertions hold
# for it (fib(0) == 1, fib(5) == 6, fib(10) == 11).
bad_code = "def fib(n): return n + 1"
mock2 = MockLLMBackend()
mock2.register_handler(
    "TEST ADVERSARY",
    "\n".join([
        "assert fib(0) == 0",
        "assert fib(5) == 5",
        "assert fib(10) == 55",
    ]),
)
critic2 = FalsificationCritic(llm=mock2, timeout_s=5.0)
result2 = critic2.evaluate(bad_code)
check("T-FP5 Bad code scores low", result2.score < 5.0, f"score={result2.score}")
check("T-FP5 Bad code is falsified", result2.is_falsified)

# Empty source must score exactly 0.0.
result3 = critic.evaluate("")
check("T-FP4 No code β 0", result3.score == 0.0)
|
|
| |
# ---- Sandbox policy: T-FP6 (construction) and T-FP7 (path rules) ----
print("\nβββ PEP 578 Sandbox (Kernel-Level) βββ")
from purpose_agent.sandbox_hooks import SandboxPolicy, _path_allowed, is_sandbox_installed

# T-FP6: build a policy and verify the fields it exposes.
policy = SandboxPolicy(
    allowed_paths=["/app/workspace", "/tmp"],
    blocked_paths=["/etc", "/proc"],
    block_network=True,
    block_subprocess=True,
)
check("T-FP6 Policy creates", policy is not None)
check("T-FP6 Network blocked", policy.block_network)
check("T-FP6 Subprocess blocked", policy.block_subprocess)
check("T-FP6 Has blocked modules", "ctypes" in policy.blocked_modules)

# T-FP7: temporarily swap the module-level policy so _path_allowed() is
# evaluated against the policy built above.
# NOTE(review): assumes _path_allowed reads sh._policy — confirm.
import purpose_agent.sandbox_hooks as sh
old_policy = sh._policy
sh._policy = policy
try:
    check("T-FP7 /tmp allowed", _path_allowed("/tmp/test.py"))
    check("T-FP7 /app/workspace allowed", _path_allowed("/app/workspace/code.py"))
    check("T-FP7 /etc blocked", not _path_allowed("/etc/passwd"))
    check("T-FP7 /proc blocked", not _path_allowed("/proc/self/environ"))
finally:
    # Always restore the previous policy, even if a path check raises, so
    # the module global never keeps the test policy.
    sh._policy = old_policy

# The sandbox must not be reported as installed during this test run.
check("T-FP7 Not installed in tests", not is_sandbox_installed())
|
|
| |
# ---- Summary: print the totals, then exit 0 on success or 1 on failure ----
rule = "=" * 50
if FAIL == 0:
    verdict = "ALL PASS β"
    exit_code = 0
else:
    verdict = f"{FAIL} FAILURES"
    exit_code = 1

print("\n" + rule)
print(f" First-Principles Tests: {PASS} pass, {FAIL} fail")
print(" " + verdict)
print(rule)
sys.exit(exit_code)
|
|