#!/usr/bin/env python3
"""
First-Principles Engineering Tests.
T-FP1: State-delta is O(1) β constant tokens regardless of state size
T-FP2: State-delta captures actual changes correctly
T-FP3: Falsification critic generates assertions from code
T-FP4: Falsification score is COMPUTED (0 hallucinations)
T-FP5: Falsification score = passed/total * 10
T-FP6: PEP 578 sandbox policy creation
T-FP7: Path allowlist/blocklist logic correct
"""
import sys, os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
PASS = FAIL = 0


def check(name, cond, detail=""):
    """Record one test result and print a one-line pass/fail report.

    Increments the module-level PASS/FAIL counters. `detail` is shown only
    on failure, to keep passing output compact.
    """
    global PASS, FAIL
    PASS += int(cond)
    FAIL += int(not cond)
    # ✓/✗ markers; detail is appended only when the check failed.
    marker = "✓" if cond else "✗"
    suffix = f": {detail}" if detail and not cond else ""
    print(f" {marker} {name}{suffix}")
from purpose_agent.types import State

# ─── State Delta (O(1) token cost) ───
print("─── State-Delta Critic (Markovian, O(1)) ───")
from purpose_agent.state_delta import compute_state_delta, format_critic_input, StateDelta

# T-FP1: Constant token cost regardless of state size.
# ~90KB state: 100 keys, each value ~900 bytes.
big_state = State(data={f"key_{i}": f"value_{i}" * 100 for i in range(100)})
big_state2 = State(data={**big_state.data, "new_key": "new_value"})
delta = compute_state_delta(big_state, big_state2)
check("T-FP1 Delta is O(1)", delta.token_estimate < 100,
      f"tokens={delta.token_estimate} (state was ~100KB)")
# Even with a 1000-key state, the delta stays tiny.
huge = State(data={f"k{i}": i for i in range(1000)})
huge2 = State(data={**huge.data, "k500": 999})  # One change
d2 = compute_state_delta(huge, huge2)
check("T-FP1 1000-key state → tiny delta", d2.token_estimate < 50,
      f"tokens={d2.token_estimate}")

# T-FP2: Captures changes correctly.
s1 = State(data={"score": 3, "status": "running", "attempts": 1})
s2 = State(data={"score": 7, "status": "running", "attempts": 2, "output": "done"})
d3 = compute_state_delta(s1, s2)
check("T-FP2 Detects added keys", "output" in d3.added_keys)
check("T-FP2 Detects changed keys", "score" in d3.changed_keys)
check("T-FP2 Changed values correct", d3.changed_keys["score"] == (3, 7))
check("T-FP2 Unchanged keys ignored", "status" not in d3.changed_keys)
# Identical states must yield an empty delta.
s3 = State(data={"x": 1})
d4 = compute_state_delta(s3, s3)
check("T-FP2 No change = empty delta", d4.is_empty)
# Formatting for the critic stays within the token budget (~4 chars/token).
formatted = format_critic_input("Write fibonacci", "submit_code", "I wrote the code", d3, max_tokens=300)
check("T-FP2 Formatted output exists", len(formatted) > 0)
check("T-FP2 Under token budget", len(formatted) // 4 <= 300)
# ─── Falsification Critic (Popperian) ───
print("\n─── Falsification Critic (Popper's Method) ───")
from purpose_agent.falsification_critic import FalsificationCritic, FalsificationResult
from purpose_agent import MockLLMBackend

# T-FP3: Mock LLM generates assertions.
mock = MockLLMBackend()
mock.register_handler("TEST ADVERSARY",
                      "assert fib(0) == 0\nassert fib(1) == 1\nassert fib(-1) == 0")
critic = FalsificationCritic(llm=mock, timeout_s=5.0)
# Good code → all generated assertions pass.
good_code = "def fib(n):\n if n <= 0: return 0\n if n == 1: return 1\n a,b=0,1\n for _ in range(2,n+1): a,b=b,a+b\n return b"
result = critic.evaluate(good_code, purpose="fibonacci")
check("T-FP3 Generates assertions", len(result.generated_assertions) > 0,
      f"got {len(result.generated_assertions)}")
check("T-FP4 Score is computed (not hallucinated)", isinstance(result.score, float))
check("T-FP5 Good code scores high", result.score >= 6.0, f"score={result.score}")
# The score must be exactly passed/total*10 — i.e. COMPUTED, not LLM-asserted.
check("T-FP5 Score = passed/total*10",
      abs(result.score - (result.assertions_passed / max(result.assertions_total, 1) * 10)) < 0.1)

# Bad code → generated assertions fail.
bad_code = "def fib(n): return n + 1"  # Wrong implementation
mock2 = MockLLMBackend()
mock2.register_handler("TEST ADVERSARY",
                       "assert fib(0) == 0\nassert fib(5) == 5\nassert fib(10) == 55")
critic2 = FalsificationCritic(llm=mock2, timeout_s=5.0)
result2 = critic2.evaluate(bad_code)
check("T-FP5 Bad code scores low", result2.score < 5.0, f"score={result2.score}")
check("T-FP5 Bad code is falsified", result2.is_falsified)

# Empty code → score 0.
result3 = critic.evaluate("")
check("T-FP4 No code → 0", result3.score == 0.0)
# ─── PEP 578 Sandbox Hooks ───
print("\n─── PEP 578 Sandbox (Kernel-Level) ───")
from purpose_agent.sandbox_hooks import SandboxPolicy, _path_allowed, is_sandbox_installed

# NOTE: we do NOT install the audit hook in tests — PEP 578 hooks cannot be
# removed for the life of the process and would affect the test runner.
# Instead we test the LOGIC of the policy.

# T-FP6: Policy creation.
policy = SandboxPolicy(
    allowed_paths=["/app/workspace", "/tmp"],
    blocked_paths=["/etc", "/proc"],
    block_network=True,
    block_subprocess=True,
)
check("T-FP6 Policy creates", policy is not None)
check("T-FP6 Network blocked", policy.block_network)
check("T-FP6 Subprocess blocked", policy.block_subprocess)
check("T-FP6 Has blocked modules", "ctypes" in policy.blocked_modules)

# T-FP7: Path logic, exercised by monkey-patching the module's global policy
# rather than installing the hook. try/finally guarantees the original
# policy is restored even if a check raises.
import purpose_agent.sandbox_hooks as sh
old_policy = sh._policy
sh._policy = policy
try:
    check("T-FP7 /tmp allowed", _path_allowed("/tmp/test.py"))
    check("T-FP7 /app/workspace allowed", _path_allowed("/app/workspace/code.py"))
    check("T-FP7 /etc blocked", not _path_allowed("/etc/passwd"))
    check("T-FP7 /proc blocked", not _path_allowed("/proc/self/environ"))
finally:
    sh._policy = old_policy
# Verify the hook really is not installed in this test process.
check("T-FP7 Not installed in tests", not is_sandbox_installed())
# ─── REPORT ───
# Summarize counters and exit nonzero on any failure so CI can gate on it.
print(f"\n{'='*50}")
print(f" First-Principles Tests: {PASS} pass, {FAIL} fail")
print(f" {'ALL PASS ✓' if FAIL == 0 else f'{FAIL} FAILURES'}")
print(f"{'='*50}")
sys.exit(0 if FAIL == 0 else 1)
|