"""
falsification_critic.py: Reward by Falsification (Karl Popper's method).
PROBLEM: SLMs cannot logically verify if complex code is correct.
Asking a 1.7B model "rate this code 0-10" guarantees hallucinations.
The model doesn't KNOW if the code works; it GUESSES.
SOLUTION: Invert the Critic's job using falsifiability.
Instead of: "Score this code" (requires understanding)
We do: "Generate 3 assertions designed to BREAK this code" (requires creativity)
Then we RUN those assertions deterministically on the CPU.
Score = assertions_passed / total_assertions * 10
This is mathematically rigorous:
- Score is computed by EXECUTION, not LLM judgment
- Zero hallucinations in the score itself
- The LLM's job is EASY (generate test cases), not HARD (verify correctness)
- Even a 0.6B model can generate edge cases (what about input=0? negative? empty?)
Theoretical basis:
Karl Popper: "A theory is scientific if and only if it is falsifiable."
We don't ask "is this code correct?" (unfalsifiable for SLMs)
We ask "CAN I BREAK this code?" (falsifiable by execution)
"""
from __future__ import annotations
import logging
import re
import subprocess
import sys
import tempfile
import os
from dataclasses import dataclass, field
from typing import Any
from purpose_agent.llm_backend import LLMBackend, ChatMessage
from purpose_agent.robust_parser import extract_code
logger = logging.getLogger(__name__)
FALSIFICATION_PROMPT = """\
You are a TEST ADVERSARY. Your job is to BREAK the code below.
Generate exactly {num_assertions} Python assert statements that test edge cases and boundary conditions.
Each assert should be designed to catch a common bug.
Think about:
- What happens with 0? Empty input? None? Negative numbers?
- What about very large inputs? Single element? Duplicate values?
- What about the exact boundary between cases?
CODE TO BREAK:
```python
{code}
```
Respond with ONLY {num_assertions} assert statements, one per line:
assert ...
assert ...
assert ...
"""
@dataclass
class FalsificationResult:
"""
Result of falsification-based scoring.
The score is COMPUTED, not LLM-generated. Zero hallucinations.
"""
score: float # 0.0-10.0, computed as (passed/total * 10)
assertions_total: int
assertions_passed: int
assertions_failed: int
failed_details: list[str] = field(default_factory=list) # Which assertions failed and why
generated_assertions: list[str] = field(default_factory=list)
execution_error: str | None = None
@property
def pass_rate(self) -> float:
if self.assertions_total == 0:
return 0.0
return self.assertions_passed / self.assertions_total
@property
def is_falsified(self) -> bool:
"""Was the code broken by at least one assertion?"""
return self.assertions_failed > 0
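# Worked example (illustrative): 3 generated assertions, 2 survive execution ->
#   FalsificationResult(score=6.7, assertions_total=3, assertions_passed=2,
#                       assertions_failed=1, ...)
#   pass_rate ~= 0.667, is_falsified == True (one assertion broke the code).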
class FalsificationCritic:
"""
Popperian Critic: scores code by trying to BREAK it.
The LLM generates adversarial assertions.
The CPU executes them deterministically.
The score is pure math: passed / total * 10.
Usage:
critic = FalsificationCritic(llm=backend)
result = critic.evaluate(code="def fib(n): ...")
print(f"Score: {result.score}/10 ({result.assertions_passed}/{result.assertions_total} survived)")
"""
def __init__(self, llm: LLMBackend, num_assertions: int = 3, timeout_s: float = 5.0):
self.llm = llm
self.num_assertions = num_assertions
self.timeout_s = timeout_s
def evaluate(self, code: str, purpose: str = "") -> FalsificationResult:
"""
Evaluate code by attempting to falsify it.
Steps:
        1. LLM generates adversarial assertions (an easy task; even SLMs can do this)
2. CPU executes code + assertions in sandboxed subprocess
3. Score = assertions_passed / total * 10 (deterministic, no hallucination)
"""
if not code or "def " not in code:
return FalsificationResult(score=0.0, assertions_total=0, assertions_passed=0,
assertions_failed=0, execution_error="No valid code provided")
# Step 1: Generate adversarial assertions via LLM
assertions = self._generate_assertions(code, purpose)
if not assertions:
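            # Midpoint fallback: with no assertions, falsification was never
            # attempted, so the code is neither rewarded nor punished.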
return FalsificationResult(score=5.0, assertions_total=0, assertions_passed=0,
assertions_failed=0, execution_error="LLM failed to generate assertions")
# Step 2: Execute deterministically
passed, failed, details = self._execute_assertions(code, assertions)
        # Step 3: Compute score (pure math; zero hallucination)
total = len(assertions)
score = (passed / total * 10.0) if total > 0 else 0.0
return FalsificationResult(
score=round(score, 1),
assertions_total=total,
assertions_passed=passed,
assertions_failed=failed,
failed_details=details,
generated_assertions=assertions,
)
def _generate_assertions(self, code: str, purpose: str = "") -> list[str]:
"""
Ask the LLM to generate adversarial test assertions.
        This is an EASY task for SLMs; generating edge cases requires
creativity, not deep logical reasoning about correctness.
"""
        prompt = FALSIFICATION_PROMPT.format(
            code=code[:1000],  # truncate so the prompt stays small for SLM context windows
            num_assertions=self.num_assertions,
        )
if purpose:
prompt = f"CONTEXT: {purpose}\n\n" + prompt
try:
raw = self.llm.generate(
[ChatMessage(role="user", content=prompt)],
temperature=0.7, # Some creativity for edge cases
max_tokens=500,
)
except Exception as e:
logger.warning(f"FalsificationCritic: LLM call failed: {e}")
return []
# Extract assert statements
assertions = []
for line in raw.split("\n"):
line = line.strip()
if line.startswith("assert "):
assertions.append(line)
# If no explicit asserts found, try to extract from code blocks
if not assertions:
code_block = extract_code(raw)
if code_block:
for line in code_block.split("\n"):
line = line.strip()
if line.startswith("assert "):
assertions.append(line)
return assertions[:self.num_assertions]
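    # Illustrative only (hypothetical model reply): a chatty response such as
    #   "Here are my tests:" followed by a ```python fenced block containing
    #   the assert lines has no top-level "assert " lines, so the extract_code()
    #   fallback above recovers the assertions from the fenced block instead.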
def _execute_assertions(self, code: str, assertions: list[str]) -> tuple[int, int, list[str]]:
"""
Execute code + assertions in a sandboxed subprocess.
Returns: (passed_count, failed_count, failure_details)
"""
passed = 0
failed = 0
details = []
for assertion in assertions:
# Build test script
            test_script = (
                f"{code}\n\n"
                "# Test assertion\n"
                "try:\n"
                f"    {assertion}\n"
                "    print('PASS')\n"
                "except AssertionError as e:\n"
                "    print(f'FAIL: {e}')\n"
                "except Exception as e:\n"
                "    print(f'ERROR: {type(e).__name__}: {e}')\n"
            )
# Execute in subprocess with timeout
            script_path = None
            try:
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, dir=tempfile.gettempdir()) as f:
f.write(test_script)
script_path = f.name
result = subprocess.run(
[sys.executable, script_path],
capture_output=True, text=True,
timeout=self.timeout_s,
cwd=tempfile.gettempdir(),
)
                # The candidate code may print on its own; the verdict emitted
                # by the wrapper is always the last line of stdout.
                out_lines = result.stdout.strip().splitlines()
                output = out_lines[-1] if out_lines else ""
if output == "PASS":
passed += 1
elif output.startswith("FAIL"):
failed += 1
                    details.append(f"{assertion} → {output}")
elif output.startswith("ERROR"):
failed += 1
                    details.append(f"{assertion} → {output}")
else:
# Unexpected output or crash
failed += 1
stderr = result.stderr.strip()[:100]
                    details.append(f"{assertion} → unexpected: {stderr or output}")
except subprocess.TimeoutExpired:
failed += 1
                details.append(f"{assertion} → TIMEOUT ({self.timeout_s}s)")
except Exception as e:
failed += 1
                details.append(f"{assertion} → EXEC_ERROR: {e}")
            finally:
                # Best-effort cleanup of the temp script (it may not exist if
                # tempfile creation itself failed).
                if script_path:
                    try:
                        os.unlink(script_path)
                    except OSError:
                        pass
return passed, failed, details
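# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (illustrative, not part of the agent pipeline).
# It swaps in a tiny stub for the LLM backend: the stub only implements the
# `generate` call used above and returns canned assert lines, so the
# deterministic scoring path (subprocess execution, pass/fail counting) can be
# exercised without a real model. The `clamp` sample and the stub are
# assumptions made for this demo only. Run with the package importable, e.g.:
#   python -m purpose_agent.falsification_critic
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    class _StubLLM:
        """Stand-in backend: ignores the prompt and emits fixed assert lines."""
        def generate(self, messages, temperature=0.0, max_tokens=0):
            return (
                "assert clamp(5, 0, 10) == 5\n"
                "assert clamp(-3, 0, 10) == 0\n"
                "assert clamp(99, 0, 10) == 10\n"
            )
    sample_code = (
        "def clamp(x, lo, hi):\n"
        "    return max(lo, min(x, hi))\n"
    )
    critic = FalsificationCritic(llm=_StubLLM())
    result = critic.evaluate(sample_code, purpose="clamp x into the range [lo, hi]")
    print(f"Score: {result.score}/10 "
          f"({result.assertions_passed}/{result.assertions_total} survived)")
    for detail in result.failed_details:
        print("  failed:", detail)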