"""
falsification_critic.py: Reward by Falsification (Karl Popper's method).
PROBLEM: SLMs cannot logically verify if complex code is correct.
Asking a 1.7B model "rate this code 0-10" guarantees hallucinations.
The model doesn't KNOW if the code works; it GUESSES.
SOLUTION: Invert the Critic's job using falsifiability.
Instead of: "Score this code" (requires understanding)
We do: "Generate 3 assertions designed to BREAK this code" (requires creativity)
Then we RUN those assertions deterministically on the CPU.
Score = assertions_passed / total_assertions * 10
This is mathematically rigorous:
- Score is computed by EXECUTION, not LLM judgment
- Zero hallucinations in the score itself
- The LLM's job is EASY (generate test cases), not HARD (verify correctness)
- Even a 0.6B model can generate edge cases (what about input=0? negative? empty?)
Theoretical basis:
Karl Popper: "A theory is scientific if and only if it is falsifiable."
We don't ask "is this code correct?" (unfalsifiable for SLMs)
We ask "CAN I BREAK this code?" (falsifiable by execution)
"""
from __future__ import annotations
import logging
import re
import subprocess
import sys
import tempfile
import os
from dataclasses import dataclass, field
from typing import Any
from purpose_agent.llm_backend import LLMBackend, ChatMessage
from purpose_agent.robust_parser import extract_code
logger = logging.getLogger(__name__)
FALSIFICATION_PROMPT = """\
You are a TEST ADVERSARY. Your job is to BREAK the code below.
Generate exactly {num_assertions} Python assert statements that test edge cases and boundary conditions.
Each assert should be designed to catch a common bug.
Think about:
- What happens with 0? Empty input? None? Negative numbers?
- What about very large inputs? Single element? Duplicate values?
- What about the exact boundary between cases?
CODE TO BREAK:
```python
{code}
```
Respond with ONLY {num_assertions} assert statements, one per line:
assert ...
assert ...
assert ...
"""
@dataclass
class FalsificationResult:
"""
Result of falsification-based scoring.
The score is COMPUTED, not LLM-generated. Zero hallucinations.
"""
score: float # 0.0-10.0, computed as (passed/total * 10)
assertions_total: int
assertions_passed: int
assertions_failed: int
failed_details: list[str] = field(default_factory=list) # Which assertions failed and why
generated_assertions: list[str] = field(default_factory=list)
execution_error: str | None = None
@property
def pass_rate(self) -> float:
if self.assertions_total == 0:
return 0.0
return self.assertions_passed / self.assertions_total
@property
def is_falsified(self) -> bool:
"""Was the code broken by at least one assertion?"""
return self.assertions_failed > 0
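# Worked example (illustrative): 3 generated assertions, 2 survive execution ->
#   FalsificationResult(score=6.7, assertions_total=3, assertions_passed=2,
#                       assertions_failed=1, ...)
#   pass_rate ~= 0.667, is_falsified == True (one assertion broke the code).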
class FalsificationCritic:
"""
Popperian Critic: scores code by trying to BREAK it.
The LLM generates adversarial assertions.
The CPU executes them deterministically.
The score is pure math: passed / total * 10.
Usage:
critic = FalsificationCritic(llm=backend)
result = critic.evaluate(code="def fib(n): ...")
print(f"Score: {result.score}/10 ({result.assertions_passed}/{result.assertions_total} survived)")
"""
def __init__(self, llm: LLMBackend, num_assertions: int = 3, timeout_s: float = 5.0):
self.llm = llm
self.num_assertions = num_assertions
self.timeout_s = timeout_s
def evaluate(self, code: str, purpose: str = "") -> FalsificationResult:
"""
Evaluate code by attempting to falsify it.
Steps:
        1. LLM generates adversarial assertions (an easy task; even SLMs can do this)
2. CPU executes code + assertions in sandboxed subprocess
3. Score = assertions_passed / total * 10 (deterministic, no hallucination)
"""
if not code or "def " not in code:
return FalsificationResult(score=0.0, assertions_total=0, assertions_passed=0,
assertions_failed=0, execution_error="No valid code provided")
# Step 1: Generate adversarial assertions via LLM
assertions = self._generate_assertions(code, purpose)
if not assertions:
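            # Midpoint fallback: with no assertions, falsification was never
            # attempted, so the code is neither rewarded nor punished.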
return FalsificationResult(score=5.0, assertions_total=0, assertions_passed=0,
assertions_failed=0, execution_error="LLM failed to generate assertions")
# Step 2: Execute deterministically
passed, failed, details = self._execute_assertions(code, assertions)
        # Step 3: Compute score (pure math; zero hallucination)
total = len(assertions)
score = (passed / total * 10.0) if total > 0 else 0.0
return FalsificationResult(
score=round(score, 1),
assertions_total=total,
assertions_passed=passed,
assertions_failed=failed,
failed_details=details,
generated_assertions=assertions,
)
def _generate_assertions(self, code: str, purpose: str = "") -> list[str]:
"""
Ask the LLM to generate adversarial test assertions.
        This is an EASY task for SLMs; generating edge cases requires
creativity, not deep logical reasoning about correctness.
"""
        prompt = FALSIFICATION_PROMPT.format(
            code=code[:1000],  # truncate so the prompt stays small for SLM context windows
            num_assertions=self.num_assertions,
        )
if purpose:
prompt = f"CONTEXT: {purpose}\n\n" + prompt
try:
raw = self.llm.generate(
[ChatMessage(role="user", content=prompt)],
temperature=0.7, # Some creativity for edge cases
max_tokens=500,
)
except Exception as e:
logger.warning(f"FalsificationCritic: LLM call failed: {e}")
return []
# Extract assert statements
assertions = []
for line in raw.split("\n"):
line = line.strip()
if line.startswith("assert "):
assertions.append(line)
# If no explicit asserts found, try to extract from code blocks
if not assertions:
code_block = extract_code(raw)
if code_block:
for line in code_block.split("\n"):
line = line.strip()
if line.startswith("assert "):
assertions.append(line)
return assertions[:self.num_assertions]
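    # Illustrative only (hypothetical model reply): a chatty response such as
    #   "Here are my tests:" followed by a ```python fenced block containing
    #   the assert lines has no top-level "assert " lines, so the extract_code()
    #   fallback above recovers the assertions from the fenced block instead.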
def _execute_assertions(self, code: str, assertions: list[str]) -> tuple[int, int, list[str]]:
"""
Execute code + assertions in a sandboxed subprocess.
Returns: (passed_count, failed_count, failure_details)
"""
passed = 0
failed = 0
details = []
for assertion in assertions:
# Build test script
            test_script = (
                f"{code}\n\n"
                "# Test assertion\n"
                "try:\n"
                f"    {assertion}\n"
                "    print('PASS')\n"
                "except AssertionError as e:\n"
                "    print(f'FAIL: {e}')\n"
                "except Exception as e:\n"
                "    print(f'ERROR: {type(e).__name__}: {e}')\n"
            )
# Execute in subprocess with timeout
            script_path = None
            try:
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, dir=tempfile.gettempdir()) as f:
f.write(test_script)
script_path = f.name
result = subprocess.run(
[sys.executable, script_path],
capture_output=True, text=True,
timeout=self.timeout_s,
cwd=tempfile.gettempdir(),
)
                # The candidate code may print on its own; the verdict emitted
                # by the wrapper is always the last line of stdout.
                out_lines = result.stdout.strip().splitlines()
                output = out_lines[-1] if out_lines else ""
if output == "PASS":
passed += 1
elif output.startswith("FAIL"):
failed += 1
                    details.append(f"{assertion} → {output}")
elif output.startswith("ERROR"):
failed += 1
                    details.append(f"{assertion} → {output}")
else:
# Unexpected output or crash
failed += 1
stderr = result.stderr.strip()[:100]
                    details.append(f"{assertion} → unexpected: {stderr or output}")
except subprocess.TimeoutExpired:
failed += 1
                details.append(f"{assertion} → TIMEOUT ({self.timeout_s}s)")
except Exception as e:
failed += 1
                details.append(f"{assertion} → EXEC_ERROR: {e}")
            finally:
                # Best-effort cleanup of the temp script (it may not exist if
                # tempfile creation itself failed).
                if script_path:
                    try:
                        os.unlink(script_path)
                    except OSError:
                        pass
return passed, failed, details
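# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (illustrative, not part of the agent pipeline).
# It swaps in a tiny stub for the LLM backend: the stub only implements the
# `generate` call used above and returns canned assert lines, so the
# deterministic scoring path (subprocess execution, pass/fail counting) can be
# exercised without a real model. The `clamp` sample and the stub are
# assumptions made for this demo only. Run with the package importable, e.g.:
#   python -m purpose_agent.falsification_critic
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    class _StubLLM:
        """Stand-in backend: ignores the prompt and emits fixed assert lines."""
        def generate(self, messages, temperature=0.0, max_tokens=0):
            return (
                "assert clamp(5, 0, 10) == 5\n"
                "assert clamp(-3, 0, 10) == 0\n"
                "assert clamp(99, 0, 10) == 10\n"
            )
    sample_code = (
        "def clamp(x, lo, hi):\n"
        "    return max(lo, min(x, hi))\n"
    )
    critic = FalsificationCritic(llm=_StubLLM())
    result = critic.evaluate(sample_code, purpose="clamp x into the range [lo, hi]")
    print(f"Score: {result.score}/10 "
          f"({result.assertions_passed}/{result.assertions_total} survived)")
    for detail in result.failed_details:
        print("  failed:", detail)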