"""
falsification_critic.py: Reward by Falsification (Karl Popper's method).
PROBLEM: SLMs cannot logically verify if complex code is correct.
Asking a 1.7B model "rate this code 0-10" guarantees hallucinations.
The model doesn't KNOW if the code works; it GUESSES.
SOLUTION: Invert the Critic's job using falsifiability.
Instead of: "Score this code" (requires understanding)
We do: "Generate 3 assertions designed to BREAK this code" (requires creativity)
Then we RUN those assertions deterministically on the CPU.
Score = assertions_passed / total_assertions * 10
This is mathematically rigorous:
- Score is computed by EXECUTION, not LLM judgment
- Zero hallucinations in the score itself
- The LLM's job is EASY (generate test cases) not HARD (verify correctness)
- Even a 0.6B model can generate edge cases (what about input=0? negative? empty?)
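Worked example: if 2 of 3 adversarial assertions pass, score = round(2/3 * 10, 1) = 6.7.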
Theoretical basis:
Karl Popper: "A theory is scientific if and only if it is falsifiable."
We don't ask "is this code correct?" (unfalsifiable for SLMs)
We ask "CAN I BREAK this code?" (falsifiable by execution)
"""
from __future__ import annotations
import logging
import re
import subprocess
import sys
import tempfile
import os
from dataclasses import dataclass, field
from typing import Any
from purpose_agent.llm_backend import LLMBackend, ChatMessage
from purpose_agent.robust_parser import extract_code
logger = logging.getLogger(__name__)
FALSIFICATION_PROMPT = """\
You are a TEST ADVERSARY. Your job is to BREAK the code below.
Generate exactly 3 Python assert statements that test edge cases and boundary conditions.
Each assert should be designed to catch a common bug.
Think about:
- What happens with 0? Empty input? None? Negative numbers?
- What about very large inputs? Single element? Duplicate values?
- What about the exact boundary between cases?
CODE TO BREAK:
```python
{code}
```
Respond with ONLY 3 assert statements, one per line:
assert ...
assert ...
assert ...
"""
@dataclass
class FalsificationResult:
"""
Result of falsification-based scoring.
The score is COMPUTED, not LLM-generated. Zero hallucinations.
"""
score: float # 0.0-10.0, computed as (passed/total * 10)
assertions_total: int
assertions_passed: int
assertions_failed: int
failed_details: list[str] = field(default_factory=list) # Which assertions failed and why
generated_assertions: list[str] = field(default_factory=list)
execution_error: str | None = None
@property
def pass_rate(self) -> float:
if self.assertions_total == 0:
return 0.0
return self.assertions_passed / self.assertions_total
@property
def is_falsified(self) -> bool:
"""Was the code broken by at least one assertion?"""
return self.assertions_failed > 0
class FalsificationCritic:
"""
Popperian Critic: scores code by trying to BREAK it.
The LLM generates adversarial assertions.
The CPU executes them deterministically.
The score is pure math: passed / total * 10.
Usage:
critic = FalsificationCritic(llm=backend)
result = critic.evaluate(code="def fib(n): ...")
print(f"Score: {result.score}/10 ({result.assertions_passed}/{result.assertions_total} survived)")
"""
def __init__(self, llm: LLMBackend, num_assertions: int = 3, timeout_s: float = 5.0):
self.llm = llm
self.num_assertions = num_assertions
self.timeout_s = timeout_s
def evaluate(self, code: str, purpose: str = "") -> FalsificationResult:
"""
Evaluate code by attempting to falsify it.
Steps:
        1. LLM generates adversarial assertions (easy task; even SLMs can do this)
2. CPU executes code + assertions in sandboxed subprocess
3. Score = assertions_passed / total * 10 (deterministic, no hallucination)
"""
if not code or "def " not in code:
return FalsificationResult(score=0.0, assertions_total=0, assertions_passed=0,
assertions_failed=0, execution_error="No valid code provided")
# Step 1: Generate adversarial assertions via LLM
assertions = self._generate_assertions(code, purpose)
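        # Fall back to a mid-scale score when no assertions could be generated at all.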
if not assertions:
return FalsificationResult(score=5.0, assertions_total=0, assertions_passed=0,
assertions_failed=0, execution_error="LLM failed to generate assertions")
# Step 2: Execute deterministically
passed, failed, details = self._execute_assertions(code, assertions)
        # Step 3: Compute score (pure math; zero hallucination)
total = len(assertions)
score = (passed / total * 10.0) if total > 0 else 0.0
return FalsificationResult(
score=round(score, 1),
assertions_total=total,
assertions_passed=passed,
assertions_failed=failed,
failed_details=details,
generated_assertions=assertions,
)
def _generate_assertions(self, code: str, purpose: str = "") -> list[str]:
"""
Ask the LLM to generate adversarial test assertions.
        This is an EASY task for SLMs: generating edge cases requires
creativity, not deep logical reasoning about correctness.
"""
prompt = FALSIFICATION_PROMPT.format(code=code[:1000])
if purpose:
prompt = f"CONTEXT: {purpose}\n\n" + prompt
try:
raw = self.llm.generate(
[ChatMessage(role="user", content=prompt)],
temperature=0.7, # Some creativity for edge cases
max_tokens=500,
)
except Exception as e:
logger.warning(f"FalsificationCritic: LLM call failed: {e}")
return []
# Extract assert statements
assertions = []
for line in raw.split("\n"):
line = line.strip()
if line.startswith("assert "):
assertions.append(line)
# If no explicit asserts found, try to extract from code blocks
if not assertions:
code_block = extract_code(raw)
if code_block:
for line in code_block.split("\n"):
line = line.strip()
if line.startswith("assert "):
assertions.append(line)
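        # Cap at num_assertions; the prompt asks for exactly 3, but the model may emit more.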
return assertions[:self.num_assertions]
def _execute_assertions(self, code: str, assertions: list[str]) -> tuple[int, int, list[str]]:
"""
Execute code + assertions in a sandboxed subprocess.
Returns: (passed_count, failed_count, failure_details)
"""
passed = 0
failed = 0
details = []
for assertion in assertions:
            # Build the test script: run the candidate code, then the assertion,
            # and print a single PASS / FAIL / ERROR line for the parent to parse.
            test_script = (
                f"{code}\n\n"
                "# Test assertion\n"
                "try:\n"
                f"    {assertion}\n"
                "    print('PASS')\n"
                "except AssertionError as e:\n"
                "    print(f'FAIL: {e}')\n"
                "except Exception as e:\n"
                "    print(f'ERROR: {type(e).__name__}: {e}')\n"
            )
            # Execute in subprocess with timeout
            script_path = ""
try:
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, dir=tempfile.gettempdir()) as f:
f.write(test_script)
script_path = f.name
result = subprocess.run(
[sys.executable, script_path],
capture_output=True, text=True,
timeout=self.timeout_s,
cwd=tempfile.gettempdir(),
)
output = result.stdout.strip()
if output == "PASS":
passed += 1
elif output.startswith("FAIL"):
failed += 1
details.append(f"{assertion} β {output}")
elif output.startswith("ERROR"):
failed += 1
details.append(f"{assertion} β {output}")
else:
# Unexpected output or crash
failed += 1
stderr = result.stderr.strip()[:100]
details.append(f"{assertion} β unexpected: {stderr or output}")
except subprocess.TimeoutExpired:
failed += 1
details.append(f"{assertion} β TIMEOUT ({self.timeout_s}s)")
except Exception as e:
failed += 1
details.append(f"{assertion} β EXEC_ERROR: {e}")
            finally:
                # Best-effort cleanup of the temporary script file.
                if script_path:
                    try:
                        os.unlink(script_path)
                    except OSError:
                        pass
return passed, failed, details
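

if __name__ == "__main__":
    # Minimal smoke-test sketch. Assumption: evaluate() only ever calls llm.generate(),
    # so any object exposing a compatible generate() method can stand in for a real
    # LLMBackend here; use an actual backend in real runs.
    class _CannedBackend:
        """Hypothetical stand-in that returns three fixed adversarial assertions."""

        def generate(self, messages, temperature=0.0, max_tokens=0):
            return "assert fib(0) == 0\nassert fib(1) == 1\nassert fib(10) == 55"

    sample_code = (
        "def fib(n):\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n):\n"
        "        a, b = b, a + b\n"
        "    return a\n"
    )
    critic = FalsificationCritic(llm=_CannedBackend())  # type: ignore[arg-type]
    result = critic.evaluate(sample_code, purpose="Compute the n-th Fibonacci number")
    print(f"Score: {result.score}/10 "
          f"({result.assertions_passed}/{result.assertions_total} assertions survived)")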