| """ |
falsification_critic.py - Reward by Falsification (Karl Popper's method).
| |
| PROBLEM: SLMs cannot logically verify if complex code is correct. |
| Asking a 1.7B model "rate this code 0-10" guarantees hallucinations. |
The model doesn't KNOW if the code works - it GUESSES.
| |
| SOLUTION: Invert the Critic's job using falsifiability. |
| |
| Instead of: "Score this code" (requires understanding) |
| We do: "Generate 3 assertions designed to BREAK this code" (requires creativity) |
| |
| Then we RUN those assertions deterministically on the CPU. |
| Score = assertions_passed / total_assertions * 10 |
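Example: 2 of 3 assertions survive -> score = 2/3 * 10 = 6.7 (one decimal place).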
| |
| This is mathematically rigorous: |
| - Score is computed by EXECUTION, not LLM judgment |
| - Zero hallucinations in the score itself |
- The LLM's job is EASY (generate test cases), not HARD (verify correctness)
| - Even a 0.6B model can generate edge cases (what about input=0? negative? empty?) |
| |
| Theoretical basis: |
| Karl Popper: "A theory is scientific if and only if it is falsifiable." |
| We don't ask "is this code correct?" (unfalsifiable for SLMs) |
| We ask "CAN I BREAK this code?" (falsifiable by execution) |
| """ |
| from __future__ import annotations |
|
|
import logging
import os
import subprocess
import sys
import tempfile
from dataclasses import dataclass, field
|
|
| from purpose_agent.llm_backend import LLMBackend, ChatMessage |
| from purpose_agent.robust_parser import extract_code |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
| FALSIFICATION_PROMPT = """\ |
| You are a TEST ADVERSARY. Your job is to BREAK the code below. |
| |
| Generate exactly 3 Python assert statements that test edge cases and boundary conditions. |
| Each assert should be designed to catch a common bug. |
| |
| Think about: |
| - What happens with 0? Empty input? None? Negative numbers? |
| - What about very large inputs? Single element? Duplicate values? |
| - What about the exact boundary between cases? |
| |
| CODE TO BREAK: |
| ```python |
| {code} |
| ``` |
| |
| Respond with ONLY 3 assert statements, one per line: |
| assert ... |
| assert ... |
| assert ... |
| """ |
|
|
|
|
| @dataclass |
| class FalsificationResult: |
| """ |
| Result of falsification-based scoring. |
| |
| The score is COMPUTED, not LLM-generated. Zero hallucinations. |
| """ |
| score: float |
| assertions_total: int |
| assertions_passed: int |
| assertions_failed: int |
| failed_details: list[str] = field(default_factory=list) |
| generated_assertions: list[str] = field(default_factory=list) |
| execution_error: str | None = None |
|
|
| @property |
| def pass_rate(self) -> float: |
| if self.assertions_total == 0: |
| return 0.0 |
| return self.assertions_passed / self.assertions_total |
|
|
| @property |
| def is_falsified(self) -> bool: |
| """Was the code broken by at least one assertion?""" |
| return self.assertions_failed > 0 |
|
|
|
|
| class FalsificationCritic: |
| """ |
| Popperian Critic: scores code by trying to BREAK it. |
| |
| The LLM generates adversarial assertions. |
| The CPU executes them deterministically. |
| The score is pure math: passed / total * 10. |
| |
| Usage: |
| critic = FalsificationCritic(llm=backend) |
| result = critic.evaluate(code="def fib(n): ...") |
| print(f"Score: {result.score}/10 ({result.assertions_passed}/{result.assertions_total} survived)") |
| """ |
|
|
| def __init__(self, llm: LLMBackend, num_assertions: int = 3, timeout_s: float = 5.0): |
| self.llm = llm |
| self.num_assertions = num_assertions |
| self.timeout_s = timeout_s |
|
|
| def evaluate(self, code: str, purpose: str = "") -> FalsificationResult: |
| """ |
| Evaluate code by attempting to falsify it. |
| |
| Steps: |
        1. LLM generates adversarial assertions (easy task - even SLMs can do this)
        2. CPU executes code + assertions in sandboxed subprocess
        3. Score = assertions_passed / total * 10 (deterministic, no hallucination)
| """ |
| if not code or "def " not in code: |
| return FalsificationResult(score=0.0, assertions_total=0, assertions_passed=0, |
| assertions_failed=0, execution_error="No valid code provided") |
|
|
| |
        # Step 1: ask the LLM for adversarial assertions (the creative part).
        assertions = self._generate_assertions(code, purpose)
| if not assertions: |
| return FalsificationResult(score=5.0, assertions_total=0, assertions_passed=0, |
| assertions_failed=0, execution_error="LLM failed to generate assertions") |
|
|
| |
        # Step 2: execute each assertion deterministically in a subprocess.
        passed, failed, details = self._execute_assertions(code, assertions)
|
|
| |
        # Step 3: the score is pure arithmetic over execution results.
        total = len(assertions)
| score = (passed / total * 10.0) if total > 0 else 0.0 |
|
|
| return FalsificationResult( |
| score=round(score, 1), |
| assertions_total=total, |
| assertions_passed=passed, |
| assertions_failed=failed, |
| failed_details=details, |
| generated_assertions=assertions, |
| ) |
|
|
| def _generate_assertions(self, code: str, purpose: str = "") -> list[str]: |
| """ |
| Ask the LLM to generate adversarial test assertions. |
| |
        This is an EASY task for SLMs - generating edge cases requires
| creativity, not deep logical reasoning about correctness. |
| """ |
| prompt = FALSIFICATION_PROMPT.format(code=code[:1000]) |
| if purpose: |
| prompt = f"CONTEXT: {purpose}\n\n" + prompt |
|
|
| try: |
| raw = self.llm.generate( |
| [ChatMessage(role="user", content=prompt)], |
| temperature=0.7, |
| max_tokens=500, |
| ) |
| except Exception as e: |
| logger.warning(f"FalsificationCritic: LLM call failed: {e}") |
| return [] |
|
|
| |
        # Keep only lines that are literal assert statements.
        assertions = []
| for line in raw.split("\n"): |
| line = line.strip() |
| if line.startswith("assert "): |
| assertions.append(line) |
|
|
| |
        # Fallback: the model may have wrapped the asserts in a code fence.
        if not assertions:
| code_block = extract_code(raw) |
| if code_block: |
| for line in code_block.split("\n"): |
| line = line.strip() |
| if line.startswith("assert "): |
| assertions.append(line) |
|
|
| return assertions[:self.num_assertions] |
|
|
| def _execute_assertions(self, code: str, assertions: list[str]) -> tuple[int, int, list[str]]: |
| """ |
| Execute code + assertions in a sandboxed subprocess. |
| |
| Returns: (passed_count, failed_count, failure_details) |
| """ |
| passed = 0 |
| failed = 0 |
| details = [] |
|
|
        for assertion in assertions:
            # Wrap the code plus a single assertion in a small script that
            # prints a PASS / FAIL / ERROR verdict on stdout.
            test_script = (
                f"{code}\n\n"
                "# Test assertion\n"
                "try:\n"
                f"    {assertion}\n"
                "    print('PASS')\n"
                "except AssertionError as e:\n"
                "    print(f'FAIL: {e}')\n"
                "except Exception as e:\n"
                "    print(f'ERROR: {type(e).__name__}: {e}')\n"
            )
|
|
| |
            script_path = None
            try:
| with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, dir=tempfile.gettempdir()) as f: |
| f.write(test_script) |
| script_path = f.name |
|
|
| result = subprocess.run( |
| [sys.executable, script_path], |
| capture_output=True, text=True, |
| timeout=self.timeout_s, |
| cwd=tempfile.gettempdir(), |
| ) |
|
|
                # The verdict is the last non-empty stdout line, so stray prints
                # from the code under test do not mask the result.
                out_lines = [ln for ln in result.stdout.strip().splitlines() if ln.strip()]
                verdict = out_lines[-1] if out_lines else ""
                if verdict == "PASS":
                    passed += 1
                elif verdict.startswith(("FAIL", "ERROR")):
                    failed += 1
                    details.append(f"{assertion} -> {verdict}")
                else:
                    # No recognizable verdict: the script crashed or printed nothing.
                    failed += 1
                    stderr = result.stderr.strip()[:100]
                    details.append(f"{assertion} -> unexpected: {stderr or verdict}")
|
|
| except subprocess.TimeoutExpired: |
| failed += 1 |
| details.append(f"{assertion} β TIMEOUT ({self.timeout_s}s)") |
| except Exception as e: |
| failed += 1 |
| details.append(f"{assertion} β EXEC_ERROR: {e}") |
            finally:
                # Best-effort cleanup of the temporary script.
                if script_path:
                    try:
                        os.unlink(script_path)
                    except OSError:
                        pass
|
|
| return passed, failed, details |
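

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the library API: it wires the
    # critic to a stub backend that returns canned assertions, so the
    # deterministic scoring path can be exercised without a real model.
    # In real use, pass a concrete LLMBackend implementation instead.
    class _StubBackend:
        def generate(self, messages, temperature=0.7, max_tokens=500):
            # Canned response mimicking what a small model might return.
            return (
                "assert add(2, 3) == 5\n"
                "assert add(0, 0) == 0\n"
                "assert add(-1, 1) == 0"
            )

    sample_code = "def add(a, b):\n    return a + b\n"
    critic = FalsificationCritic(llm=_StubBackend())  # type: ignore[arg-type]
    result = critic.evaluate(sample_code, purpose="add two numbers")
    print(f"Score: {result.score}/10 "
          f"({result.assertions_passed}/{result.assertions_total} assertions survived)")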
|
|