""" falsification_critic.py — Reward by Falsification (Karl Popper's method). PROBLEM: SLMs cannot logically verify if complex code is correct. Asking a 1.7B model "rate this code 0-10" guarantees hallucinations. The model doesn't KNOW if the code works — it GUESSES. SOLUTION: Invert the Critic's job using falsifiability. Instead of: "Score this code" (requires understanding) We do: "Generate 3 assertions designed to BREAK this code" (requires creativity) Then we RUN those assertions deterministically on the CPU. Score = assertions_passed / total_assertions * 10 This is mathematically rigorous: - Score is computed by EXECUTION, not LLM judgment - Zero hallucinations in the score itself - The LLM's job is EASY (generate test cases) not HARD (verify correctness) - Even a 0.6B model can generate edge cases (what about input=0? negative? empty?) Theoretical basis: Karl Popper: "A theory is scientific if and only if it is falsifiable." We don't ask "is this code correct?" (unfalsifiable for SLMs) We ask "CAN I BREAK this code?" (falsifiable by execution) """ from __future__ import annotations import logging import re import subprocess import sys import tempfile import os from dataclasses import dataclass, field from typing import Any from purpose_agent.llm_backend import LLMBackend, ChatMessage from purpose_agent.robust_parser import extract_code logger = logging.getLogger(__name__) FALSIFICATION_PROMPT = """\ You are a TEST ADVERSARY. Your job is to BREAK the code below. Generate exactly 3 Python assert statements that test edge cases and boundary conditions. Each assert should be designed to catch a common bug. Think about: - What happens with 0? Empty input? None? Negative numbers? - What about very large inputs? Single element? Duplicate values? - What about the exact boundary between cases? CODE TO BREAK: ```python {code} ``` Respond with ONLY 3 assert statements, one per line: assert ... assert ... assert ... """ @dataclass class FalsificationResult: """ Result of falsification-based scoring. The score is COMPUTED, not LLM-generated. Zero hallucinations. """ score: float # 0.0-10.0, computed as (passed/total * 10) assertions_total: int assertions_passed: int assertions_failed: int failed_details: list[str] = field(default_factory=list) # Which assertions failed and why generated_assertions: list[str] = field(default_factory=list) execution_error: str | None = None @property def pass_rate(self) -> float: if self.assertions_total == 0: return 0.0 return self.assertions_passed / self.assertions_total @property def is_falsified(self) -> bool: """Was the code broken by at least one assertion?""" return self.assertions_failed > 0 class FalsificationCritic: """ Popperian Critic: scores code by trying to BREAK it. The LLM generates adversarial assertions. The CPU executes them deterministically. The score is pure math: passed / total * 10. Usage: critic = FalsificationCritic(llm=backend) result = critic.evaluate(code="def fib(n): ...") print(f"Score: {result.score}/10 ({result.assertions_passed}/{result.assertions_total} survived)") """ def __init__(self, llm: LLMBackend, num_assertions: int = 3, timeout_s: float = 5.0): self.llm = llm self.num_assertions = num_assertions self.timeout_s = timeout_s def evaluate(self, code: str, purpose: str = "") -> FalsificationResult: """ Evaluate code by attempting to falsify it. Steps: 1. LLM generates adversarial assertions (easy task — even SLMs can do this) 2. CPU executes code + assertions in sandboxed subprocess 3. 
Score = assertions_passed / total * 10 (deterministic, no hallucination) """ if not code or "def " not in code: return FalsificationResult(score=0.0, assertions_total=0, assertions_passed=0, assertions_failed=0, execution_error="No valid code provided") # Step 1: Generate adversarial assertions via LLM assertions = self._generate_assertions(code, purpose) if not assertions: return FalsificationResult(score=5.0, assertions_total=0, assertions_passed=0, assertions_failed=0, execution_error="LLM failed to generate assertions") # Step 2: Execute deterministically passed, failed, details = self._execute_assertions(code, assertions) # Step 3: Compute score (pure math — zero hallucination) total = len(assertions) score = (passed / total * 10.0) if total > 0 else 0.0 return FalsificationResult( score=round(score, 1), assertions_total=total, assertions_passed=passed, assertions_failed=failed, failed_details=details, generated_assertions=assertions, ) def _generate_assertions(self, code: str, purpose: str = "") -> list[str]: """ Ask the LLM to generate adversarial test assertions. This is an EASY task for SLMs — generating edge cases requires creativity, not deep logical reasoning about correctness. """ prompt = FALSIFICATION_PROMPT.format(code=code[:1000]) if purpose: prompt = f"CONTEXT: {purpose}\n\n" + prompt try: raw = self.llm.generate( [ChatMessage(role="user", content=prompt)], temperature=0.7, # Some creativity for edge cases max_tokens=500, ) except Exception as e: logger.warning(f"FalsificationCritic: LLM call failed: {e}") return [] # Extract assert statements assertions = [] for line in raw.split("\n"): line = line.strip() if line.startswith("assert "): assertions.append(line) # If no explicit asserts found, try to extract from code blocks if not assertions: code_block = extract_code(raw) if code_block: for line in code_block.split("\n"): line = line.strip() if line.startswith("assert "): assertions.append(line) return assertions[:self.num_assertions] def _execute_assertions(self, code: str, assertions: list[str]) -> tuple[int, int, list[str]]: """ Execute code + assertions in a sandboxed subprocess. Returns: (passed_count, failed_count, failure_details) """ passed = 0 failed = 0 details = [] for assertion in assertions: # Build test script test_script = f"{code}\n\n# Test assertion\ntry:\n {assertion}\n print('PASS')\nexcept AssertionError as e:\n print(f'FAIL: {{e}}')\nexcept Exception as e:\n print(f'ERROR: {{type(e).__name__}}: {{e}}')\n" # Execute in subprocess with timeout try: with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, dir=tempfile.gettempdir()) as f: f.write(test_script) script_path = f.name result = subprocess.run( [sys.executable, script_path], capture_output=True, text=True, timeout=self.timeout_s, cwd=tempfile.gettempdir(), ) output = result.stdout.strip() if output == "PASS": passed += 1 elif output.startswith("FAIL"): failed += 1 details.append(f"{assertion} → {output}") elif output.startswith("ERROR"): failed += 1 details.append(f"{assertion} → {output}") else: # Unexpected output or crash failed += 1 stderr = result.stderr.strip()[:100] details.append(f"{assertion} → unexpected: {stderr or output}") except subprocess.TimeoutExpired: failed += 1 details.append(f"{assertion} → TIMEOUT ({self.timeout_s}s)") except Exception as e: failed += 1 details.append(f"{assertion} → EXEC_ERROR: {e}") finally: try: os.unlink(script_path) except: pass return passed, failed, details
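

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the critic itself): `_StubLLM` is a
# hypothetical stand-in for a real `LLMBackend`. It only mimics the
# `generate()` call used above and returns canned assert statements, so the
# falsification loop can be exercised end-to-end without loading a model.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    class _StubLLM:
        """Hypothetical LLMBackend stand-in that returns fixed adversarial asserts."""

        def generate(self, messages, temperature=0.7, max_tokens=500):
            return (
                "assert fib(0) == 0\n"
                "assert fib(1) == 1\n"
                "assert fib(10) == 55\n"
            )

    sample_code = (
        "def fib(n):\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n):\n"
        "        a, b = b, a + b\n"
        "    return a\n"
    )

    critic = FalsificationCritic(llm=_StubLLM())  # type: ignore[arg-type]
    res = critic.evaluate(sample_code, purpose="Compute the n-th Fibonacci number")
    print(f"Score: {res.score}/10 "
          f"({res.assertions_passed}/{res.assertions_total} assertions survived)")
    for detail in res.failed_details:
        print(f"  falsified: {detail}")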