"""
falsification_critic.py: Reward by Falsification (Karl Popper's method).
PROBLEM: SLMs cannot logically verify if complex code is correct.
Asking a 1.7B model "rate this code 0-10" guarantees hallucinations.
The model doesn't KNOW if the code works; it GUESSES.
SOLUTION: Invert the Critic's job using falsifiability.
Instead of: "Score this code" (requires understanding)
We do: "Generate 3 assertions designed to BREAK this code" (requires creativity)
Then we RUN those assertions deterministically on the CPU.
Score = assertions_passed / total_assertions * 10
This is mathematically rigorous:
- Score is computed by EXECUTION, not LLM judgment
- Zero hallucinations in the score itself
- The LLM's job is EASY (generate test cases) not HARD (verify correctness)
- Even a 0.6B model can generate edge cases (what about input=0? negative? empty?)
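Worked example: if 2 of 3 adversarial assertions pass, score = round(2/3 * 10, 1) = 6.7.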
Theoretical basis:
Karl Popper: "A theory is scientific if and only if it is falsifiable."
We don't ask "is this code correct?" (unfalsifiable for SLMs)
We ask "CAN I BREAK this code?" (falsifiable by execution)
"""
from __future__ import annotations
import logging
import re
import subprocess
import sys
import tempfile
import os
from dataclasses import dataclass, field
from typing import Any
from purpose_agent.llm_backend import LLMBackend, ChatMessage
from purpose_agent.robust_parser import extract_code
logger = logging.getLogger(__name__)
FALSIFICATION_PROMPT = """\
You are a TEST ADVERSARY. Your job is to BREAK the code below.
Generate exactly 3 Python assert statements that test edge cases and boundary conditions.
Each assert should be designed to catch a common bug.
Think about:
- What happens with 0? Empty input? None? Negative numbers?
- What about very large inputs? Single element? Duplicate values?
- What about the exact boundary between cases?
CODE TO BREAK:
```python
{code}
```
Respond with ONLY 3 assert statements, one per line:
assert ...
assert ...
assert ...
"""
@dataclass
class FalsificationResult:
"""
Result of falsification-based scoring.
The score is COMPUTED, not LLM-generated. Zero hallucinations.
"""
score: float # 0.0-10.0, computed as (passed/total * 10)
assertions_total: int
assertions_passed: int
assertions_failed: int
failed_details: list[str] = field(default_factory=list) # Which assertions failed and why
generated_assertions: list[str] = field(default_factory=list)
execution_error: str | None = None
@property
def pass_rate(self) -> float:
if self.assertions_total == 0:
return 0.0
return self.assertions_passed / self.assertions_total
@property
def is_falsified(self) -> bool:
"""Was the code broken by at least one assertion?"""
return self.assertions_failed > 0
class FalsificationCritic:
"""
Popperian Critic: scores code by trying to BREAK it.
The LLM generates adversarial assertions.
The CPU executes them deterministically.
The score is pure math: passed / total * 10.
Usage:
critic = FalsificationCritic(llm=backend)
result = critic.evaluate(code="def fib(n): ...")
print(f"Score: {result.score}/10 ({result.assertions_passed}/{result.assertions_total} survived)")
"""
def __init__(self, llm: LLMBackend, num_assertions: int = 3, timeout_s: float = 5.0):
self.llm = llm
self.num_assertions = num_assertions
self.timeout_s = timeout_s
def evaluate(self, code: str, purpose: str = "") -> FalsificationResult:
"""
Evaluate code by attempting to falsify it.
Steps:
        1. LLM generates adversarial assertions (easy task; even SLMs can do this)
2. CPU executes code + assertions in sandboxed subprocess
3. Score = assertions_passed / total * 10 (deterministic, no hallucination)
"""
if not code or "def " not in code:
return FalsificationResult(score=0.0, assertions_total=0, assertions_passed=0,
assertions_failed=0, execution_error="No valid code provided")
# Step 1: Generate adversarial assertions via LLM
assertions = self._generate_assertions(code, purpose)
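        # Fall back to a mid-scale score when no assertions could be generated at all.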
if not assertions:
return FalsificationResult(score=5.0, assertions_total=0, assertions_passed=0,
assertions_failed=0, execution_error="LLM failed to generate assertions")
# Step 2: Execute deterministically
passed, failed, details = self._execute_assertions(code, assertions)
        # Step 3: Compute score (pure math; zero hallucination)
total = len(assertions)
score = (passed / total * 10.0) if total > 0 else 0.0
return FalsificationResult(
score=round(score, 1),
assertions_total=total,
assertions_passed=passed,
assertions_failed=failed,
failed_details=details,
generated_assertions=assertions,
)
def _generate_assertions(self, code: str, purpose: str = "") -> list[str]:
"""
Ask the LLM to generate adversarial test assertions.
        This is an EASY task for SLMs: generating edge cases requires
creativity, not deep logical reasoning about correctness.
"""
prompt = FALSIFICATION_PROMPT.format(code=code[:1000])
if purpose:
prompt = f"CONTEXT: {purpose}\n\n" + prompt
try:
raw = self.llm.generate(
[ChatMessage(role="user", content=prompt)],
temperature=0.7, # Some creativity for edge cases
max_tokens=500,
)
except Exception as e:
logger.warning(f"FalsificationCritic: LLM call failed: {e}")
return []
# Extract assert statements
assertions = []
for line in raw.split("\n"):
line = line.strip()
if line.startswith("assert "):
assertions.append(line)
# If no explicit asserts found, try to extract from code blocks
if not assertions:
code_block = extract_code(raw)
if code_block:
for line in code_block.split("\n"):
line = line.strip()
if line.startswith("assert "):
assertions.append(line)
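        # Cap at num_assertions; the prompt asks for exactly 3, but the model may emit more.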
return assertions[:self.num_assertions]
def _execute_assertions(self, code: str, assertions: list[str]) -> tuple[int, int, list[str]]:
"""
Execute code + assertions in a sandboxed subprocess.
Returns: (passed_count, failed_count, failure_details)
"""
passed = 0
failed = 0
details = []
for assertion in assertions:
            # Build the test script: run the candidate code, then the assertion,
            # and print a single PASS / FAIL / ERROR line for the parent to parse.
            test_script = (
                f"{code}\n\n"
                "# Test assertion\n"
                "try:\n"
                f"    {assertion}\n"
                "    print('PASS')\n"
                "except AssertionError as e:\n"
                "    print(f'FAIL: {e}')\n"
                "except Exception as e:\n"
                "    print(f'ERROR: {type(e).__name__}: {e}')\n"
            )
            # Execute in subprocess with timeout
            script_path = ""
try:
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, dir=tempfile.gettempdir()) as f:
f.write(test_script)
script_path = f.name
result = subprocess.run(
[sys.executable, script_path],
capture_output=True, text=True,
timeout=self.timeout_s,
cwd=tempfile.gettempdir(),
)
output = result.stdout.strip()
if output == "PASS":
passed += 1
elif output.startswith("FAIL"):
failed += 1
details.append(f"{assertion} β {output}")
elif output.startswith("ERROR"):
failed += 1
details.append(f"{assertion} β {output}")
else:
# Unexpected output or crash
failed += 1
stderr = result.stderr.strip()[:100]
details.append(f"{assertion} β unexpected: {stderr or output}")
except subprocess.TimeoutExpired:
failed += 1
details.append(f"{assertion} β TIMEOUT ({self.timeout_s}s)")
except Exception as e:
failed += 1
details.append(f"{assertion} β EXEC_ERROR: {e}")
            finally:
                # Best-effort cleanup of the temporary script file.
                if script_path:
                    try:
                        os.unlink(script_path)
                    except OSError:
                        pass
return passed, failed, details
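

if __name__ == "__main__":
    # Minimal smoke-test sketch. Assumption: evaluate() only ever calls llm.generate(),
    # so any object exposing a compatible generate() method can stand in for a real
    # LLMBackend here; use an actual backend in real runs.
    class _CannedBackend:
        """Hypothetical stand-in that returns three fixed adversarial assertions."""

        def generate(self, messages, temperature=0.0, max_tokens=0):
            return "assert fib(0) == 0\nassert fib(1) == 1\nassert fib(10) == 55"

    sample_code = (
        "def fib(n):\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n):\n"
        "        a, b = b, a + b\n"
        "    return a\n"
    )
    critic = FalsificationCritic(llm=_CannedBackend())  # type: ignore[arg-type]
    result = critic.evaluate(sample_code, purpose="Compute the n-th Fibonacci number")
    print(f"Score: {result.score}/10 "
          f"({result.assertions_passed}/{result.assertions_total} assertions survived)")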