Rohan03
/

purpose-agent

+"""
+falsification_critic.py — Reward by Falsification (Karl Popper's method).
+PROBLEM: SLMs cannot logically verify if complex code is correct.
+Asking a 1.7B model "rate this code 0-10" guarantees hallucinations.
+The model doesn't KNOW if the code works — it GUESSES.
+SOLUTION: Invert the Critic's job using falsifiability.
+Instead of: "Score this code" (requires understanding)
+We do:      "Generate 3 assertions designed to BREAK this code" (requires creativity)
+Then we RUN those assertions deterministically on the CPU.
+Score = assertions_passed / total_assertions * 10
+This is mathematically rigorous:
+  - Score is computed by EXECUTION, not LLM judgment
+  - Zero hallucinations in the score itself
+  - The LLM's job is EASY (generate test cases) not HARD (verify correctness)
+  - Even a 0.6B model can generate edge cases (what about input=0? negative? empty?)
+Theoretical basis:
+  Karl Popper: "A theory is scientific if and only if it is falsifiable."
+  We don't ask "is this code correct?" (unfalsifiable for SLMs)
+  We ask "CAN I BREAK this code?" (falsifiable by execution)
+"""
+from __future__ import annotations
+import logging
+import re
+import subprocess
+import sys
+import tempfile
+import os
+from dataclasses import dataclass, field
+from typing import Any
+from purpose_agent.llm_backend import LLMBackend, ChatMessage
+from purpose_agent.robust_parser import extract_code
+logger = logging.getLogger(__name__)
# Prompt template sent to the LLM to request adversarial assertions.
# `{code}` is the only placeholder; it is filled with str.format(), so any
# literal brace added to this text later must be doubled ("{{" / "}}").
# NOTE: the wording always asks for exactly 3 assertions, independent of any
# per-instance assertion limit applied by the caller afterwards.
FALSIFICATION_PROMPT = """\
You are a TEST ADVERSARY. Your job is to BREAK the code below.
Generate exactly 3 Python assert statements that test edge cases and boundary conditions.
Each assert should be designed to catch a common bug.
Think about:
- What happens with 0? Empty input? None? Negative numbers?
- What about very large inputs? Single element? Duplicate values?
- What about the exact boundary between cases?
CODE TO BREAK:
```python
{code}
```
Respond with ONLY 3 assert statements, one per line:
assert ...
assert ...
assert ...
"""
@dataclass
class FalsificationResult:
    """
    Outcome of one falsification run against a piece of code.

    Carries the numeric score together with the raw assertion counts it is
    based on, plus diagnostics for every assertion that did not hold.
    """

    # Score on a 0.0-10.0 scale (normally passed / total * 10).
    score: float
    # Number of assertions that were executed.
    assertions_total: int
    # Number of assertions that held.
    assertions_passed: int
    # Number of assertions that did not hold (failed, errored, timed out).
    assertions_failed: int
    # One human-readable entry per assertion that did not hold.
    failed_details: list[str] = field(default_factory=list)
    # The assertion statements that were generated for this run.
    generated_assertions: list[str] = field(default_factory=list)
    # Set when the run could not be carried out normally; None otherwise.
    execution_error: str | None = None

    @property
    def pass_rate(self) -> float:
        """Fraction of assertions that held; 0.0 when none were executed."""
        total = self.assertions_total
        return self.assertions_passed / total if total != 0 else 0.0

    @property
    def is_falsified(self) -> bool:
        """True when at least one assertion broke the code."""
        return 0 < self.assertions_failed
class FalsificationCritic:
    """
    Popperian Critic: scores code by trying to BREAK it.

    The LLM generates adversarial assertions, each assertion is executed
    together with the code in its own Python subprocess, and the score is
    pure arithmetic: passed / total * 10.

    SECURITY NOTE: the code under test and the LLM-generated assertions are
    executed as-is with the current interpreter. The subprocess has a timeout
    but no OS-level sandboxing (no filesystem, network or resource limits),
    so only use this on code you are prepared to run on this machine.

    Usage:
        critic = FalsificationCritic(llm=backend)
        result = critic.evaluate(code="def fib(n): ...")
        print(f"Score: {result.score}/10 ({result.assertions_passed}/{result.assertions_total} survived)")
    """

    def __init__(self, llm: LLMBackend, num_assertions: int = 3, timeout_s: float = 5.0):
        """
        Args:
            llm: Backend whose ``generate(messages, temperature=..., max_tokens=...)``
                returns the model's reply as text.
            num_assertions: Maximum number of generated assertions to keep.
                Note that the prompt text itself always asks for 3.
            timeout_s: Wall-clock limit, in seconds, for each assertion's subprocess.
        """
        self.llm = llm
        self.num_assertions = num_assertions
        self.timeout_s = timeout_s

    def evaluate(self, code: str, purpose: str = "") -> FalsificationResult:
        """
        Evaluate code by attempting to falsify it.

        Steps:
          1. LLM generates adversarial assertions.
          2. Each assertion is executed with the code in a separate subprocess.
          3. Score = assertions_passed / total * 10.

        Args:
            code: Python source to evaluate. Must contain at least one ``def``.
            purpose: Optional description that is prepended to the prompt as context.

        Returns:
            A FalsificationResult. Two cases short-circuit before execution:
            empty code or code without ``def`` yields score 0.0, and failure
            to obtain any assertion yields a neutral score of 5.0; both set
            ``execution_error`` and report zero assertions.
        """
        if not code or "def " not in code:
            return FalsificationResult(score=0.0, assertions_total=0, assertions_passed=0,
                                       assertions_failed=0, execution_error="No valid code provided")

        # Step 1: Generate adversarial assertions via LLM
        assertions = self._generate_assertions(code, purpose)
        if not assertions:
            # Nothing to run: neither confirmed nor falsified, hence the midpoint score.
            return FalsificationResult(score=5.0, assertions_total=0, assertions_passed=0,
                                       assertions_failed=0, execution_error="LLM failed to generate assertions")

        # Step 2: Execute deterministically
        passed, failed, details = self._execute_assertions(code, assertions)

        # Step 3: Compute score from the counts alone
        total = len(assertions)
        score = (passed / total * 10.0) if total > 0 else 0.0
        return FalsificationResult(
            score=round(score, 1),
            assertions_total=total,
            assertions_passed=passed,
            assertions_failed=failed,
            failed_details=details,
            generated_assertions=assertions,
        )

    @staticmethod
    def _extract_asserts(text: str) -> list[str]:
        """
        Return the usable single-line ``assert`` statements found in *text*.

        A line is kept when, after stripping, it starts with ``assert `` and
        compiles on its own. Lines that do not compile (for example the first
        line of a multi-line assert) are dropped, because they would crash the
        whole test script and be counted against the code under test. The
        literal placeholder ``assert ...`` from the prompt is dropped as well:
        it is valid Python but always passes.
        """
        found: list[str] = []
        for line in text.split("\n"):
            candidate = line.strip()
            if not candidate.startswith("assert ") or candidate == "assert ...":
                continue
            try:
                compile(candidate, "<assertion>", "exec")
            except (SyntaxError, ValueError):
                continue
            found.append(candidate)
        return found

    def _generate_assertions(self, code: str, purpose: str = "") -> list[str]:
        """
        Ask the LLM to generate adversarial test assertions.

        Only the first 1000 characters of *code* are shown to the model.
        Any exception raised by the backend is logged and treated as
        "no assertions".

        Returns:
            At most ``self.num_assertions`` assert statements, possibly none.
        """
        prompt = FALSIFICATION_PROMPT.format(code=code[:1000])
        if purpose:
            prompt = f"CONTEXT: {purpose}\n\n" + prompt
        try:
            raw = self.llm.generate(
                [ChatMessage(role="user", content=prompt)],
                temperature=0.7,  # Some creativity for edge cases
                max_tokens=500,
            )
        except Exception as e:
            logger.warning("FalsificationCritic: LLM call failed: %s", e)
            return []

        # Prefer assert lines found anywhere in the reply; fall back to the
        # extracted code block only if that yields nothing.
        assertions = self._extract_asserts(raw)
        if not assertions:
            code_block = extract_code(raw)
            if code_block:
                assertions = self._extract_asserts(code_block)
        return assertions[:self.num_assertions]

    def _execute_assertions(self, code: str, assertions: list[str]) -> tuple[int, int, list[str]]:
        """
        Execute code + each assertion in its own subprocess.

        For every assertion a temporary script is written (code first, then
        the assertion wrapped in try/except) and run with the current
        interpreter. An assertion counts as failed when it raises
        AssertionError, raises any other Exception, times out, or the script
        produces no recognisable verdict.

        Returns: (passed_count, failed_count, failure_details)
        """
        passed = 0
        failed = 0
        details: list[str] = []
        for assertion in assertions:
            # Build test script
            test_script = (
                f"{code}\n\n# Test assertion\n"
                "try:\n"
                f"    {assertion}\n"
                "    print('PASS')\n"
                "except AssertionError as e:\n"
                "    print(f'FAIL: {e}')\n"
                "except Exception as e:\n"
                "    print(f'ERROR: {type(e).__name__}: {e}')\n"
            )
            # Reset per iteration so cleanup never touches a previous file.
            script_path = None
            try:
                # Python reads source files as UTF-8, so write them as UTF-8
                # regardless of the locale's preferred encoding.
                with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False,
                                                 dir=tempfile.gettempdir(), encoding='utf-8') as f:
                    script_path = f.name
                    f.write(test_script)
                result = subprocess.run(
                    [sys.executable, script_path],
                    capture_output=True, text=True,
                    timeout=self.timeout_s,
                    cwd=tempfile.gettempdir(),
                )
                output = result.stdout.strip()
                # The verdict is the last thing the script prints; reading only
                # the final line keeps prints from the code under test from
                # masking it.
                verdict = output.splitlines()[-1].strip() if output else ""
                if verdict == "PASS" and result.returncode == 0:
                    passed += 1
                elif verdict.startswith(("FAIL", "ERROR")):
                    failed += 1
                    details.append(f"{assertion} → {verdict}")
                else:
                    # Unexpected output or crash
                    failed += 1
                    stderr = result.stderr.strip()[:100]
                    details.append(f"{assertion} → unexpected: {stderr or output}")
            except subprocess.TimeoutExpired:
                failed += 1
                details.append(f"{assertion} → TIMEOUT ({self.timeout_s}s)")
            except Exception as e:
                failed += 1
                details.append(f"{assertion} → EXEC_ERROR: {e}")
            finally:
                if script_path is not None:
                    try:
                        os.unlink(script_path)
                    except OSError:
                        # Best-effort cleanup; a leftover temp file is harmless.
                        pass
        return passed, failed, details