Rohan03 committed
Commit 9f46343 · verified · 1 Parent(s): d1da4ac

first-principles: falsification_critic.py — Popperian scoring by adversarial assertions

Files changed (1):
  1. purpose_agent/falsification_critic.py +241 -0

purpose_agent/falsification_critic.py ADDED
@@ -0,0 +1,241 @@
+ """
+ falsification_critic.py — Reward by Falsification (Karl Popper's method).
+
+ PROBLEM: SLMs cannot logically verify whether complex code is correct.
+ Asking a 1.7B model "rate this code 0-10" all but guarantees hallucinations.
+ The model doesn't KNOW if the code works — it GUESSES.
+
+ SOLUTION: Invert the Critic's job using falsifiability.
+
+ Instead of: "Score this code" (requires understanding)
+ We do: "Generate 3 assertions designed to BREAK this code" (requires creativity)
+
+ Then we RUN those assertions deterministically on the CPU.
+ Score = assertions_passed / total_assertions * 10
+
+ This makes the score itself deterministic:
+ - Score is computed by EXECUTION, not LLM judgment
+ - Zero hallucinations in the score itself
+ - The LLM's job is EASY (generate test cases), not HARD (verify correctness)
+ - Even a 0.6B model can generate edge cases (what about input=0? negative? empty?)
+
+ Theoretical basis:
+ Karl Popper: "A theory is scientific if and only if it is falsifiable."
+ We don't ask "is this code correct?" (unfalsifiable for SLMs)
+ We ask "CAN I BREAK this code?" (falsifiable by execution)
+ """
+ from __future__ import annotations
+
+ import logging
+ import os
+ import subprocess
+ import sys
+ import tempfile
+ from dataclasses import dataclass, field
+
+ from purpose_agent.llm_backend import LLMBackend, ChatMessage
+ from purpose_agent.robust_parser import extract_code
+
+ logger = logging.getLogger(__name__)
+
+
+ FALSIFICATION_PROMPT = """\
+ You are a TEST ADVERSARY. Your job is to BREAK the code below.
+
+ Generate exactly 3 Python assert statements that test edge cases and boundary conditions.
+ Each assert should be designed to catch a common bug.
+
+ Think about:
+ - What happens with 0? Empty input? None? Negative numbers?
+ - What about very large inputs? Single element? Duplicate values?
+ - What about the exact boundary between cases?
+
+ CODE TO BREAK:
+ ```python
+ {code}
+ ```
+
+ Respond with ONLY 3 assert statements, one per line:
+ assert ...
+ assert ...
+ assert ...
+ """
+
+
+ @dataclass
+ class FalsificationResult:
+     """
+     Result of falsification-based scoring.
+
+     The score is COMPUTED, not LLM-generated. Zero hallucinations.
+     """
+     score: float  # 0.0-10.0, computed as (passed / total * 10)
+     assertions_total: int
+     assertions_passed: int
+     assertions_failed: int
+     failed_details: list[str] = field(default_factory=list)  # which assertions failed, and why
+     generated_assertions: list[str] = field(default_factory=list)
+     execution_error: str | None = None
+
+     @property
+     def pass_rate(self) -> float:
+         if self.assertions_total == 0:
+             return 0.0
+         return self.assertions_passed / self.assertions_total
+
+     @property
+     def is_falsified(self) -> bool:
+         """Was the code broken by at least one assertion?"""
+         return self.assertions_failed > 0
+
+
+ class FalsificationCritic:
+     """
+     Popperian Critic: scores code by trying to BREAK it.
+
+     The LLM generates adversarial assertions.
+     The CPU executes them deterministically.
+     The score is pure math: passed / total * 10.
+
+     Usage:
+         critic = FalsificationCritic(llm=backend)
+         result = critic.evaluate(code="def fib(n): ...")
+         print(f"Score: {result.score}/10 ({result.assertions_passed}/{result.assertions_total} survived)")
+     """
+
+     def __init__(self, llm: LLMBackend, num_assertions: int = 3, timeout_s: float = 5.0):
+         # NOTE: FALSIFICATION_PROMPT asks for exactly 3 asserts; num_assertions
+         # only caps how many of the returned asserts are kept.
+         self.llm = llm
+         self.num_assertions = num_assertions
+         self.timeout_s = timeout_s
+
+     def evaluate(self, code: str, purpose: str = "") -> FalsificationResult:
+         """
+         Evaluate code by attempting to falsify it.
+
+         Steps:
+         1. LLM generates adversarial assertions (easy task — even SLMs can do this)
+         2. CPU executes code + assertions in a subprocess with a timeout
+         3. Score = assertions_passed / total * 10 (deterministic, no hallucination)
+         """
+         if not code or "def " not in code:
+             return FalsificationResult(score=0.0, assertions_total=0, assertions_passed=0,
+                                        assertions_failed=0, execution_error="No valid code provided")
+
+         # Step 1: Generate adversarial assertions via LLM
+         assertions = self._generate_assertions(code, purpose)
+         if not assertions:
+             # Neutral midpoint score: no evidence for or against the code
+             return FalsificationResult(score=5.0, assertions_total=0, assertions_passed=0,
+                                        assertions_failed=0, execution_error="LLM failed to generate assertions")
+
+         # Step 2: Execute deterministically
+         passed, failed, details = self._execute_assertions(code, assertions)
+
+         # Step 3: Compute score (pure math — zero hallucination)
+         total = len(assertions)
+         score = (passed / total * 10.0) if total > 0 else 0.0
+
+         return FalsificationResult(
+             score=round(score, 1),
+             assertions_total=total,
+             assertions_passed=passed,
+             assertions_failed=failed,
+             failed_details=details,
+             generated_assertions=assertions,
+         )
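+
+     # Illustrative outcome: for a fib() that mishandles n=0, evaluate() might return
+     #   FalsificationResult(score=6.7, assertions_total=3, assertions_passed=2,
+     #                       assertions_failed=1, failed_details=["assert fib(0) == 0 → FAIL: "])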
+
+     def _generate_assertions(self, code: str, purpose: str = "") -> list[str]:
+         """
+         Ask the LLM to generate adversarial test assertions.
+
+         This is an EASY task for SLMs — generating edge cases requires
+         creativity, not deep logical reasoning about correctness.
+         """
+         prompt = FALSIFICATION_PROMPT.format(code=code[:1000])  # truncate to keep the prompt small
+         if purpose:
+             prompt = f"CONTEXT: {purpose}\n\n" + prompt
+
+         try:
+             raw = self.llm.generate(
+                 [ChatMessage(role="user", content=prompt)],
+                 temperature=0.7,  # some creativity for edge cases
+                 max_tokens=500,
+             )
+         except Exception as e:
+             logger.warning(f"FalsificationCritic: LLM call failed: {e}")
+             return []
+
+         # Extract assert statements
+         assertions = []
+         for line in raw.split("\n"):
+             line = line.strip()
+             if line.startswith("assert "):
+                 assertions.append(line)
+
+         # If no explicit asserts found, try to extract from code blocks
+         if not assertions:
+             code_block = extract_code(raw)
+             if code_block:
+                 for line in code_block.split("\n"):
+                     line = line.strip()
+                     if line.startswith("assert "):
+                         assertions.append(line)
+
+         return assertions[:self.num_assertions]
+
+     def _execute_assertions(self, code: str, assertions: list[str]) -> tuple[int, int, list[str]]:
+         """
+         Execute code + assertions in a throwaway subprocess with a timeout.
+         (Process isolation only — this is not a hardened sandbox.)
+
+         Returns: (passed_count, failed_count, failure_details)
+         """
+         passed = 0
+         failed = 0
+         details = []
+
+         for assertion in assertions:
+             # Build a test script: the candidate code, then one assertion wrapped
+             # so the verdict is printed instead of raising
+             test_script = (
+                 f"{code}\n\n"
+                 "# Test assertion\n"
+                 "try:\n"
+                 f"    {assertion}\n"
+                 "    print('PASS')\n"
+                 "except AssertionError as e:\n"
+                 "    print(f'FAIL: {e}')\n"
+                 "except Exception as e:\n"
+                 "    print(f'ERROR: {type(e).__name__}: {e}')\n"
+             )
+
+             # Execute in subprocess with timeout
+             script_path = None
+             try:
+                 with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False,
+                                                  dir=tempfile.gettempdir()) as f:
+                     f.write(test_script)
+                     script_path = f.name
+
+                 result = subprocess.run(
+                     [sys.executable, script_path],
+                     capture_output=True, text=True,
+                     timeout=self.timeout_s,
+                     cwd=tempfile.gettempdir(),
+                 )
+
+                 # Read the LAST stdout line as the verdict, so prints inside the
+                 # candidate code can't mask the PASS/FAIL marker
+                 stdout_lines = result.stdout.strip().splitlines()
+                 output = stdout_lines[-1] if stdout_lines else ""
+                 if output == "PASS":
+                     passed += 1
+                 elif output.startswith("FAIL") or output.startswith("ERROR"):
+                     failed += 1
+                     details.append(f"{assertion} → {output}")
+                 else:
+                     # Unexpected output or crash
+                     failed += 1
+                     stderr = result.stderr.strip()[:100]
+                     details.append(f"{assertion} → unexpected: {stderr or output}")
+
+             except subprocess.TimeoutExpired:
+                 failed += 1
+                 details.append(f"{assertion} → TIMEOUT ({self.timeout_s}s)")
+             except Exception as e:
+                 failed += 1
+                 details.append(f"{assertion} → EXEC_ERROR: {e}")
+             finally:
+                 # Clean up the temp script; script_path is None if creation failed
+                 if script_path:
+                     try:
+                         os.unlink(script_path)
+                     except OSError:
+                         pass
+
+         return passed, failed, details
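+
+
+ # Minimal smoke-test sketch (illustrative; assumes some concrete LLMBackend instance):
+ #   critic = FalsificationCritic(llm=backend)
+ #   result = critic.evaluate("def add(a, b):\n    return a + b")
+ #   print(result.score, result.generated_assertions)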