Rohan03 commited on
Commit
d6bcdbc
·
verified ·
1 Parent(s): d7dc6c8

fix: universal parsing + OpenRouter + state bug — purpose_agent/robust_parser.py

Browse files
Files changed (1) hide show
  1. purpose_agent/robust_parser.py +297 -0
purpose_agent/robust_parser.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ robust_parser.py — Universal LLM output parser that never requires JSON.
3
+
4
+ The problem: LLMs are unreliable at producing valid JSON. Different models
5
+ format differently. Structured output (json_schema) isn't supported everywhere.
6
+
7
+ The solution: Parse whatever the LLM gives you. Extract fields by multiple
8
+ strategies, fall back gracefully, and always return something usable.
9
+
10
+ This replaces the fragile generate_structured → json.loads → crash pattern.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import re
16
+ import logging
17
+ from typing import Any
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def extract_json(text: str) -> dict[str, Any] | None:
    """
    Try to extract a JSON *object* from arbitrary LLM text.

    Handles: pure JSON, JSON in markdown code blocks, JSON embedded in prose.

    Args:
        text: Raw LLM output in any format.

    Returns:
        The parsed dict, or None if no valid JSON object was found.
        Non-dict JSON (arrays, scalars) is rejected so the declared return
        type holds and callers can safely use ``.get(...)``.
    """
    text = text.strip()

    # Strategy 1: the entire text is JSON.
    try:
        obj = json.loads(text)
        # Reject arrays/scalars: callers expect a mapping.
        if isinstance(obj, dict):
            return obj
    except (json.JSONDecodeError, ValueError):
        pass

    # Strategy 2: JSON inside a markdown code fence (``` or ```json).
    m = re.search(r'```(?:json)?\s*(\{.*\})\s*```', text, re.DOTALL)
    if m:
        try:
            obj = json.loads(m.group(1))
            if isinstance(obj, dict):
                return obj
        except (json.JSONDecodeError, ValueError):
            pass

    # Strategy 3: find the outermost { ... } by brace matching.
    # NOTE: braces inside string literals will confuse the depth counter;
    # the json.loads attempt below catches those cases and we give up.
    start = text.find('{')
    if start >= 0:
        depth = 0
        for i in range(start, len(text)):
            if text[i] == '{':
                depth += 1
            elif text[i] == '}':
                depth -= 1
                if depth == 0:
                    try:
                        obj = json.loads(text[start:i + 1])
                    except (json.JSONDecodeError, ValueError):
                        break
                    if isinstance(obj, dict):
                        return obj
                    break

    return None
60
+
61
+
62
def extract_field(text: str, field_name: str, default: str = "") -> str:
    """
    Extract a named field value from LLM text, regardless of format.

    Handles:
    - JSON: {"field": "value"}
    - Markdown: **field:** value / field: value
    - Labeled: FIELD: value
    - Assignment: field = value

    Args:
        text: Raw LLM output.
        field_name: Field to look for. May contain regex metacharacters
            (e.g. ``Φ(state_before)``) — it is escaped before matching.
        default: Value returned when the field cannot be found.

    Returns:
        The field value as a string, or *default*.
    """
    # Try JSON first — the most reliable source when present.
    obj = extract_json(text)
    if isinstance(obj, dict) and field_name in obj:
        return str(obj[field_name])

    # Escape the field name: without this, names containing regex
    # metacharacters (parentheses, dots, ...) either fail to match or
    # introduce spurious capture groups that shift group(1).
    name = re.escape(field_name)
    patterns = [
        rf'"{name}"\s*:\s*"((?:[^"\\]|\\.)*)"',        # JSON string
        rf'"{name}"\s*:\s*(\d+\.?\d*)',                # JSON number
        rf'\*?\*?{name}\*?\*?\s*:\s*(.+?)(?:\n|$)',    # Markdown/label
        rf'{name}\s*[=:]\s*(.+?)(?:\n|$)',             # Assignment
    ]
    for pattern in patterns:
        m = re.search(pattern, text, re.IGNORECASE)
        if m:
            return m.group(1).strip().strip('"').strip("'")

    return default
93
+
94
+
95
def extract_number(text: str, field_name: str, default: float = 0.0) -> float:
    """
    Extract a numeric field from LLM text.

    Args:
        text: Raw LLM output.
        field_name: Field to look for (escaped before regex matching, so
            names like ``Φ(state_before)`` work).
        default: Value returned when no parseable number is found.

    Returns:
        The parsed float, or *default*.
    """
    val = extract_field(text, field_name)
    if val:
        try:
            # Strip any trailing sentence punctuation ("0.7.", "0.7,").
            return float(val.rstrip('.,'))
        except (ValueError, TypeError):
            pass

    # Fall back to a direct pattern: field_name = X.X or field_name: X.X
    m = re.search(rf'{re.escape(field_name)}\s*[=:]\s*([\d.]+)', text, re.IGNORECASE)
    if m:
        try:
            return float(m.group(1).rstrip('.'))
        except ValueError:
            pass

    return default
113
+
114
+
115
def extract_code(text: str) -> str:
    """
    Extract Python code from LLM text.

    Handles:
    - Code in a "code" JSON field (top-level or nested under action.params)
    - Code in ``` / ```python fenced blocks
    - Raw code detected by def/class/import keywords

    Returns:
        The extracted code string, or "" when nothing code-like is found.
    """
    # Strategy 1: JSON with a code field.
    obj = extract_json(text)
    # Guard against non-dict JSON (e.g. a bare array) — calling .get on it
    # would raise AttributeError.
    if isinstance(obj, dict):
        # Nested: action.params.code
        action = obj.get("action", {})
        if isinstance(action, dict):
            params = action.get("params", {})
            if isinstance(params, dict) and "code" in params:
                return params["code"]
        if "code" in obj:
            return obj["code"]

    # Strategy 2: fenced code block.
    m = re.search(r'```(?:python)?\s*\n(.*?)```', text, re.DOTALL)
    if m:
        return m.group(1).strip()

    # Strategy 3: heuristic line scan — start collecting at the first
    # def/class/import/from line, keep indented/continuation lines, and
    # stop at the first clearly non-code line after code has been seen.
    lines = text.split('\n')
    code_lines: list[str] = []
    in_code = False
    for line in lines:
        if re.match(r'^(def |class |import |from )', line.strip()):
            in_code = True
        if in_code:
            if line.strip() == '' and code_lines and not code_lines[-1].strip().endswith(':'):
                # Blank line inside code — keep it; a later non-code line
                # will terminate collection.
                code_lines.append(line)
            elif (line.startswith(' ') or line.startswith('\t') or
                  re.match(r'^(def |class |import |from |#|$)', line.strip())):
                # Indented body line, new top-level definition, comment,
                # or empty line: still code.
                code_lines.append(line)
            else:
                # Prose after code: stop. (An unreachable duplicate
                # `elif re.match(r'^(def |class )', ...)` branch from the
                # original was removed — the pattern above subsumes it.)
                if code_lines:
                    break

    if code_lines:
        return '\n'.join(code_lines).strip()

    return ""
166
+
167
+
168
def parse_actor_response(text: str) -> dict[str, Any]:
    """
    Parse an actor's response into thought/action/expected_delta.

    Works with any format the LLM produces; always returns a dict with
    "thought", "action" ({"name", "params"}), and "expected_delta" keys
    (the JSON fast path may return extra keys the model supplied).
    """
    # Best case: the model produced structured JSON.
    obj = extract_json(text)
    if isinstance(obj, dict) and ("action" in obj or "thought" in obj):
        return obj

    # Otherwise scrape fields individually from free-form text.
    thought = extract_field(text, "thought")
    expected_delta = extract_field(text, "expected_delta")

    # Action name: prefer a "name" field, fall back to "action".
    action_name = extract_field(text, "name", "")
    if not action_name:
        action_name = extract_field(text, "action", "")
        if action_name.startswith("{"):
            action_name = ""  # It's a JSON object, not a name.

    # Extract code if this is a coding task.
    code = extract_code(text)

    # Build the action payload.
    action: dict[str, Any] = {"name": action_name or "UNKNOWN", "params": {}}
    if code:
        # Bug fix: the original used `action.get("name", "submit_code")`,
        # but "name" is always present, so the "submit_code" default could
        # never be returned. Replace UNKNOWN explicitly instead.
        if action["name"] == "UNKNOWN":
            action["name"] = "submit_code"
        action["params"]["code"] = code

    if not thought:
        # Use the first line (truncated) as a stand-in thought.
        thought = text.split('\n')[0][:200] if text else ""

    return {
        "thought": thought,
        "action": action,
        "expected_delta": expected_delta or "",
    }
207
+
208
+
209
def _safe_float(value: Any, default: float) -> float:
    """Coerce *value* to float, returning *default* on failure."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def parse_critic_response(text: str) -> dict[str, Any]:
    """
    Parse a critic's response into phi_before/phi_after/reasoning/evidence/
    confidence. Works with any format.

    Returns:
        Dict with phi_before/phi_after clamped to [0, 10] and confidence
        clamped to [0, 1] on the text-scrape path; the JSON fast path
        returns the model's values coerced to the expected types.
    """
    # Best case: structured JSON. Coerce defensively — the model may emit
    # non-numeric values, and float() on those used to crash the parser.
    obj = extract_json(text)
    if isinstance(obj, dict) and ("phi_before" in obj or "phi_after" in obj):
        return {
            "phi_before": _safe_float(obj.get("phi_before", 0), 0.0),
            "phi_after": _safe_float(obj.get("phi_after", 0), 0.0),
            "reasoning": str(obj.get("reasoning", "")),
            "evidence": str(obj.get("evidence", "")),
            "confidence": _safe_float(obj.get("confidence", 0.5), 0.5),
        }

    # Scrape scores from free-form text, trying several field spellings.
    phi_before = extract_number(text, "phi_before", 0.0)
    if phi_before == 0.0:
        phi_before = extract_number(text, "Φ(state_before)", 0.0)
    if phi_before == 0.0:
        phi_before = extract_number(text, "state_before", 0.0)

    phi_after = extract_number(text, "phi_after", 0.0)
    if phi_after == 0.0:
        phi_after = extract_number(text, "Φ(state_after)", 0.0)
    if phi_after == 0.0:
        phi_after = extract_number(text, "state_after", 0.0)

    # Last resort: bare "SCORE: X" patterns (first = before, second = after).
    if phi_before == 0.0 and phi_after == 0.0:
        scores = re.findall(r'(?:score|SCORE|Score)\s*[=:]\s*([\d.]+)', text)
        # _safe_float guards against degenerate matches like a lone ".".
        if len(scores) >= 2:
            phi_before = _safe_float(scores[0].rstrip('.'), 0.0)
            phi_after = _safe_float(scores[1].rstrip('.'), 0.0)
        elif len(scores) == 1:
            phi_after = _safe_float(scores[0].rstrip('.'), 0.0)

    reasoning = extract_field(text, "reasoning")
    evidence = extract_field(text, "evidence")
    confidence = extract_number(text, "confidence", 0.5)

    if not reasoning:
        reasoning = text[:300]
    if not evidence:
        evidence = text[300:500] if len(text) > 300 else ""

    return {
        "phi_before": min(10.0, max(0.0, phi_before)),
        "phi_after": min(10.0, max(0.0, phi_after)),
        "reasoning": reasoning,
        "evidence": evidence,
        "confidence": min(1.0, max(0.0, confidence)),
    }
263
+
264
+
265
def parse_optimizer_response(text: str) -> dict[str, Any]:
    """
    Parse optimizer output into a heuristics list.

    Returns:
        {"heuristics": [...]} — possibly empty when nothing parseable is
        found; never raises on malformed input.
    """
    # Best case: a JSON object already shaped as {"heuristics": [...]}.
    obj = extract_json(text)
    # isinstance guard: a bare JSON list must not be returned as-is.
    if isinstance(obj, dict) and "heuristics" in obj:
        return obj

    # Next: a bare JSON array anywhere in the text.
    m = re.search(r'\[.*\]', text, re.DOTALL)
    if m:
        try:
            arr = json.loads(m.group())
            if isinstance(arr, list):
                return {"heuristics": arr}
        except (json.JSONDecodeError, ValueError):
            pass

    # Scrape "pattern: ... / strategy: ..." pairs from prose.
    heuristics: list[dict[str, str]] = []
    patterns = re.findall(r'(?:pattern|when|if)\s*[:\-]\s*(.+?)(?:\n|$)', text, re.IGNORECASE)
    strategies = re.findall(r'(?:strategy|do|then|action)\s*[:\-]\s*(.+?)(?:\n|$)', text, re.IGNORECASE)

    # zip truncates to the shorter list, pairing patterns with strategies
    # in document order.
    for pat, strat in zip(patterns, strategies):
        heuristics.append({"tier": "strategic", "pattern": pat.strip(), "strategy": strat.strip()})

    # If nothing found, fall back to the first few numbered list items.
    if not heuristics:
        items = re.findall(r'\d+\.\s*(.+?)(?:\n|$)', text)
        for item in items[:5]:
            heuristics.append({"tier": "strategic", "pattern": "General", "strategy": item.strip()})

    return {"heuristics": heuristics}