# File size: 9,681 Bytes

# d6bcdbc

"""
robust_parser.py — Universal LLM output parser that never requires JSON.

The problem: LLMs are unreliable at producing valid JSON. Different models
format differently. Structured output (json_schema) isn't supported everywhere.

The solution: Parse whatever the LLM gives you. Extract fields by multiple
strategies, fall back gracefully, and always return something usable.

This replaces the fragile generate_structured → json.loads → crash pattern.
"""
from __future__ import annotations

import json
import logging
import math
import re
from typing import Any

logger = logging.getLogger(__name__)


def extract_json(text: str) -> dict[str, Any] | None:
    """
    Try to extract a JSON object from arbitrary LLM text.
    Handles: pure JSON, JSON in code blocks, JSON embedded in prose.
    Returns None if no valid JSON found.
    """
    text = text.strip()

    # Strategy 1: Entire text is JSON
    try:
        return json.loads(text)
    except (json.JSONDecodeError, ValueError):
        pass

    # Strategy 2: JSON in markdown code block
    m = re.search(r'```(?:json)?\s*(\{.*\})\s*```', text, re.DOTALL)
    if m:
        try:
            return json.loads(m.group(1))
        except (json.JSONDecodeError, ValueError):
            pass

    # Strategy 3: Find outermost { ... } by brace matching
    start = text.find('{')
    if start >= 0:
        depth = 0
        for i in range(start, len(text)):
            if text[i] == '{':
                depth += 1
            elif text[i] == '}':
                depth -= 1
            if depth == 0:
                try:
                    return json.loads(text[start:i + 1])
                except (json.JSONDecodeError, ValueError):
                    break

    return None


def extract_field(text: str, field_name: str, default: str = "") -> str:
    """
    Extract a named field value from LLM text, regardless of format.

    Handles:
    - JSON: {"field": "value"} (exact key first, then case-insensitive)
    - Markdown: **field:** value / **field**: value / field: value
    - Labeled / assignment: FIELD: value / field = value
    - Line-based: the field name alone on a line, value on the next line

    Args:
        text: Raw LLM output.
        field_name: Name of the field to look for. It is matched literally
            (regex metacharacters in the name are escaped) and
            case-insensitively.
        default: Value returned when the field cannot be found.

    Returns:
        The field value as a string (surrounding whitespace and quotes
        removed for the regex-based matches), or ``default``.
    """
    name_lower = field_name.lower()

    # Try JSON first
    obj = extract_json(text)
    if isinstance(obj, dict):
        value = obj.get(field_name)
        if value is None:
            # Same case-insensitivity as the regex fallbacks below.
            value = next(
                (v for k, v in obj.items()
                 if k.lower() == name_lower and v is not None),
                None,
            )
        # A JSON null means "no value": fall through instead of returning "None".
        if value is not None:
            return str(value)

    # Escape the name so that e.g. "Φ(state_before)" is matched literally
    # rather than turning its parentheses into a capture group.
    name = re.escape(field_name)
    patterns = [
        (rf'"{name}"\s*:\s*"((?:[^"\\]|\\.)*)"', re.IGNORECASE),           # JSON string
        (rf'"{name}"\s*:\s*(-?\d+\.?\d*)', re.IGNORECASE),                  # JSON number
        # The optional ** right after the colon covers "**field:** value".
        (rf'\*?\*?{name}\*?\*?\s*:\*?\*?\s*(.+?)(?:\n|$)', re.IGNORECASE),  # Markdown/label
        (rf'{name}\s*[=:]\s*(.+?)(?:\n|$)', re.IGNORECASE),                 # Assignment
        (rf'^[ \t]*\*?\*?{name}\*?\*?[ \t]*\n[ \t]*(\S.*?)[ \t]*$',
         re.IGNORECASE | re.MULTILINE),                                     # Line-based
    ]
    for pattern, flags in patterns:
        m = re.search(pattern, text, flags)
        if m:
            return m.group(1).strip().strip('"').strip("'")

    return default


def extract_number(text: str, field_name: str, default: float = 0.0) -> float:
    """
    Extract a numeric field from LLM text.

    The value found by ``extract_field`` is tried first: as a plain float,
    then as a leading number followed by other text ("7/10",
    "7.5 out of 10", "**8**"). If that fails, the text is searched directly
    for ``field_name = X`` / ``field_name: X``.

    Args:
        text: Raw LLM output.
        field_name: Name of the field; matched literally, case-insensitively.
        default: Value returned when no number can be extracted.

    Returns:
        The extracted number, or ``default``.
    """
    number = r'-?(?:\d+(?:\.\d+)?|\.\d+)'

    val = extract_field(text, field_name)
    if val:
        try:
            # Tolerate sentence punctuation right after the number ("7." / "7,").
            return float(val.rstrip('.,'))
        except (ValueError, TypeError):
            pass
        # Accept a number at the start of the value, ignoring markdown/quote
        # decoration, e.g. "7/10" or "**8** (good)".
        lead = re.match(rf'[\s*_`"\'(\[]*({number})', val)
        if lead:
            return float(lead.group(1))

    # Try direct pattern: field_name = X.X or field_name: X.X
    # The name is escaped so metacharacters in it are matched literally.
    m = re.search(rf'{re.escape(field_name)}\s*[=:]\s*({number})', text, re.IGNORECASE)
    if m:
        return float(m.group(1))

    return default


def extract_code(text: str) -> str:
    """
    Extract Python code from LLM text.

    Strategies, tried in order:
      1. A JSON object with the code at ``action.params.code`` or ``code``
         (only non-empty string values are accepted).
      2. A fenced code block. A block tagged python/py/python3 wins over an
         untagged block; blocks tagged with another language are ignored.
         A final fence that is never closed (e.g. truncated output) runs to
         the end of the text.
      3. Raw code: starts at the first line beginning with def/class/
         import/from/async def/@, and continues over blank lines, indented
         lines, comments, closing brackets and further top-level
         definitions until the first line that looks like prose.

    Args:
        text: Raw LLM output.

    Returns:
        The extracted code with surrounding whitespace stripped for
        strategies 2 and 3, or "" if nothing was found.
    """
    # Strategy 1: JSON with code field
    obj = extract_json(text)
    if isinstance(obj, dict):
        # Nested: action.params.code
        action = obj.get("action")
        if isinstance(action, dict):
            params = action.get("params")
            if isinstance(params, dict):
                nested = params.get("code")
                if isinstance(nested, str) and nested:
                    return nested
        direct = obj.get("code")
        if isinstance(direct, str) and direct:
            return direct

    # Strategy 2: fenced code block.
    # finditer consumes opener/closer pairs in order, so the closing fence of
    # a non-Python block is not mistaken for the opener of the next block.
    untagged = ""
    for fence in re.finditer(r'```([^\n`]*)\n(.*?)(?:```|\Z)', text, re.DOTALL):
        info = fence.group(1).strip().lower().split()
        lang = info[0] if info else ""
        body = fence.group(2).strip()
        if not body:
            continue
        if lang in ("python", "py", "python3"):
            return body
        if not lang and not untagged:
            untagged = body
    if untagged:
        return untagged

    # Strategy 3: Find code starting with def/class/import/...
    code_starts = ("def ", "async def ", "class ", "import ", "from ", "@")
    code_continues = code_starts + ("#", ")", "]", "}", "if __name__")
    code_lines: list[str] = []
    for line in text.split('\n'):
        stripped = line.strip()
        if not code_lines:
            # Still in leading prose: wait for the first code-looking line.
            if stripped.startswith(code_starts):
                code_lines.append(line)
            continue
        if (not stripped                          # blank line inside code
                or line[0] in ' \t'               # indented body line
                or stripped.startswith(code_continues)):
            code_lines.append(line)
        else:
            # First unindented non-code line after the code: treat as prose.
            break

    if code_lines:
        return '\n'.join(code_lines).strip()

    return ""


def parse_actor_response(text: str) -> dict[str, Any]:
    """
    Parse an actor's response into thought/action/expected_delta.
    Works with any format the LLM produces.

    If the text contains a JSON object with an "action" or "thought" key,
    that object is returned (as a shallow copy) with any of the three
    documented keys that are missing filled in with defaults; values that
    are present are left untouched. Otherwise the fields are extracted one
    by one from the text.

    Args:
        text: Raw LLM output.

    Returns:
        A dict that always has the keys "thought", "action" and
        "expected_delta". On the non-JSON path "action" is
        ``{"name": str, "params": dict}``, with the extracted code (if any)
        under ``params["code"]``.
    """
    # Try JSON first (best case)
    obj = extract_json(text)
    if isinstance(obj, dict) and ("action" in obj or "thought" in obj):
        result = dict(obj)
        result.setdefault("thought", "")
        result.setdefault("action", {"name": "UNKNOWN", "params": {}})
        result.setdefault("expected_delta", "")
        return result

    # Extract fields individually
    thought = extract_field(text, "thought")
    expected_delta = extract_field(text, "expected_delta")

    # Extract action name
    action_name = extract_field(text, "name")
    if not action_name:
        action_name = extract_field(text, "action")
        if action_name.startswith("{"):
            action_name = ""  # It's a JSON object, not a name

    # Extract code if this is a coding task
    code = extract_code(text)

    # Build action
    action: dict[str, Any] = {"name": action_name or "UNKNOWN", "params": {}}
    if code:
        # Code without an explicit action name is treated as a code submission.
        if action["name"] == "UNKNOWN":
            action["name"] = "submit_code"
        action["params"]["code"] = code

    if not thought:
        # Use the first non-empty line as thought
        first_line = next(
            (ln.strip() for ln in text.splitlines() if ln.strip()), ""
        )
        thought = first_line[:200]

    return {
        "thought": thought,
        "action": action,
        "expected_delta": expected_delta,
    }


def parse_critic_response(text: str) -> dict[str, Any]:
    """
    Parse a critic's response into phi_before/phi_after/reasoning/evidence/confidence.
    Works with any format.

    If the text contains a JSON object with "phi_before" or "phi_after",
    its values are used; values that cannot be read as numbers fall back to
    the defaults instead of raising. Otherwise the scores are searched for
    in the text under several labels, then as "score: X" occurrences.

    Args:
        text: Raw LLM output.

    Returns:
        A dict with "phi_before" and "phi_after" clamped to [0, 10],
        "confidence" clamped to [0, 1] (default 0.5), and "reasoning" /
        "evidence" strings. When no reasoning/evidence field is found they
        fall back to the first 300 characters and characters 300-500 of the
        text respectively.
    """
    def to_float(value: Any, fallback: float) -> float:
        # float() raises on None, lists, or strings such as "7/10"; accept a
        # leading number in that case and otherwise use the fallback.
        try:
            number = float(value)
        except (TypeError, ValueError):
            lead = re.match(r'\s*(-?(?:\d+(?:\.\d+)?|\.\d+))', str(value))
            if not lead:
                return fallback
            number = float(lead.group(1))
        return fallback if math.isnan(number) else number

    def clamp(value: float, low: float, high: float) -> float:
        return min(high, max(low, value))

    def first_found(*names: str) -> float | None:
        # NaN as the "not found" default lets a genuine score of 0 be
        # distinguished from a missing field.
        for name in names:
            value = extract_number(text, name, math.nan)
            if not math.isnan(value):
                return value
        return None

    # Try JSON first
    obj = extract_json(text)
    if isinstance(obj, dict) and ("phi_before" in obj or "phi_after" in obj):
        reasoning = obj.get("reasoning")
        evidence = obj.get("evidence")
        return {
            "phi_before": clamp(to_float(obj.get("phi_before"), 0.0), 0.0, 10.0),
            "phi_after": clamp(to_float(obj.get("phi_after"), 0.0), 0.0, 10.0),
            "reasoning": "" if reasoning is None else str(reasoning),
            "evidence": "" if evidence is None else str(evidence),
            "confidence": clamp(to_float(obj.get("confidence"), 0.5), 0.0, 1.0),
        }

    # Extract scores from text
    phi_before = first_found("phi_before", "Φ(state_before)", "state_before")
    phi_after = first_found("phi_after", "Φ(state_after)", "state_after")

    # Try SCORE: X pattern, only when neither score was found at all
    if phi_before is None and phi_after is None:
        # The capture only accepts well-formed numbers, so float() cannot fail.
        scores = re.findall(
            r'score\s*[=:]\s*(\d+(?:\.\d+)?|\.\d+)', text, re.IGNORECASE
        )
        if len(scores) >= 2:
            phi_before = float(scores[0])
            phi_after = float(scores[1])
        elif len(scores) == 1:
            phi_after = float(scores[0])

    reasoning = extract_field(text, "reasoning")
    evidence = extract_field(text, "evidence")
    confidence = to_float(extract_number(text, "confidence", 0.5), 0.5)

    if not reasoning:
        reasoning = text[:300]
    if not evidence:
        evidence = text[300:500]

    return {
        "phi_before": clamp(0.0 if phi_before is None else phi_before, 0.0, 10.0),
        "phi_after": clamp(0.0 if phi_after is None else phi_after, 0.0, 10.0),
        "reasoning": reasoning,
        "evidence": evidence,
        "confidence": clamp(confidence, 0.0, 1.0),
    }


def parse_optimizer_response(text: str) -> dict[str, Any]:
    """
    Parse optimizer output into heuristics list.

    Strategies, tried in order:
      1. A JSON object containing a "heuristics" key (returned as-is).
      2. A JSON array of objects embedded anywhere in the text.
      3. Any JSON array spanning from the first "[" to the last "]".
      4. "pattern/when/if: ..." lines paired in order with
         "strategy/do/then/action: ..." lines.
      5. Up to five numbered list items ("1. ..." / "1) ..."), each stored
         as a strategy with the pattern "General".

    Args:
        text: Raw LLM output.

    Returns:
        A dict with a "heuristics" key. For strategies 4 and 5 every entry
        is ``{"tier": "strategic", "pattern": str, "strategy": str}``; the
        list is empty when nothing could be extracted.
    """
    obj = extract_json(text)
    if isinstance(obj, dict) and "heuristics" in obj:
        return obj

    # Try to find a JSON array of objects. Decoding at each '[' means prose
    # containing brackets before or after the array does not spoil the match.
    decoder = json.JSONDecoder()
    pos = text.find('[')
    while pos >= 0:
        try:
            arr, _ = decoder.raw_decode(text, pos)
        except (json.JSONDecodeError, ValueError):
            arr = None
        if isinstance(arr, list) and arr and all(isinstance(item, dict) for item in arr):
            return {"heuristics": arr}
        pos = text.find('[', pos + 1)

    # Fall back to the widest bracketed span (accepts any JSON array)
    m = re.search(r'\[.*\]', text, re.DOTALL)
    if m:
        try:
            arr = json.loads(m.group())
            if isinstance(arr, list):
                return {"heuristics": arr}
        except (json.JSONDecodeError, ValueError):
            pass

    # Extract from text patterns. \b keeps the keywords from matching inside
    # longer words (e.g. "do:" inside "pseudo:").
    patterns = re.findall(
        r'\b(?:pattern|when|if)\s*[:\-]\s*(.+?)(?:\n|$)', text, re.IGNORECASE
    )
    strategies = re.findall(
        r'\b(?:strategy|do|then|action)\s*[:\-]\s*(.+?)(?:\n|$)', text, re.IGNORECASE
    )
    heuristics = [
        {"tier": "strategic", "pattern": pat.strip(), "strategy": strat.strip()}
        for pat, strat in zip(patterns, strategies)
    ]

    # If nothing found, try numbered list items (anchored to the start of a
    # line so decimals in running text are not mistaken for list markers)
    if not heuristics:
        items = re.findall(r'^[ \t]*\d+[.)][ \t]*(.+?)[ \t]*$', text, re.MULTILINE)
        heuristics = [
            {"tier": "strategic", "pattern": "General", "strategy": item.strip()}
            for item in items[:5]
        ]

    return {"heuristics": heuristics}