# File size: 12,512 Bytes

"""
robust_parser.py — Universal LLM output parser. Handles JSON, TOML, or free text.

TOML is the preferred output format because:
  - ~40% fewer tokens than JSON (no quotes on keys, no braces, no commas)
  - More natural for LLMs to generate (looks like config files)
  - Python 3.11+ has tomllib in stdlib; we include a minimal fallback parser

The parser tries in order: TOML → JSON → field extraction → regex fallback.
Always returns something usable. Never crashes.
"""
from __future__ import annotations

import json
import re
import logging
from typing import Any

logger = logging.getLogger(__name__)


def _parse_toml_minimal(text: str) -> dict[str, Any] | None:
    """
    Minimal TOML parser for LLM output. Handles the subset LLMs actually produce:
      key = "value"
      key = number
      [section]
      key = "value"
    
    Uses stdlib tomllib (Python 3.11+) with fallback regex parser.
    """
    try:
        import tomllib
        return tomllib.loads(text)
    except ImportError:
        pass
    except Exception:
        pass

    # Fallback: regex-based TOML subset parser
    result = {}
    current_section = result
    section_path = []

    for line in text.split('\n'):
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        # Section header: [section] or [section.subsection]
        sec_match = re.match(r'^\[([^\]]+)\]$', line)
        if sec_match:
            parts = sec_match.group(1).split('.')
            current_section = result
            for part in parts:
                part = part.strip()
                if part not in current_section:
                    current_section[part] = {}
                current_section = current_section[part]
            section_path = parts
            continue

        # Key = value
        kv_match = re.match(r'^(\w+)\s*=\s*(.+)$', line)
        if kv_match:
            key = kv_match.group(1).strip()
            val = kv_match.group(2).strip()
            # Parse value type
            if val.startswith('"') and val.endswith('"'):
                val = val[1:-1].replace('\\n', '\n').replace('\\"', '"')
            elif val.startswith("'") and val.endswith("'"):
                val = val[1:-1]
            elif val.lower() in ('true', 'false'):
                val = val.lower() == 'true'
            else:
                try:
                    val = float(val) if '.' in val else int(val)
                except ValueError:
                    pass  # Keep as string
            current_section[key] = val

    return result if result else None


def extract_structured(text: str) -> dict[str, Any] | None:
    """
    Pull a structured mapping out of LLM text.

    TOML is attempted first, but only when at least one line starts with a
    ``key =`` assignment; if the text contains a code fence (bare or tagged
    ``toml``), only the fenced content is handed to the TOML parser. When TOML
    yields nothing, the text is passed to the JSON extractor, whose result
    (possibly None) is returned.
    """
    stripped = text.strip()

    looks_like_toml = re.search(r'^\w+\s*=\s*', stripped, re.MULTILINE) is not None
    if looks_like_toml:
        fenced = re.search(r'```(?:toml)?\s*\n(.*?)```', stripped, re.DOTALL)
        candidate = stripped if fenced is None else fenced.group(1)
        parsed = _parse_toml_minimal(candidate)
        if parsed:
            return parsed

    # TOML was not applicable or produced nothing: fall back to JSON.
    return extract_json(stripped)


def extract_json(text: str) -> dict[str, Any] | None:
    """
    Try to extract a JSON object from arbitrary LLM text.

    Strategies, in order:
      1. The whole text is a JSON object.
      2. A JSON object inside a markdown code fence (bare or tagged ``json``).
      3. The first ``{`` in the text from which a complete JSON object can be
         decoded, scanning left to right.

    Only JSON *objects* are returned. Valid JSON of another type (number,
    string, array, null) is treated as "not found" so callers can rely on
    getting a dict or None.

    Returns:
        The decoded dict, or None if no valid JSON object is found.
    """
    text = text.strip()

    # Strategy 1: Entire text is JSON
    try:
        whole = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        whole = None
    if isinstance(whole, dict):
        return whole

    # Strategy 2: JSON in markdown code block
    fenced = re.search(r'```(?:json)?\s*(\{.*\})\s*```', text, re.DOTALL)
    if fenced:
        try:
            candidate = json.loads(fenced.group(1))
        except ValueError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate

    # Strategy 3: JSON object embedded in prose. raw_decode parses one value
    # starting at a given index and ignores trailing text, so braces inside
    # string literals are handled by the real JSON grammar, and a failed
    # candidate (e.g. "{x}" used as a placeholder) does not stop the search.
    decoder = json.JSONDecoder()
    start = text.find('{')
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(text, start)
        except ValueError:
            pass
        else:
            if isinstance(candidate, dict):
                return candidate
        start = text.find('{', start + 1)

    return None


def extract_field(text: str, field_name: str, default: str = "") -> str:
    """
    Extract a named field value from LLM text, regardless of format.

    Lookup order:
      1. Structured parse (TOML, then JSON): a top-level key equal to
         ``field_name``, or failing that a key that matches case-insensitively.
      2. Regex patterns, case-insensitive, first match wins:
           - JSON string:    "field": "value"
           - JSON number:    "field": 12.5
           - Bold label:     **field:** value
           - Markdown/label: **field**: value / field: value
           - Assignment:     field = value / field: value

    ``field_name`` is matched literally; regex metacharacters in it (such as
    the parentheses in ``Φ(state_before)``) have no special meaning.

    Args:
        text: Raw LLM output.
        field_name: Name of the field to look for.
        default: Value returned when the field is not found.

    Returns:
        The field value as a string (surrounding whitespace and quotes
        stripped for regex matches), or ``default``.
    """
    # Try structured parse first (TOML → JSON)
    obj = extract_structured(text)
    if isinstance(obj, dict):
        if field_name in obj:
            return str(obj[field_name])
        # The regex patterns below ignore case; do the same for parsed keys.
        name_lower = field_name.lower()
        for key, value in obj.items():
            if isinstance(key, str) and key.lower() == name_lower:
                return str(value)

    # Escape the name so characters like "(" or "." are matched literally
    # instead of silently changing the pattern (and its capture groups).
    name = re.escape(field_name)
    patterns = [
        rf'"{name}"\s*:\s*"((?:[^"\\]|\\.)*)"',          # JSON string
        rf'"{name}"\s*:\s*(\d+\.?\d*)',                   # JSON number
        rf'\*\*{name}\s*:\s*\*\*\s*(.+?)(?:\n|$)',        # **field:** value
        rf'\*?\*?{name}\*?\*?\s*:\s*(.+?)(?:\n|$)',       # Markdown/label
        rf'{name}\s*[=:]\s*(.+?)(?:\n|$)',                # Assignment
    ]
    for pattern in patterns:
        m = re.search(pattern, text, re.IGNORECASE)
        if m:
            return m.group(1).strip().strip('"').strip("'")

    return default


def extract_number(text: str, field_name: str, default: float = 0.0) -> float:
    """
    Extract a numeric field from LLM text.

    First asks ``extract_field`` for the field's value and tries to convert it
    (ignoring trailing '.' or ',' punctuation). If that fails, for example
    because the value is "7/10" or "8.5 out of 10", it looks for a number that
    directly follows ``field_name =`` or ``field_name:`` in the text.

    Args:
        text: Raw LLM output.
        field_name: Name of the field; matched literally, case-insensitively.
        default: Value returned when no number can be extracted.

    Returns:
        The extracted number, or ``default``.
    """
    val = extract_field(text, field_name)
    if val:
        try:
            return float(val.rstrip('.,'))
        except (ValueError, TypeError):
            pass

    # Try direct pattern: field_name = X.X or field_name: X.X
    # The name is escaped so metacharacters in it are matched literally, and
    # the number pattern only accepts one well-formed (optionally negative)
    # decimal instead of an arbitrary run of digits and dots.
    m = re.search(
        rf'{re.escape(field_name)}\s*[=:]\s*(-?(?:\d+(?:\.\d*)?|\.\d+))',
        text,
        re.IGNORECASE,
    )
    if m:
        try:
            return float(m.group(1))
        except ValueError:
            pass

    return default


def extract_code(text: str) -> str:
    """
    Extract Python code from LLM text.

    Strategies, in order:
      1. A JSON object carrying the code, either nested at
         ``action.params.code`` or as a top-level ``code`` key.
      2. The first markdown code fence (bare or tagged ``python``).
      3. Raw code: starts at the first line beginning with ``def``, ``class``,
         ``import`` or ``from`` and continues while lines are blank, indented,
         comments, or begin with one of those keywords. The first other
         unindented line ends the code.

    Returns:
        The extracted code, or "" when nothing that looks like code is found.
    """
    # Strategy 1: JSON with code field
    obj = extract_json(text)
    # Guard on dict: valid JSON may also be a list/number/string, which has
    # no .get() and must not be treated as a response object.
    if isinstance(obj, dict) and obj:
        # Nested: action.params.code
        action = obj.get("action", {})
        if isinstance(action, dict):
            params = action.get("params", {})
            if isinstance(params, dict) and "code" in params:
                return params["code"]
        if "code" in obj:
            return obj["code"]

    # Strategy 2: Python code block
    m = re.search(r'```(?:python)?\s*\n(.*?)```', text, re.DOTALL)
    if m:
        return m.group(1).strip()

    # Strategy 3: Find code starting with def/class/import/from
    code_lines: list[str] = []
    for line in text.split('\n'):
        stripped = line.strip()
        if not code_lines:
            # Still skipping leading prose until the first code-like line.
            if re.match(r'^(def |class |import |from )', stripped):
                code_lines.append(line)
            continue

        continues_code = (
            stripped == ''
            or line.startswith((' ', '\t'))
            or re.match(r'^(def |class |import |from |#)', stripped) is not None
        )
        if not continues_code:
            break  # Unindented non-code text: the code section is over
        code_lines.append(line)

    if code_lines:
        return '\n'.join(code_lines).strip()

    return ""


def parse_actor_response(text: str) -> dict[str, Any]:
    """
    Parse an actor's response into thought/action/expected_delta.
    Works with TOML, JSON, or free text.

    If the text parses as a structured object containing an ``action`` or
    ``thought`` key, that object is returned unchanged. Otherwise the fields
    are extracted one by one from the free text.

    Returns:
        Either the parsed structured object, or a dict with keys:
          - "thought": extracted thought, or the first line of the text
            (at most 200 characters) when none is found
          - "action": {"name": ..., "params": {...}}; the name is "UNKNOWN"
            when no name is found, or "submit_code" when no name is found but
            code is; found code is stored under params["code"]
          - "expected_delta": extracted value or ""
    """
    # Try structured parse (TOML → JSON)
    obj = extract_structured(text)
    if isinstance(obj, dict) and ("action" in obj or "thought" in obj):
        return obj

    # Extract fields individually
    thought = extract_field(text, "thought")
    expected_delta = extract_field(text, "expected_delta")

    # Extract action name
    action_name = extract_field(text, "name", "")
    if not action_name:
        action_name = extract_field(text, "action", "")
        if action_name and action_name.startswith("{"):
            action_name = ""  # It's a JSON object, not a name

    # Extract code if this is a coding task
    code = extract_code(text)

    # Build action
    action: dict[str, Any] = {"name": action_name or "UNKNOWN", "params": {}}
    if code:
        # Code without a recognizable action name is a code submission.
        if action["name"] == "UNKNOWN":
            action["name"] = "submit_code"
        action["params"]["code"] = code

    if not thought:
        # Use the first line as thought
        thought = text.split('\n')[0][:200] if text else ""

    return {
        "thought": thought,
        "action": action,
        "expected_delta": expected_delta or "",
    }


def _clamped_float(value: Any, default: float, low: float, high: float) -> float:
    """Convert ``value`` to a float in [low, high]; use ``default`` if it is not numeric."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        number = default
    if number != number:  # NaN compares unequal to itself
        number = default
    return min(high, max(low, number))


def parse_critic_response(text: str) -> dict[str, Any]:
    """
    Parse a critic's response into phi_before/phi_after/reasoning/evidence/confidence.
    Works with TOML, JSON, or free text.

    When the text parses as a structured object containing ``phi_before`` or
    ``phi_after``, values are read from it. Otherwise each field is extracted
    from the free text, trying the alternative labels ``Φ(state_before)`` /
    ``state_before`` (and the ``after`` equivalents) and finally bare
    ``score: X`` occurrences (two or more: before and after; one: after).

    Returns:
        A dict with keys:
          - "phi_before", "phi_after": floats clamped to [0.0, 10.0]
            (0.0 when missing or not numeric)
          - "reasoning": extracted text, or the first 300 characters of the
            response on the free-text path
          - "evidence": extracted text, or characters 300-500 of the response
            on the free-text path
          - "confidence": float clamped to [0.0, 1.0], default 0.5
    """
    # Try structured parse (TOML → JSON)
    obj = extract_structured(text)
    if isinstance(obj, dict) and ("phi_before" in obj or "phi_after" in obj):
        # Values come from model output, so they may be strings like "7/10"
        # or null; convert defensively and clamp like the text path does.
        return {
            "phi_before": _clamped_float(obj.get("phi_before", 0), 0.0, 0.0, 10.0),
            "phi_after": _clamped_float(obj.get("phi_after", 0), 0.0, 0.0, 10.0),
            "reasoning": str(obj.get("reasoning", "")),
            "evidence": str(obj.get("evidence", "")),
            "confidence": _clamped_float(obj.get("confidence", 0.5), 0.5, 0.0, 1.0),
        }

    # Extract scores from text (0.0 doubles as the "not found" marker)
    phi_before = extract_number(text, "phi_before", 0.0)
    if phi_before == 0.0:
        phi_before = extract_number(text, "Φ(state_before)", 0.0)
    if phi_before == 0.0:
        phi_before = extract_number(text, "state_before", 0.0)

    phi_after = extract_number(text, "phi_after", 0.0)
    if phi_after == 0.0:
        phi_after = extract_number(text, "Φ(state_after)", 0.0)
    if phi_after == 0.0:
        phi_after = extract_number(text, "state_after", 0.0)

    # Try SCORE: X pattern
    if phi_before == 0.0 and phi_after == 0.0:
        # Only well-formed decimals are captured, so float() cannot fail on
        # input such as "Score: ..." or "Score: 1.2.3".
        scores = re.findall(
            r'(?:score|SCORE|Score)\s*[=:]\s*(\d+(?:\.\d*)?|\.\d+)', text
        )
        if len(scores) >= 2:
            phi_before = float(scores[0])
            phi_after = float(scores[1])
        elif len(scores) == 1:
            phi_after = float(scores[0])

    reasoning = extract_field(text, "reasoning")
    evidence = extract_field(text, "evidence")
    confidence = extract_number(text, "confidence", 0.5)

    if not reasoning:
        reasoning = text[:300]
    if not evidence:
        evidence = text[300:500] if len(text) > 300 else ""

    return {
        "phi_before": _clamped_float(phi_before, 0.0, 0.0, 10.0),
        "phi_after": _clamped_float(phi_after, 0.0, 0.0, 10.0),
        "reasoning": reasoning,
        "evidence": evidence,
        "confidence": _clamped_float(confidence, 0.5, 0.0, 1.0),
    }


def parse_optimizer_response(text: str) -> dict[str, Any]:
    """
    Parse optimizer output into heuristics list.

    Strategies, in order:
      1. A JSON object that has a ``heuristics`` key: returned unchanged.
      2. A JSON array anywhere in the text (first ``[`` to last ``]``):
         wrapped as ``{"heuristics": array}``.
      3. ``pattern/when/if: ...`` lines paired in order with
         ``strategy/do/then/action: ...`` lines.
      4. Up to five numbered list items ("1. ..."), each used as a strategy
         with the pattern "General".

    Heuristics built by strategies 3 and 4 have the keys "tier" (always
    "strategic"), "pattern" and "strategy".

    Returns:
        A dict with a "heuristics" key; the list is empty when nothing
        could be extracted.
    """
    obj = extract_json(text)
    # Guard on dict: a scalar JSON value would make the `in` test raise, and a
    # JSON string containing "heuristics" would be returned by mistake.
    if isinstance(obj, dict) and "heuristics" in obj:
        return obj

    # Try to find a JSON array
    m = re.search(r'\[.*\]', text, re.DOTALL)
    if m:
        try:
            arr = json.loads(m.group())
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            arr = None
        if isinstance(arr, list):
            return {"heuristics": arr}

    # Extract from text patterns
    patterns = re.findall(r'(?:pattern|when|if)\s*[:\-]\s*(.+?)(?:\n|$)', text, re.IGNORECASE)
    strategies = re.findall(r'(?:strategy|do|then|action)\s*[:\-]\s*(.+?)(?:\n|$)', text, re.IGNORECASE)
    heuristics = [
        {"tier": "strategic", "pattern": pat.strip(), "strategy": strat.strip()}
        for pat, strat in zip(patterns, strategies)
    ]

    # If nothing found, try numbered list items
    if not heuristics:
        items = re.findall(r'\d+\.\s*(.+?)(?:\n|$)', text)
        heuristics = [
            {"tier": "strategic", "pattern": "General", "strategy": item.strip()}
            for item in items[:5]
        ]

    return {"heuristics": heuristics}