Spaces:

lablab-ai-amd-developer-hackathon
/

paperhawk

Running

File size: 3,895 Bytes

7ff7119

"""validator_node — source citation check (anti-hallucination).

Verifies that:
  1. At least 1 tool call ran (otherwise skip — e.g. "thanks" message)
  2. The final_answer contains a [Source: X] citation or filename mention
  3. The answer is at least 20 chars
  4. The cited filenames actually appear in the tool outputs

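If any of these checks fails and ``retry_count < settings.validator_max_retries``,
control goes back to the agent with a HumanMessage that names the failed check
and ends with: "Please re-call the tools and include [Source: filename.pdf]
citations." Once the retry budget is used up, the answer is let through.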
"""

from __future__ import annotations

import re

from langchain_core.messages import HumanMessage, ToolMessage

from config import settings
from graph.states.chat_state import ChatState


_FILENAME_PATTERN = re.compile(r"\b([\w_\-]+\.(?:pdf|docx|png|jpg|jpeg|txt))\b", re.I)
_SOURCE_PATTERN = re.compile(r"\[(?:Source|Forrás)[:\s]+([^\]]+)\]", re.I)


async def validator_node(state: ChatState) -> dict:
    """Check the final_answer for source citations.

    Checks run in this order, and only when at least one ToolMessage is
    present in ``state["messages"]`` (otherwise validation is skipped):

      1. The stripped answer has at least 20 characters.
      2. The answer contains a ``[Source: ...]`` citation or a bare filename.
      3. Every cited name occurs in a filename found in the tool outputs.

    Args:
        state: Chat state; reads ``final_answer``, ``messages`` and
            ``validator_retry_count`` (missing or ``None`` values are treated
            as empty / zero).

    Returns:
        A partial state update. On a failed check with retries remaining this
        is the dict built by ``_retry`` (new HumanMessage, incremented
        counter, trace entry). Otherwise it holds a ``trace`` entry and
        ``sources_cited``: the lower-cased, de-duplicated citations that were
        verified, in order of first appearance (empty when a check failed and
        the retry budget is exhausted before any citation could be verified).
    """
    # ``or`` rather than a .get() default: a key that is present but None
    # must not crash .strip() or the retry comparison below.
    final_answer = state.get("final_answer") or ""
    messages = state.get("messages") or []
    retry_count = state.get("validator_retry_count") or 0
    can_retry = retry_count < settings.validator_max_retries

    # Was there a tool call?
    tool_msgs = [m for m in messages if isinstance(m, ToolMessage)]
    if not tool_msgs:
        # No tool — plain chat answer, no source check
        return {
            "trace": ["validator: no tool call → skipped"],
            "sources_cited": [],
        }

    # 1. At least 20 chars
    if len(final_answer.strip()) < 20:
        if can_retry:
            return _retry(state, retry_count, "The answer is too short (< 20 chars).")
        # Max retry → let it through
        return {
            "trace": ["validator: too short, but max retry → end"],
            "sources_cited": [],
        }

    # 2. Source citation check
    source_matches = _SOURCE_PATTERN.findall(final_answer)
    filename_mentions = _FILENAME_PATTERN.findall(final_answer)

    if not source_matches and not filename_mentions:
        if can_retry:
            return _retry(state, retry_count, "Missing source citation in [Source: filename] format.")
        return {
            "trace": ["validator: no source citation, but max retry → end"],
            "sources_cited": [],
        }

    # 3. Do the cited filenames actually appear in the tool outputs?
    available_files = {
        name.lower()
        for tm in tool_msgs
        for name in _FILENAME_PATTERN.findall(str(tm.content))
    }

    raw_citations: list[str] = []
    for citation in source_matches:
        embedded = _FILENAME_PATTERN.findall(citation)
        if embedded:
            # The citation names real filenames: validate those and ignore
            # locator text such as "page 3" in "[Source: a.pdf, page 3]".
            raw_citations.extend(embedded)
        else:
            # No recognisable filename (e.g. extension omitted): fall back to
            # the comma/semicolon separated parts of the citation text.
            raw_citations.extend(
                part.strip() for part in re.split(r"[,;]", citation) if part.strip()
            )
    raw_citations.extend(filename_mentions)

    # De-duplicate case-insensitively, keeping first spelling and order; a
    # filename inside "[Source: ...]" is otherwise counted twice (once as a
    # citation, once as a bare mention).
    first_spelling: dict[str, str] = {}
    for name in raw_citations:
        first_spelling.setdefault(name.lower(), name)
    cited_files = list(first_spelling.values())

    # Substring containment (which includes equality) keeps the lenient match
    # for partial names such as "report" against "report.pdf".
    invalid_citations = [
        c for c in cited_files
        if not any(c.lower() in af for af in available_files)
    ]

    if invalid_citations:
        if can_retry:
            return _retry(state, retry_count,
                          f"Cited filenames are not in the tool results: {invalid_citations}")
        # Max retry → let the answer through, but do not report names that
        # never appeared in the tool outputs as cited sources.
        return {
            "trace": [
                f"validator: invalid citations {invalid_citations[:3]}, but max retry → end"
            ],
            "sources_cited": [
                c.lower() for c in cited_files if c not in invalid_citations
            ],
        }

    return {
        "trace": [f"validator: ok (sources: {cited_files[:3]})"],
        "sources_cited": [c.lower() for c in cited_files],
    }


def _retry(state: ChatState, retry_count: int, reason: str) -> dict:
    """Build the state update that sends the agent back for another attempt.

    Args:
        state: Current chat state (not read by this helper).
        retry_count: Number of validator retries already used.
        reason: Human-readable explanation of the failed check; it is
            embedded in both the message to the agent and the trace entry.

    Returns:
        A dict with a corrective HumanMessage under ``messages``, the
        incremented ``validator_retry_count`` and a ``trace`` entry.
    """
    next_attempt = retry_count + 1
    feedback = (
        f"Your answer is not acceptable: {reason} "
        "Please re-call the tools and include [Source: filename.pdf] citations."
    )
    trace_entry = f"validator: retry {next_attempt} ({reason})"

    update: dict = {"messages": [HumanMessage(content=feedback)]}
    update["validator_retry_count"] = next_attempt
    update["trace"] = [trace_entry]
    return update