File size: 1,527 Bytes
7ff7119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
"""drop_repeats_node — 70% word-overlap dedup between LLM and basic risks.

Drops the "same thing in different words" duplicates.

Input:
    {"llm_risks_raw": list[Risk], "basic_risks": list[Risk], ...}

Output:
    {"risks": list[Risk]}  # final, filtered LLM risk list — merged into the
                             parent state's ``risks`` reducer
"""

from __future__ import annotations

from graph.states.pipeline_state import Risk
from nodes.risk.filter_llm_risks_node import _dict_to_risk, _risk_to_dict
from validation.llm_risk_filters import drop_repeats_of_basic


async def drop_repeats_node(state: dict) -> dict:
    """Drop LLM risks that overlap >=70% in content words with a basic risk.

    After this node, ``llm_risks_raw`` is published into ``risks``, where the
    ``merge_risks`` reducer dedups it back into the parent state — closing
    the LLM risk-analysis chain.
    """
    llm_risks = state.get("llm_risks_raw") or []
    if not llm_risks:
        # Nothing to filter — emit no state update at all.
        return {}
    basic_risks = state.get("basic_risks") or []

    def _basic_as_dict(item) -> dict:
        # Basic risks may arrive as Risk objects, plain dicts, or anything
        # else; normalize each to a dict carrying a "description" key so the
        # overlap filter can compare content words.
        if isinstance(item, Risk):
            return _risk_to_dict(item)
        if isinstance(item, dict):
            return {"description": item.get("description", "")}
        return {"description": ""}

    kept_dicts = drop_repeats_of_basic(
        [_risk_to_dict(risk) for risk in llm_risks],
        [_basic_as_dict(item) for item in basic_risks],
    )

    # Close the chain: write the result under ``risks``, where merge_risks
    # dedups it into the parent state.
    return {"risks": [_dict_to_risk(d) for d in kept_dicts]}