paperhawk/nodes/risk/drop_repeats_node.py
Nándorfi Vince
Initial paperhawk push to HF Space (LFS for binaries)
7ff7119
"""drop_repeats_node — 70% word-overlap dedup between LLM and basic risks.
Drops the "same thing in different words" duplicates.
Input:
{"llm_risks_raw": list[Risk], "basic_risks": list[Risk], ...}
Output:
{"risks": list[Risk]} # final, filtered LLM risk list — merged into the
parent state's ``risks`` reducer
"""
from __future__ import annotations

from graph.states.pipeline_state import Risk
from nodes.risk.filter_llm_risks_node import _dict_to_risk, _risk_to_dict
from validation.llm_risk_filters import drop_repeats_of_basic

async def drop_repeats_node(state: dict) -> dict:
    """Drop LLM risks that overlap >=70% in content words with a basic risk.

    After this node, ``llm_risks_raw`` is published into ``risks``, where the
    ``merge_risks`` reducer dedups it back into the parent state — closing
    the LLM risk-analysis chain.
    """
    raw = state.get("llm_risks_raw") or []
    basic = state.get("basic_risks") or []
    if not raw:
        return {}

    raw_dicts = [_risk_to_dict(r) for r in raw]
    basic_dicts = [
        _risk_to_dict(b) if isinstance(b, Risk)
        else {"description": b.get("description", "") if isinstance(b, dict) else ""}
        for b in basic
    ]
    filtered_dicts = drop_repeats_of_basic(raw_dicts, basic_dicts)
    filtered = [_dict_to_risk(d) for d in filtered_dicts]

    # Close the chain: write the result under ``risks``, where merge_risks
    # dedups it into the parent state.
    return {"risks": filtered}