Spaces:

lablab-ai-amd-developer-hackathon
/

ClauseGuard-AI

Sleeping

ClauseGuard-AI / clauseguard /agents /extractor.py

muhammadbinmurtza

Restructure: clauseguard as package subfolder, app_file: clauseguard/app.py

913a064 14 days ago

3.04 kB

	"""Agent 1: Extractor — segments document into individual clauses."""

	import json
	import logging
	from typing import Optional

	from clauseguard.config.prompts import EXTRACTOR_SYSTEM_PROMPT
	from clauseguard.models.clause import Clause, ClauseList
	from clauseguard.services.model_service import call_model, clean_json_response

	# Module-level logger named after this module.
	logger = logging.getLogger(__name__)

	# Minimum number of clauses an extraction must contain; _validate_clause_list
	# raises ValueError when total_clauses is below this value.
	MIN_CLAUSES = 3
	# Forwarded to call_model as max_retries for the Extractor agent call.
	MAX_RETRIES = 1


	async def run_extractor(raw_text: str, filename: str = "document") -> ClauseList:
	    """Run the Extractor agent over a contract and return its clauses.

	    The contract text is wrapped in a user prompt, sent to the model together
	    with EXTRACTOR_SYSTEM_PROMPT, and the reply is parsed and then validated.

	    Args:
	        raw_text: The raw text content of the contract.
	        filename: Name of the source file; included in the prompt for context.

	    Returns:
	        The ClauseList parsed from the model response.

	    Raises:
	        ValueError: If the text is empty or whitespace-only, if the model
	            call returns None, or if fewer than MIN_CLAUSES clauses are found.
	    """
	    # Reject missing or whitespace-only input before spending a model call.
	    if not (raw_text and raw_text.strip()):
	        raise ValueError("Document is empty or unreadable")

	    user_prompt = _build_user_prompt(raw_text, filename)
	    response = await call_model(
	        system_prompt=EXTRACTOR_SYSTEM_PROMPT,
	        user_prompt=user_prompt,
	        agent_name="Extractor",
	        max_retries=MAX_RETRIES,
	    )
	    if response is None:
	        raise ValueError("Extractor agent failed to produce a valid response")

	    extracted = _parse_response(response)
	    _validate_clause_list(extracted)
	    return extracted


	def _build_user_prompt(raw_text: str, filename: str) -> str:
	    """Return the user prompt asking the model to extract clauses.

	    Args:
	        raw_text: Contract text, placed under the "Document text:" line.
	        filename: Source file name, placed on the "Filename:" line.

	    Returns:
	        The instruction line followed by the filename and the document text.
	    """
	    # Values are substituted as-is; braces inside raw_text are not re-parsed.
	    template = """Extract all clauses from the following contract document.

	Filename: {filename}

	Document text:
	{raw_text}
	"""
	    return template.format(filename=filename, raw_text=raw_text)


	def _parse_response(content: str) -> ClauseList:
	    """Parse the LLM JSON response into a ClauseList.

	    Two payload shapes are accepted: a bare JSON array of clause objects, or
	    a JSON object holding a "clauses" array and an optional "contract_type".
	    Any other top-level value yields an empty clause list.

	    Args:
	        content: Raw model output; it is passed through clean_json_response
	            before being decoded.

	    Returns:
	        A ClauseList whose total_clauses equals the number of clauses kept.

	    Raises:
	        json.JSONDecodeError: If the cleaned response is not valid JSON
	            (a ValueError subclass; logged, then re-raised unchanged).
	    """
	    cleaned = clean_json_response(content)
	    try:
	        data = json.loads(cleaned)
	    except json.JSONDecodeError:
	        # Keep the exception type callers already see; just leave a trace.
	        logger.error("Extractor response was not valid JSON")
	        raise

	    contract_type = "Other"
	    if isinstance(data, list):
	        clauses_data = data
	    elif isinstance(data, dict):
	        clauses_data = data.get("clauses", [])
	        contract_type = data.get("contract_type", "Other")
	        if contract_type is None:
	            # An explicit JSON null is treated like a missing key.
	            contract_type = "Other"
	    else:
	        clauses_data = []

	    if not isinstance(clauses_data, list):
	        # "clauses": null / string / object cannot be iterated as clause
	        # objects; treat it as "no clauses" so validation reports it.
	        logger.warning(
	            "Extractor response 'clauses' was %s, expected a list",
	            type(clauses_data).__name__,
	        )
	        clauses_data = []

	    # Only JSON objects can be read with .get(); anything else is skipped
	    # instead of aborting the whole extraction with AttributeError.
	    clauses: list[Clause] = [
	        Clause(
	            id=entry.get("id", 0),
	            raw_text=entry.get("raw_text", ""),
	            plain_english=entry.get("plain_english"),
	            clause_type=entry.get("clause_type", "OTHER"),
	            section_heading=entry.get("section_heading"),
	            position=entry.get("position", 0),
	        )
	        for entry in clauses_data
	        if isinstance(entry, dict)
	    ]

	    skipped = len(clauses_data) - len(clauses)
	    if skipped:
	        logger.warning(
	            "Skipped %d clause entries that were not JSON objects", skipped
	        )

	    return ClauseList(
	        clauses=clauses,
	        contract_type=contract_type,
	        total_clauses=len(clauses),
	    )


	def _validate_clause_list(clause_list: ClauseList) -> None:
	    """Check that the extraction produced enough clauses.

	    Args:
	        clause_list: Extraction result; only its total_clauses is inspected.

	    Raises:
	        ValueError: If total_clauses is below MIN_CLAUSES.
	    """
	    too_few = clause_list.total_clauses < MIN_CLAUSES
	    if not too_few:
	        return

	    message = f"Document too short or unreadable — minimum {MIN_CLAUSES} clauses required"
	    raise ValueError(message)