File size: 3,037 Bytes
"""Agent 1: Extractor — segments document into individual clauses."""
import json
import logging
from typing import Optional
from clauseguard.config.prompts import EXTRACTOR_SYSTEM_PROMPT
from clauseguard.models.clause import Clause, ClauseList
from clauseguard.services.model_service import call_model, clean_json_response
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Minimum number of extracted clauses; _validate_clause_list raises ValueError
# when a ClauseList reports fewer than this.
MIN_CLAUSES = 3
# Forwarded to call_model as its max_retries argument for the Extractor call.
MAX_RETRIES = 1
async def run_extractor(raw_text: str, filename: str = "document") -> ClauseList:
    """Run the Extractor agent over a contract and return its clauses.

    The contract text is wrapped into a user prompt, sent to the model via
    ``call_model``, and the reply is parsed and validated before being
    returned.

    Args:
        raw_text: The raw text content of the contract.
        filename: Name of the source file, included in the prompt for context.

    Returns:
        The ClauseList parsed from the model response.

    Raises:
        ValueError: If ``raw_text`` is empty or whitespace-only, if the model
            call returns ``None``, or if validation finds fewer than
            MIN_CLAUSES clauses. Errors raised while parsing the response
            are not caught here and propagate to the caller.
    """
    # Reject missing or whitespace-only input before spending a model call.
    if not (raw_text and raw_text.strip()):
        raise ValueError("Document is empty or unreadable")

    user_prompt = _build_user_prompt(raw_text, filename)
    request = {
        "system_prompt": EXTRACTOR_SYSTEM_PROMPT,
        "user_prompt": user_prompt,
        "agent_name": "Extractor",
        "max_retries": MAX_RETRIES,
    }
    response = await call_model(**request)

    # call_model signals failure by returning None.
    if response is None:
        raise ValueError("Extractor agent failed to produce a valid response")

    extracted = _parse_response(response)
    _validate_clause_list(extracted)
    return extracted
def _build_user_prompt(raw_text: str, filename: str) -> str:
    """Assemble the user prompt that carries the contract text.

    Args:
        raw_text: The contract text to embed in the prompt.
        filename: Name of the source file, shown on its own line.

    Returns:
        A newline-terminated prompt: an instruction line, the filename line,
        a "Document text:" marker, and then the contract text.
    """
    prompt_lines = (
        "Extract all clauses from the following contract document.",
        f"Filename: {filename}",
        "Document text:",
        f"{raw_text}",
        "",  # trailing empty element yields the final newline
    )
    return "\n".join(prompt_lines)
def _get_or_default(entry: dict, key: str, default=None):
    """Return ``entry[key]``, or ``default`` when the key is missing or null."""
    value = entry.get(key)
    return default if value is None else value


def _parse_response(content: str) -> ClauseList:
    """Parse the LLM JSON response into a ClauseList.

    The content is passed through ``clean_json_response`` and decoded as
    JSON. Two top-level shapes are understood: a bare array of clause
    objects, or an object with a ``"clauses"`` array and an optional
    ``"contract_type"``. Any other top-level value yields an empty list.

    Model output is not guaranteed to follow the requested schema, so the
    parser is tolerant of common deviations:

    * a ``"clauses"`` value that is not a list is treated as empty;
    * clause entries that are not JSON objects are skipped with a warning
      instead of raising ``AttributeError``;
    * fields that are missing *or* explicitly ``null`` fall back to their
      defaults (``dict.get`` alone would return the stored ``None``).

    Args:
        content: Raw text returned by the model.

    Returns:
        A ClauseList whose ``total_clauses`` equals the number of clauses
        kept and whose ``contract_type`` defaults to ``"Other"``.

    Raises:
        json.JSONDecodeError: If the cleaned content is not valid JSON.
            This is a subclass of ValueError.
    """
    cleaned = clean_json_response(content)
    data = json.loads(cleaned)

    if isinstance(data, list):
        clauses_data = data
    elif isinstance(data, dict):
        clauses_data = data.get("clauses")
    else:
        clauses_data = None

    if not isinstance(clauses_data, list):
        # A missing/null value is simply "no clauses"; anything else is a
        # schema violation worth surfacing in the logs.
        if clauses_data is not None:
            logger.warning(
                "Extractor response has a non-list 'clauses' value (%s); "
                "treating it as empty",
                type(clauses_data).__name__,
            )
        clauses_data = []

    clauses: list[Clause] = []
    for index, entry in enumerate(clauses_data):
        if not isinstance(entry, dict):
            logger.warning(
                "Skipping malformed clause entry at index %d: expected an "
                "object, got %s",
                index,
                type(entry).__name__,
            )
            continue
        clauses.append(
            Clause(
                id=_get_or_default(entry, "id", 0),
                raw_text=_get_or_default(entry, "raw_text", ""),
                plain_english=entry.get("plain_english"),
                clause_type=_get_or_default(entry, "clause_type", "OTHER"),
                section_heading=entry.get("section_heading"),
                position=_get_or_default(entry, "position", 0),
            )
        )

    if isinstance(data, dict):
        contract_type = _get_or_default(data, "contract_type", "Other")
    else:
        contract_type = "Other"

    return ClauseList(
        clauses=clauses,
        contract_type=contract_type,
        total_clauses=len(clauses),
    )
def _validate_clause_list(clause_list: ClauseList) -> None:
    """Check that an extracted clause list is large enough to be usable.

    Args:
        clause_list: The ClauseList to check; only its ``total_clauses``
            attribute is read.

    Raises:
        ValueError: If ``total_clauses`` is below MIN_CLAUSES.
    """
    too_few = clause_list.total_clauses < MIN_CLAUSES
    if not too_few:
        return

    message = (
        f"Document too short or unreadable — minimum {MIN_CLAUSES} clauses required"
    )
    raise ValueError(message)
|