File size: 3,037 Bytes
3552405
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""Agent 1: Extractor — segments document into individual clauses."""

import json
import logging
from typing import Optional

from clauseguard.config.prompts import EXTRACTOR_SYSTEM_PROMPT
from clauseguard.models.clause import Clause, ClauseList
from clauseguard.services.model_service import call_model, clean_json_response

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Smallest number of extracted clauses accepted by _validate_clause_list;
# anything below this makes it raise ValueError.
MIN_CLAUSES = 3
# Forwarded to call_model as its max_retries argument for the Extractor call.
MAX_RETRIES = 1


async def run_extractor(raw_text: str, filename: str = "document") -> ClauseList:
    """Run the Extractor agent over a contract and return its clauses.

    The text is wrapped in a user prompt, sent to the model together with
    EXTRACTOR_SYSTEM_PROMPT, and the reply is parsed and validated.

    Args:
        raw_text: The raw text content of the contract.
        filename: Name of the source file, included in the prompt for context.

    Returns:
        A ClauseList built from the model's reply.

    Raises:
        ValueError: If raw_text is empty or whitespace-only, if the model
            call returns None, or if fewer than MIN_CLAUSES clauses are
            found.
    """
    # Reject missing / blank input before spending a model call on it.
    has_text = bool(raw_text and raw_text.strip())
    if not has_text:
        raise ValueError("Document is empty or unreadable")

    response = await call_model(
        system_prompt=EXTRACTOR_SYSTEM_PROMPT,
        user_prompt=_build_user_prompt(raw_text, filename),
        agent_name="Extractor",
        max_retries=MAX_RETRIES,
    )
    if response is None:
        raise ValueError("Extractor agent failed to produce a valid response")

    extracted = _parse_response(response)
    _validate_clause_list(extracted)
    return extracted


def _build_user_prompt(raw_text: str, filename: str) -> str:
    """Build the user prompt with the contract text."""
    return f"""Extract all clauses from the following contract document.

Filename: {filename}

Document text:
{raw_text}
"""


def _parse_response(content: str) -> ClauseList:
    """Parse the LLM JSON response into a ClauseList.

    Two top-level shapes are accepted: a bare JSON array of clause objects,
    or a JSON object holding that array under "clauses" with an optional
    "contract_type". Any other top-level value yields an empty clause list.

    Because the payload is model output, its shape is not trusted: a
    "clauses" value that is not an array is treated as empty, and array
    entries that are not JSON objects are skipped with a warning instead of
    raising AttributeError/TypeError. The caller's minimum-clause validation
    then decides whether the result is usable.

    Args:
        content: Raw model response text; it is passed through
            clean_json_response before being decoded.

    Returns:
        A ClauseList whose total_clauses equals the number of clauses kept
        and whose contract_type defaults to "Other" when not supplied.

    Raises:
        json.JSONDecodeError: If the cleaned text is not valid JSON (this is
            a subclass of ValueError).
    """
    cleaned = clean_json_response(content)
    data = json.loads(cleaned)

    contract_type_raw = "Other"
    if isinstance(data, list):
        clauses_data = data
    elif isinstance(data, dict):
        clauses_data = data.get("clauses", [])
        contract_type_raw = data.get("contract_type", "Other")
    else:
        clauses_data = []

    if not isinstance(clauses_data, list):
        # e.g. {"clauses": null} or {"clauses": "..."} — nothing iterable as clauses.
        logger.warning(
            "Extractor response 'clauses' is %s, expected a list; treating as empty",
            type(clauses_data).__name__,
        )
        clauses_data = []

    clauses: list[Clause] = []
    for c in clauses_data:
        if not isinstance(c, dict):
            # A stray string/number/null in the array has no fields to read.
            logger.warning(
                "Skipping malformed clause entry of type %s", type(c).__name__
            )
            continue
        clauses.append(
            Clause(
                id=c.get("id", 0),
                raw_text=c.get("raw_text", ""),
                plain_english=c.get("plain_english"),
                clause_type=c.get("clause_type", "OTHER"),
                section_heading=c.get("section_heading"),
                position=c.get("position", 0),
            )
        )

    return ClauseList(
        clauses=clauses,
        contract_type=contract_type_raw,
        total_clauses=len(clauses),
    )


def _validate_clause_list(clause_list: ClauseList) -> None:
    """Check that enough clauses were extracted to be worth processing.

    Args:
        clause_list: The parsed extraction result; only its total_clauses
            attribute is inspected.

    Raises:
        ValueError: If fewer than MIN_CLAUSES clauses are found.
    """
    too_few = clause_list.total_clauses < MIN_CLAUSES
    if not too_few:
        return

    message = f"Document too short or unreadable — minimum {MIN_CLAUSES} clauses required"
    raise ValueError(message)