File size: 3,912 Bytes
3552405
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""Clause processing utility functions."""

import re
from typing import List


def split_into_clauses(text: str) -> List[str]:
    """Break a contract document into a flat list of clause strings.

    The text is first cut into sections by ``_split_by_numbered_headings``
    and every section is then cut again on blank lines by
    ``_split_by_double_newlines``. Fragments containing fewer than five
    whitespace-separated words are discarded.

    Args:
        text: The full text of the contract document.

    Returns:
        The surviving clause strings, in document order. An empty list is
        returned when ``text`` is empty or contains only whitespace.
    """
    if not (text and text.strip()):
        return []

    kept: List[str] = []
    for section in _split_by_numbered_headings(text):
        for fragment in _split_by_double_newlines(section):
            # A fragment with at least five words is necessarily non-empty,
            # so this single check also filters out blank fragments.
            if len(fragment.split()) >= 5:
                kept.append(fragment)
    return kept


# Delimiters that introduce a new section. The capital letter that follows a
# numbered marker is only *looked at* (lookahead), never consumed, so it stays
# part of the clause text after splitting.
_HEADING_SPLIT_RE = re.compile(
    r"""
    (?:
        # Article/Section keyword followed by a number, at the start of a line.
        (?<=\n)\s*(?:Article|Section|SECTION|ARTICLE)\s+\d+[\.:\s]
      |
        # Numbered markers at the start of a line, followed by a capital:
        # arabic with dot or parenthesis, decimal sub-section, roman numeral.
        \n\s*(?:
            \d+[\.\)]\s*(?=[A-Z])
          | \d+\.\d+\s+(?=[A-Z])
          | [IVX]+\.\s+(?=[A-Z])
        )
      |
        # A stand-alone ALL CAPS heading line (at least 11 characters).
        \n\s*[A-Z][A-Z\s]{10,}\n
    )
    """,
    re.VERBOSE,
)


def _split_by_numbered_headings(text: str) -> List[str]:
    """Split text at numbered section markers and ALL CAPS heading lines.

    The markers themselves (e.g. ``1.``, ``2)``, ``1.1``, ``IV.``,
    ``Section 3:`` or an ALL CAPS heading line) act as delimiters and are
    removed from the output; the text that follows each marker is kept
    intact, including its first letter.

    Args:
        text: The text to split.

    Returns:
        The stripped, non-empty pieces found between markers, in order.
    """
    trimmed = (piece.strip() for piece in _HEADING_SPLIT_RE.split(text))
    return [piece for piece in trimmed if piece]


def _split_by_double_newlines(text: str) -> List[str]:
    """Cut text into paragraphs wherever a blank line occurs.

    A blank line is a newline, optional whitespace, and another newline.

    Args:
        text: The text to split.

    Returns:
        The stripped paragraphs in order, with empty ones dropped.
    """
    chunks = re.split(r"\n\s*\n", text)
    # filter(None, ...) discards chunks that are empty once stripped.
    return list(filter(None, map(str.strip, chunks)))


def clean_text(text: str) -> str:
    """Clean and normalize text by removing excessive whitespace.

    Normalization steps, in order:

    1. Convert ``\\r\\n`` and bare ``\\r`` line endings to ``\\n``.
    2. Collapse every run of spaces and/or tabs into a single space.
    3. Remove spaces adjacent to newlines (trailing and leading per line).
    4. Collapse three or more consecutive newlines into exactly two.
    5. Strip leading and trailing whitespace from the whole text.

    Args:
        text: Raw text to clean.

    Returns:
        Cleaned and normalized text, or ``""`` when ``text`` is empty.
    """
    if not text:
        return ""

    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Tabs and spaces are collapsed together so mixed runs such as " \t "
    # become a single space rather than several.
    text = re.sub(r"[ \t]+", " ", text)
    # After the previous step at most one space can sit on either side of a
    # newline; dropping it turns whitespace-only lines into truly empty ones.
    text = re.sub(r" ?\n ?", "\n", text)
    # Must run last: only now are blank lines adjacent newlines.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


# Lower-case indicator phrases for each supported contract type. Insertion
# order matters: when two types reach the same score, the one listed first wins.
_CONTRACT_KEYWORDS: dict[str, List[str]] = {
    "NDA": ["non-disclosure", "confidential", "confidentiality", "trade secret", "nda", "non disclosure"],
    "Employment": ["employment", "employee", "salary", "benefits", "at-will", "at will", "offer letter"],
    "Freelance": ["freelance", "independent contractor", "consultant", "statement of work", "contractor"],
    "SaaS": ["software as a service", "subscription", "saas", "service level agreement", "sla", "license"],
}

# Keywords this short are acronyms ("nda", "sla", "saas"); they must match as
# whole words because they occur inside many unrelated words
# ("standard", "calendar", "legislation").
_ACRONYM_MAX_LEN = 4


def _keyword_regex(keyword: str) -> str:
    """Return the regex source used to find *keyword* in lower-cased text.

    Every keyword must begin at a word boundary, so "at will" no longer
    matches inside "that will". Longer keywords may be followed by more
    letters (so "employee" still matches "employees" and "freelance" matches
    "freelancer"); acronym-length keywords must be whole words, optionally
    pluralized with "s".
    """
    escaped = re.escape(keyword)
    if len(keyword) <= _ACRONYM_MAX_LEN:
        return rf"\b{escaped}s?\b"
    return rf"\b{escaped}"


def detect_contract_type(text: str) -> str:
    """Detect the type of contract based on keyword analysis.

    Each contract type scores one point per distinct keyword from
    ``_CONTRACT_KEYWORDS`` found in the lower-cased text. Keywords are matched
    at word starts rather than as arbitrary substrings, which avoids false
    hits such as "nda" inside "standard".

    Args:
        text: The full text of the contract document.

    Returns:
        The contract type with the highest score (NDA, Employment, Freelance
        or SaaS); ties go to the type listed first in ``_CONTRACT_KEYWORDS``.
        ``"Other"`` is returned when ``text`` is empty or no keyword matches.
    """
    if not text:
        return "Other"

    text_lower = text.lower()
    scores: dict[str, int] = {}

    for contract_type, keywords in _CONTRACT_KEYWORDS.items():
        # The ``re`` module caches compiled patterns, so rebuilding the
        # pattern source on each call stays cheap for this small table.
        score = sum(
            1 for kw in keywords if re.search(_keyword_regex(kw), text_lower)
        )
        if score:
            scores[contract_type] = score

    if not scores:
        return "Other"

    # max() returns the first maximal key in insertion order.
    return max(scores, key=scores.get)


def detect_headings(text: str) -> list[str]:
    """Detect section headings from a contract document.

    A line (with surrounding whitespace stripped) counts as a heading when it

    * starts with ``Article``/``Section`` (or the upper-case forms) followed
      by a number,
    * starts with a numbered marker followed by whitespace and a capital
      letter: ``1.`` / ``1)``, a decimal sub-section such as ``1.1``, or a
      roman numeral such as ``IV.`` (the same marker families the clause
      splitter in this module treats as section starts), or
    * consists only of capital letters and whitespace, is at least 11
      characters long and has at most six words.

    Args:
        text: The full text of the contract document.

    Returns:
        The detected heading lines, stripped, in document order.
    """
    if not text:
        return []

    # Compiled once per call instead of once per line. Lines are stripped
    # before matching, so no leading-whitespace allowance is needed.
    numbered_heading = re.compile(
        r"(?:Article|Section|SECTION|ARTICLE)\s+\d+"
        r"|\d+[\.\)]\s+[A-Z]"
        r"|\d+\.\d+\s+[A-Z]"
        r"|[IVX]+\.\s+[A-Z]"
    )
    all_caps_heading = re.compile(r"^[A-Z][A-Z\s]{10,}$")

    headings: list[str] = []
    for line in text.split("\n"):
        candidate = line.strip()
        if not candidate:
            continue

        if numbered_heading.match(candidate):
            headings.append(candidate)
        # The word cap keeps long shouted sentences from counting as headings.
        elif all_caps_heading.match(candidate) and len(candidate.split()) <= 6:
            headings.append(candidate)

    return headings