"""Clause processing utility functions."""
import re
from typing import List
def split_into_clauses(text: str) -> List[str]:
    """Break a contract document into its individual clauses.

    The text is first divided by ``_split_by_numbered_headings`` and every
    resulting section is then divided by ``_split_by_double_newlines``.
    Fragments with fewer than five whitespace-separated words are discarded.

    Args:
        text: The full text of the contract document.

    Returns:
        The surviving clauses in document order, each containing at least
        five words. Empty or whitespace-only input yields an empty list.
    """
    if not (text and text.strip()):
        return []

    clauses: List[str] = []
    for section in _split_by_numbered_headings(text):
        for fragment in _split_by_double_newlines(section):
            # A fragment with five or more words is necessarily non-blank,
            # so this single check also rejects empty pieces.
            if len(fragment.split()) >= 5:
                clauses.append(fragment)
    return clauses
def _split_by_numbered_headings(text: str) -> List[str]:
"""Split text by numbered section patterns and ALL CAPS headings."""
pattern = r"(?:(?<=\n)\s*(?:Article|Section|SECTION|ARTICLE)\s+\d+[\.:\s]|\n\s*(?:\d+[\.\)]\s*[A-Z]|\d+\.\d+\s+[A-Z]|[IVX]+\.\s+[A-Z])|\n\s*[A-Z][A-Z\s]{10,}\n)"
parts = re.split(pattern, text)
return [p.strip() for p in parts if p.strip()]
def _split_by_double_newlines(text: str) -> List[str]:
"""Split text by double newline breaks."""
parts = re.split(r"\n\s*\n", text)
return [p.strip() for p in parts if p.strip()]
def clean_text(text: str) -> str:
    """Clean and normalize text by removing excessive whitespace.

    Steps, in order:

    1. Convert CRLF and bare CR line endings to LF.
    2. Collapse every run of spaces and/or tabs into a single space.
    3. Remove spaces directly before or after a newline.
    4. Collapse three or more consecutive newlines into two.
    5. Strip leading and trailing whitespace from the result.

    Args:
        text: Raw text to clean.

    Returns:
        Cleaned and normalized text, or an empty string for falsy input.
    """
    if not text:
        return ""

    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Tabs are folded together with spaces so that mixed runs such as
    # " \t" cannot leave a double space behind.
    text = re.sub(r"[ \t]+", " ", text)
    # After the previous step at most one space can sit next to a newline.
    text = re.sub(r" ?\n ?", "\n", text)
    # Blank-line runs are collapsed last: lines that held only spaces or
    # tabs have just become empty and must be counted as blank lines too.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
_CONTRACT_KEYWORDS: dict[str, List[str]] = {
"NDA": ["non-disclosure", "confidential", "confidentiality", "trade secret", "nda", "non disclosure"],
"Employment": ["employment", "employee", "salary", "benefits", "at-will", "at will", "offer letter"],
"Freelance": ["freelance", "independent contractor", "consultant", "statement of work", "contractor"],
"SaaS": ["software as a service", "subscription", "saas", "service level agreement", "sla", "license"],
}
def detect_contract_type(text: str) -> str:
"""Detect the type of contract based on keyword analysis.
Args:
text: The full text of the contract document.
Returns:
Detected contract type string (NDA, Employment, Freelance, SaaS, or Other).
"""
if not text:
return "Other"
text_lower = text.lower()
scores: dict[str, int] = {}
for contract_type, keywords in _CONTRACT_KEYWORDS.items():
score = sum(1 for kw in keywords if kw in text_lower)
if score > 0:
scores[contract_type] = score
if not scores:
return "Other"
return max(scores, key=lambda k: scores[k])
def detect_headings(text: str) -> list[str]:
    """Detect section headings from a contract document.

    A line (after stripping surrounding whitespace) is reported as a heading
    when it:

    * starts with ``Article N`` / ``Section N`` (also upper-case);
    * starts with a numbering marker -- ``1.``, ``1)``, dotted numbers such
      as ``1.1`` or ``2.3.4.``, or a Roman numeral like ``IV.`` -- followed
      by whitespace and a capital letter; or
    * consists only of capital letters and whitespace, is at least eleven
      characters long and has at most six words.

    Args:
        text: The full text of the contract document.

    Returns:
        A list of detected heading strings, stripped, in document order.
    """
    if not text:
        return []

    # Compiled once, outside the per-line loop.
    keyword_heading = re.compile(r"(?:Article|Section|SECTION|ARTICLE)\s+\d+")
    numbered_heading = re.compile(
        r"(?:\d+[\.\)]|\d+(?:\.\d+)+\.?|[IVX]+\.)\s+[A-Z]"
    )
    caps_heading = re.compile(r"[A-Z][A-Z\s]{10,}")

    headings: list[str] = []
    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped:
            continue
        if keyword_heading.match(stripped) or numbered_heading.match(stripped):
            headings.append(stripped)
        # The word cap keeps long shouted sentences from counting as headings.
        elif caps_heading.fullmatch(stripped) and len(stripped.split()) <= 6:
            headings.append(stripped)
    return headings
|