"""Clause processing utility functions."""

import re
from typing import List

# Separator used to cut a document into sections. Three alternatives:
#   1. "Article 1." / "Section 2:" style markers at the start of a line.
#   2. Numbered markers after a newline: "1. X", "1) X", "1.1 X", "IV. X".
#      The capital letter that follows is only *looked at* (lookahead), never
#      consumed, so the clause text keeps its first character.
#   3. A line made of capitals/whitespace (11+ characters) — an ALL CAPS
#      heading; the whole heading line is consumed.
_HEADING_SPLIT_RE = re.compile(
    r"(?:"
    r"(?<=\n)\s*(?:Article|Section|SECTION|ARTICLE)\s+\d+[\.:\s]"
    r"|\n\s*(?:\d+[\.\)]\s*(?=[A-Z])|\d+\.\d+\s+(?=[A-Z])|[IVX]+\.\s+(?=[A-Z]))"
    r"|\n\s*[A-Z][A-Z\s]{10,}\n"
    r")"
)
_BLANK_LINE_RE = re.compile(r"\n\s*\n")

# Patterns used by detect_headings (applied to already-stripped lines).
_SECTION_HEADING_RE = re.compile(r"(?:Article|Section|SECTION|ARTICLE)\s+\d+")
_NUMBERED_HEADING_RE = re.compile(r"\d+[\.\)]\s+[A-Z]")
_CAPS_HEADING_RE = re.compile(r"[A-Z][A-Z\s]{10,}")

# Clauses with fewer words than this are discarded as noise.
_MIN_CLAUSE_WORDS = 5
# Keywords this short are treated as acronyms and must match as whole words.
_ACRONYM_MAX_LEN = 4


def split_into_clauses(text: str) -> List[str]:
    """Split a contract document into individual clauses.

    Splits on: numbered patterns (1., 1.1, Article 1, Section 1, etc.),
    ALL CAPS headings, and double newline breaks. The heading markers
    themselves are removed from the resulting clause text.

    Args:
        text: The full text of the contract document.

    Returns:
        A list of clause strings, each containing at least five
        whitespace-separated words.
    """
    if not text or not text.strip():
        return []

    clauses: List[str] = []
    for section in _split_by_numbered_headings(text):
        # The helper already drops empty/whitespace-only pieces.
        clauses.extend(_split_by_double_newlines(section))

    return [c for c in clauses if len(c.split()) >= _MIN_CLAUSE_WORDS]


def _split_by_numbered_headings(text: str) -> List[str]:
    """Split text by numbered section patterns and ALL CAPS headings.

    The matched marker (e.g. ``"1. "`` or ``"Article 2."``) is dropped, but
    the first letter of the clause that follows it is preserved.
    """
    parts = _HEADING_SPLIT_RE.split(text)
    return [p.strip() for p in parts if p.strip()]


def _split_by_double_newlines(text: str) -> List[str]:
    """Split text by double newline breaks."""
    parts = _BLANK_LINE_RE.split(text)
    return [p.strip() for p in parts if p.strip()]


def clean_text(text: str) -> str:
    """Clean and normalize text by removing excessive whitespace.

    Line endings are normalized to ``\\n``, runs of spaces/tabs collapse to a
    single space, spaces adjacent to a newline are removed, and three or more
    consecutive newlines collapse to two.

    Args:
        text: Raw text to clean.

    Returns:
        Cleaned and normalized text ("" for empty input).
    """
    if not text:
        return ""

    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Order matters: tabs must become spaces *before* space runs are
    # collapsed, and spaces around newlines must be removed *before* blank
    # line runs are collapsed, otherwise leftovers survive the pass.
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r" ?\n ?", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


_CONTRACT_KEYWORDS: dict[str, List[str]] = {
    "NDA": [
        "non-disclosure",
        "confidential",
        "confidentiality",
        "trade secret",
        "nda",
        "non disclosure",
    ],
    "Employment": [
        "employment",
        "employee",
        "salary",
        "benefits",
        "at-will",
        "at will",
        "offer letter",
    ],
    "Freelance": [
        "freelance",
        "independent contractor",
        "consultant",
        "statement of work",
        "contractor",
    ],
    "SaaS": [
        "software as a service",
        "subscription",
        "saas",
        "service level agreement",
        "sla",
        "license",
    ],
}


def _keyword_regex(keyword: str) -> str:
    """Return the regex source used to look for *keyword* in lowercased text.

    Every keyword must start at a word boundary. Short acronyms ("nda",
    "sla", "saas") must additionally be whole words (optionally plural), so
    they do not fire inside words such as "standard" or "translate". Longer
    keywords match as prefixes so plural/derived forms ("employees",
    "freelancer") still count.
    """
    escaped = re.escape(keyword)
    if len(keyword) <= _ACRONYM_MAX_LEN:
        return rf"\b{escaped}s?\b"
    return rf"\b{escaped}"


def detect_contract_type(text: str) -> str:
    """Detect the type of contract based on keyword analysis.

    Each contract type scores one point per distinct keyword found in the
    text (case-insensitive, anchored at word boundaries). The type with the
    highest score wins; ties go to the type listed first in
    ``_CONTRACT_KEYWORDS``.

    Args:
        text: The full text of the contract document.

    Returns:
        Detected contract type string (NDA, Employment, Freelance, SaaS,
        or Other).
    """
    if not text:
        return "Other"

    text_lower = text.lower()
    scores: dict[str, int] = {}
    for contract_type, keywords in _CONTRACT_KEYWORDS.items():
        # re caches compiled patterns, so building the source per call is cheap.
        score = sum(
            1 for kw in keywords if re.search(_keyword_regex(kw), text_lower)
        )
        if score > 0:
            scores[contract_type] = score

    if not scores:
        return "Other"
    return max(scores, key=lambda k: scores[k])


def detect_headings(text: str) -> list[str]:
    """Detect section headings from a contract document.

    Identifies ALL CAPS lines and numbered section headers.

    Args:
        text: The full text of the contract document.

    Returns:
        A list of detected heading strings (whitespace-stripped lines), in
        document order.
    """
    if not text:
        return []

    headings: list[str] = []
    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped:
            continue

        is_heading = (
            _SECTION_HEADING_RE.match(stripped) is not None
            or _NUMBERED_HEADING_RE.match(stripped) is not None
            # ALL CAPS lines count only when short (at most six words), so
            # shouted body paragraphs are not mistaken for headings.
            or (
                _CAPS_HEADING_RE.fullmatch(stripped) is not None
                and len(stripped.split()) <= 6
            )
        )
        if is_heading:
            headings.append(stripped)

    return headings