ClauseGuard-AI / clauseguard /tools /clause_tools.py
muhammadbinmurtza
Restructure: clauseguard as package subfolder, app_file: clauseguard/app.py
913a064
"""Clause processing utility functions."""
import re
from typing import List
def split_into_clauses(text: str) -> List[str]:
    """Break a contract document into a flat list of clause strings.

    The text is first cut into sections by ``_split_by_numbered_headings``;
    each section is then cut again by ``_split_by_double_newlines``. Any
    resulting fragment with fewer than five whitespace-separated words is
    dropped.

    Args:
        text: The full text of the contract document.

    Returns:
        The surviving fragments in document order. An empty list is
        returned when ``text`` is empty or contains only whitespace.
    """
    if not (text and text.strip()):
        return []

    return [
        fragment
        for section in _split_by_numbered_headings(text)
        for fragment in _split_by_double_newlines(section)
        # Keep only non-blank fragments of at least five words.
        if fragment.strip() and len(fragment.split()) >= 5
    ]
def _split_by_numbered_headings(text: str) -> List[str]:
"""Split text by numbered section patterns and ALL CAPS headings."""
pattern = r"(?:(?<=\n)\s*(?:Article|Section|SECTION|ARTICLE)\s+\d+[\.:\s]|\n\s*(?:\d+[\.\)]\s*[A-Z]|\d+\.\d+\s+[A-Z]|[IVX]+\.\s+[A-Z])|\n\s*[A-Z][A-Z\s]{10,}\n)"
parts = re.split(pattern, text)
return [p.strip() for p in parts if p.strip()]
def _split_by_double_newlines(text: str) -> List[str]:
"""Split text by double newline breaks."""
parts = re.split(r"\n\s*\n", text)
return [p.strip() for p in parts if p.strip()]
def clean_text(text: str) -> str:
    """Clean and normalize text by removing excessive whitespace.

    Line endings are normalized to ``\\n``, every run of spaces and/or tabs
    becomes a single space, spaces touching a newline are removed, and three
    or more consecutive newlines are reduced to two.

    Args:
        text: Raw text to clean.

    Returns:
        The normalized text with leading and trailing whitespace stripped,
        or an empty string when ``text`` is empty.
    """
    if not text:
        return ""

    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Collapse spaces and tabs together so a tab next to a space cannot
    # leave a multi-space run behind.
    text = re.sub(r"[ \t]+", " ", text)
    # After the step above at most one space can sit on either side of a
    # newline; remove it so whitespace-only lines become truly empty.
    text = re.sub(r" ?\n ?", "\n", text)
    # Collapse blank-line runs last, once whitespace-only lines are empty.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
# Lower-case keywords/phrases that count as evidence for each contract type.
# detect_contract_type awards one point per keyword found in the document.
_CONTRACT_KEYWORDS: dict[str, List[str]] = {
    "NDA": ["non-disclosure", "confidential", "confidentiality", "trade secret", "nda", "non disclosure"],
    "Employment": ["employment", "employee", "salary", "benefits", "at-will", "at will", "offer letter"],
    "Freelance": ["freelance", "independent contractor", "consultant", "statement of work", "contractor"],
    "SaaS": ["software as a service", "subscription", "saas", "service level agreement", "sla", "license"],
}


def detect_contract_type(text: str) -> str:
    """Detect the type of contract based on keyword analysis.

    Each contract type in ``_CONTRACT_KEYWORDS`` scores one point for every
    one of its keywords that occurs in the lower-cased text as a whole word
    or phrase (a trailing ``s``/``es`` plural is accepted). Whole-word
    matching prevents short keywords from firing inside unrelated words,
    e.g. ``nda`` inside "standard" or ``sla`` inside "legislation".

    Args:
        text: The full text of the contract document.

    Returns:
        The highest-scoring contract type (NDA, Employment, Freelance or
        SaaS), or "Other" when ``text`` is empty or no keyword matches.
        Ties go to the type listed first in ``_CONTRACT_KEYWORDS``.
    """
    if not text:
        return "Other"

    text_lower = text.lower()
    scores: dict[str, int] = {}
    for contract_type, keywords in _CONTRACT_KEYWORDS.items():
        score = sum(
            1
            for kw in keywords
            # \b on both sides anchors the keyword to word boundaries;
            # (?:e?s)? still lets "employees" or "contractors" count.
            if re.search(rf"\b{re.escape(kw)}(?:e?s)?\b", text_lower)
        )
        if score > 0:
            scores[contract_type] = score

    if not scores:
        return "Other"
    return max(scores, key=lambda k: scores[k])
def detect_headings(text: str) -> list[str]:
    """Collect the lines of a contract that look like section headings.

    A line (after stripping surrounding whitespace) counts as a heading when
    it starts with "Article"/"Section" followed by a number, starts with a
    number followed by "." or ")" and a capitalised word, or consists only
    of capitals and whitespace (at least 11 characters, at most 6 words).

    Args:
        text: The full text of the contract document.

    Returns:
        The stripped heading lines in document order; an empty list when
        ``text`` is empty.
    """
    if not text:
        return []

    def _looks_like_heading(candidate: str) -> bool:
        # Keyword-style header such as "Section 4" or "ARTICLE 2".
        if re.match(r"^\s*(?:Article|Section|SECTION|ARTICLE)\s+\d+", candidate):
            return True
        # Numbered header such as "3. Payment" or "3) Payment".
        if re.match(r"^\s*\d+[\.\)]\s+[A-Z]", candidate):
            return True
        # Short ALL CAPS line; the word cap filters out shouted sentences.
        all_caps = re.match(r"^[A-Z][A-Z\s]{10,}$", candidate) is not None
        return all_caps and len(candidate.split()) <= 6

    trimmed_lines = (raw_line.strip() for raw_line in text.split("\n"))
    return [entry for entry in trimmed_lines if entry and _looks_like_heading(entry)]