Spaces:

lablab-ai-amd-developer-hackathon
/

ClauseGuard-AI

Sleeping

ClauseGuard-AI / clauseguard /tools /clause_tools.py

muhammadbinmurtza

Restructure: clauseguard as package subfolder, app_file: clauseguard/app.py

913a064 14 days ago

3.91 kB

	"""Clause processing utility functions."""

	import re
	from typing import List


	def split_into_clauses(text: str, min_words: int = 5) -> List[str]:
	    """Split a contract document into individual clauses.

	    The text is first cut at section markers (``Article 1`` / ``Section 1``
	    labels, ``1.`` / ``1)`` / ``1.1`` / roman-numeral numbering, and ALL CAPS
	    heading lines) and every resulting section is then cut at blank lines.
	    The markers themselves act as delimiters and are not part of the
	    returned clauses.

	    Args:
	        text: The full text of the contract document.
	        min_words: Minimum number of whitespace-separated words a fragment
	            must contain to be kept. Shorter fragments (stray labels,
	            page numbers, ...) are discarded.

	    Returns:
	        A list of stripped clause strings in document order. Empty or
	        whitespace-only input yields an empty list.
	    """
	    if not text or not text.strip():
	        return []

	    clauses: List[str] = []
	    for section in _split_by_numbered_headings(text):
	        # The helper already drops empty fragments, so no extra filtering
	        # is needed before the word-count check below.
	        clauses.extend(_split_by_double_newlines(section))

	    return [clause for clause in clauses if len(clause.split()) >= min_words]


	def _split_by_numbered_headings(text: str) -> List[str]:
	    """Split text at numbered section markers and ALL CAPS heading lines."""
	    # Every alternative is anchored to the start of a line (MULTILINE "^"),
	    # so a marker on the very first line of the document is recognised too.
	    # The capital letter that opens a numbered clause is only *looked ahead*
	    # at: consuming it would make re.split() eat the first letter of the
	    # clause text.
	    pattern = (
	        r"^[ \t]*(?:"
	        r"(?:Article|Section|SECTION|ARTICLE)\s+\d+[.:\s]"
	        r"|\d+[.)]\s+(?=[A-Z])"
	        r"|\d+\.\d+\s+(?=[A-Z])"
	        r"|[IVX]+\.\s+(?=[A-Z])"
	        r"|[A-Z][A-Z \t]{10,}$"
	        r")"
	    )
	    parts = re.split(pattern, text, flags=re.MULTILINE)
	    return [part.strip() for part in parts if part.strip()]


	def _split_by_double_newlines(text: str) -> List[str]:
	    """Split text at blank lines (two newlines separated only by whitespace)."""
	    parts = re.split(r"\n\s*\n", text)
	    return [part.strip() for part in parts if part.strip()]


	def clean_text(text: str) -> str:
	    """Clean and normalize text by removing excessive whitespace.

	    Line endings are normalized to ``\\n``, tabs and runs of spaces are
	    collapsed to a single space, spaces next to a line break are removed,
	    and runs of blank lines are reduced to exactly one blank line.

	    Args:
	        text: Raw text to clean.

	    Returns:
	        Cleaned and normalized text; an empty string for empty input.
	    """
	    if not text:
	        return ""

	    text = text.replace("\r\n", "\n").replace("\r", "\n")
	    # Tabs must be folded together with spaces in one pass; replacing them
	    # after the spaces were collapsed would re-introduce runs of spaces.
	    text = re.sub(r"[ \t]+", " ", text)
	    # Strip the (now at most one) space on either side of each line break
	    # *before* collapsing blank lines, so whitespace-only lines count as
	    # blank.
	    text = re.sub(r" ?\n ?", "\n", text)
	    text = re.sub(r"\n{3,}", "\n\n", text)
	    return text.strip()


	# Lower-case keywords whose presence counts as one point for a contract type.
	# Dictionary order doubles as tie-break order in detect_contract_type().
	_CONTRACT_KEYWORDS: dict[str, List[str]] = {
	    "NDA": ["non-disclosure", "confidential", "confidentiality", "trade secret", "nda", "non disclosure"],
	    "Employment": ["employment", "employee", "salary", "benefits", "at-will", "at will", "offer letter"],
	    "Freelance": ["freelance", "independent contractor", "consultant", "statement of work", "contractor"],
	    "SaaS": ["software as a service", "subscription", "saas", "service level agreement", "sla", "license"],
	}

	# Purely alphabetic keywords up to this length are treated as acronyms
	# ("nda", "sla", "saas") and must match as whole words.
	_ACRONYM_MAX_LENGTH = 4


	def _keyword_in_text(keyword: str, text_lower: str) -> bool:
	    """Return True if *keyword* occurs in *text_lower* at a word boundary."""
	    # A leading boundary stops matches that start inside another word
	    # ("at will" inside "that will"). Suffixes stay allowed for ordinary
	    # keywords so "employees" or "licensed" still count.
	    pattern = r"\b" + re.escape(keyword)
	    if len(keyword) <= _ACRONYM_MAX_LENGTH and keyword.isalpha():
	        # Acronyms are too short for prefix matching ("sla" in "slack");
	        # require a whole word, tolerating a plural "s" ("NDAs").
	        pattern += r"s?\b"
	    return re.search(pattern, text_lower) is not None


	def detect_contract_type(text: str) -> str:
	    """Detect the type of contract based on keyword analysis.

	    Each contract type scores one point per distinct keyword found in the
	    text (case-insensitive). The type with the highest score wins; on a tie
	    the type listed first in ``_CONTRACT_KEYWORDS`` is returned.

	    Args:
	        text: The full text of the contract document.

	    Returns:
	        Detected contract type string (NDA, Employment, Freelance, SaaS, or
	        Other). "Other" is returned for empty input or when no keyword
	        matches.
	    """
	    if not text:
	        return "Other"

	    text_lower = text.lower()
	    scores: dict[str, int] = {}

	    for contract_type, keywords in _CONTRACT_KEYWORDS.items():
	        score = sum(1 for kw in keywords if _keyword_in_text(kw, text_lower))
	        if score > 0:
	            scores[contract_type] = score

	    if not scores:
	        return "Other"

	    # max() returns the first maximal key, preserving table order on ties.
	    return max(scores, key=scores.__getitem__)


	def detect_headings(text: str) -> list[str]:
	    """Detect section headings from a contract document.

	    A line (with surrounding whitespace removed) is reported as a heading
	    when it

	    * starts with ``Article``/``Section``/``SECTION``/``ARTICLE`` followed
	      by a number,
	    * starts with a number followed by ``.`` or ``)`` and a capitalized
	      word, or
	    * consists only of capital letters and whitespace, is at least 11
	      characters long and has at most 6 words.

	    Args:
	        text: The full text of the contract document.

	    Returns:
	        A list of detected heading strings in document order.
	    """
	    if not text:
	        return []

	    labelled = re.compile(r"(?:Article|Section|SECTION|ARTICLE)\s+\d+")
	    numbered = re.compile(r"\d+[.)]\s+[A-Z]")
	    all_caps = re.compile(r"[A-Z][A-Z\s]{10,}")

	    headings: list[str] = []
	    for line in text.split("\n"):
	        stripped = line.strip()
	        if not stripped:
	            continue

	        if labelled.match(stripped) or numbered.match(stripped):
	            headings.append(stripped)
	        # The word limit keeps long shouted sentences (e.g. disclaimers
	        # written in capitals) from being reported as headings.
	        elif all_caps.fullmatch(stripped) and len(stripped.split()) <= 6:
	            headings.append(stripped)

	    return headings