Spaces:

gaurv007
/

ClauseGuard

Sleeping

App Files Files Community

ClauseGuard / compliance.py

gaurv007

v4.2: Update compliance.py

b16b7fa verified 12 days ago

raw

history blame

17.3 kB

	"""
	ClauseGuard — Compliance Checker v3.1
	═════════════════════════════════════
	FIXED in v3.1:
	• FIX: Expanded negation window from 100 to 200 chars to catch cross-sentence negation
	• FIX: Added sentence-boundary-aware negation detection
	• FIX: Improved context extraction with sentence boundaries
	• FIX: Added AMBIGUOUS handling for mixed positive/negative signals
	"""

	import re
	from collections import defaultdict

	# Negation patterns that invert compliance meaning
	_NEGATION_PATTERNS = [
	r"(?:does?\s+)?not\s+(?:require\|provide\|include\|offer\|grant\|guarantee\|ensure\|maintain\|comply\|adhere\|support\|acknowledge)",
	r"(?:no\|without)\s+(?:obligation\|requirement\|guarantee\|warranty\|commitment\|responsibility\|duty)",
	r"(?:exclud\|waiv\|disclaim\|exempt\|refus\|deny\|reject\|eliminat\|remov\|revok)",
	r"shall\s+not\s+be\s+(?:required\|obligated\|responsible\|liable\|bound\|subject)",
	r"is\s+not\s+(?:responsible\|liable\|required\|obligated\|bound\|subject)",
	r"expressly\s+(?:disclaim\|exclud\|waiv\|reject)",
	r"to\s+the\s+(?:maximum\|fullest)\s+extent\s+(?:permitted\|allowed).*(?:disclaim\|exclud\|waiv)",
	r"notwithstanding.*(?:shall\s+not\|does\s+not\|is\s+not)",
	]

	# FIX v4.2: Pre-compile negation patterns at module level
	_NEGATION_PATTERNS_COMPILED = [re.compile(p, re.IGNORECASE) for p in _NEGATION_PATTERNS]

	# Regulatory requirement definitions
	#
	# Layout: regulation name -> {"description", "requirements"}, where
	# "requirements" maps a requirement id to:
	#   "keywords":    phrases searched for (case-insensitively) in the contract
	#                  text by check_compliance,
	#   "description": human-readable summary shown in the rendered report,
	#   "severity":    one of "CRITICAL" / "HIGH" / "MEDIUM" / "LOW"; used as the
	#                  lookup key into RISK_STYLES when rendering.
	REGULATIONS = {
	# --- GDPR ---
	"GDPR": {
	"description": "EU General Data Protection Regulation (Regulation 2016/679)",
	"requirements": {
	"lawful_basis": {
	"keywords": ["lawful basis", "legal basis", "legitimate interest", "consent", "performance of contract", "legal obligation"],
	"description": "Must specify lawful basis for data processing (Art. 6)",
	"severity": "HIGH",
	},
	"data_subject_rights": {
	"keywords": ["right to access", "right to erasure", "right to be forgotten", "data portability", "rectification", "object to processing"],
	"description": "Must acknowledge data subject rights (Arts. 15-22)",
	"severity": "HIGH",
	},
	"data_breach_notification": {
	"keywords": ["data breach", "breach notification", "notify supervisory authority", "72 hours"],
	"description": "Must include data breach notification obligations (Art. 33)",
	"severity": "MEDIUM",
	},
	"data_protection_officer": {
	"keywords": ["data protection officer", "DPO"],
	"description": "Should reference Data Protection Officer if applicable (Art. 37)",
	"severity": "LOW",
	},
	"cross_border_transfer": {
	"keywords": ["standard contractual clauses", "SCCs", "adequacy decision", "transfer mechanism", "third country"],
	"description": "Must specify transfer safeguards for cross-border data (Arts. 44-49)",
	"severity": "HIGH",
	},
	"privacy_by_design": {
	"keywords": ["privacy by design", "privacy by default", "data minimization", "purpose limitation"],
	"description": "Should reference privacy-by-design principles (Art. 25)",
	"severity": "MEDIUM",
	},
	"data_processing_agreement": {
	"keywords": ["data processing agreement", "DPA", "data processor", "sub-processor"],
	"description": "Must include data processing agreement if sharing data (Art. 28)",
	"severity": "HIGH",
	},
	},
	},
	# --- CCPA ---
	"CCPA": {
	"description": "California Consumer Privacy Act (Cal. Civ. Code § 1798.100 et seq.)",
	"requirements": {
	"consumer_rights": {
	"keywords": ["right to know", "right to delete", "right to opt out", "right to non-discrimination", "consumer rights"],
	"description": "Must acknowledge California consumer rights",
	"severity": "HIGH",
	},
	"data_categories": {
	"keywords": ["categories of personal information", "personal information categories", "identifiers", "commercial information"],
	"description": "Must disclose categories of personal information collected",
	"severity": "HIGH",
	},
	"sale_of_data": {
	"keywords": ["do not sell my personal information", "opt-out of sale", "sale of personal information"],
	"description": "Must provide opt-out mechanism for data sales",
	"severity": "HIGH",
	},
	"service_providers": {
	"keywords": ["service provider", "third party", "contractor", "business purpose"],
	"description": "Should limit data use to business/service provider purposes",
	"severity": "MEDIUM",
	},
	},
	},
	# --- SOX ---
	"SOX": {
	"description": "Sarbanes-Oxley Act (US, 2002)",
	"requirements": {
	"internal_controls": {
	"keywords": ["internal controls", "internal control over financial reporting", "ICFR"],
	"description": "Must reference internal controls over financial reporting (§ 404)",
	"severity": "HIGH",
	},
	"audit_committee": {
	"keywords": ["audit committee", "independent auditor", "PCAOB"],
	"description": "Should reference audit committee oversight",
	"severity": "MEDIUM",
	},
	"whistleblower": {
	"keywords": ["whistleblower", "anonymous reporting", "reporting hotline", "retaliation"],
	"description": "Should protect whistleblower provisions (§ 806)",
	"severity": "HIGH",
	},
	"document_retention": {
	"keywords": ["document retention", "record retention", "retention policy", "preserve records"],
	"description": "Must include document retention obligations (§ 802)",
	"severity": "HIGH",
	},
	},
	},
	# --- HIPAA (the only regulation here with a CRITICAL requirement) ---
	"HIPAA": {
	"description": "Health Insurance Portability and Accountability Act (US, 1996)",
	"requirements": {
	"phi_protection": {
	"keywords": ["protected health information", "PHI", "health information", "ePHI"],
	"description": "Must protect PHI and limit uses/disclosures",
	"severity": "CRITICAL",
	},
	"business_associate": {
	"keywords": ["business associate agreement", "BAA", "business associate", "covered entity"],
	"description": "Should reference Business Associate Agreement (§ 164.504(e))",
	"severity": "HIGH",
	},
	"security_safeguards": {
	"keywords": ["administrative safeguards", "technical safeguards", "physical safeguards", "encryption", "access controls"],
	"description": "Must implement security safeguards (§ 164.308-312)",
	"severity": "HIGH",
	},
	"breach_notification": {
	"keywords": ["breach notification", "notification of breach", "unauthorized access"],
	"description": "Must include breach notification obligations (§ 164.400-414)",
	"severity": "HIGH",
	},
	},
	},
	# --- FINRA ---
	"FINRA": {
	"description": "Financial Industry Regulatory Authority (US)",
	"requirements": {
	"recordkeeping": {
	"keywords": ["recordkeeping", "books and records", "retain records", "SEC Rule 17a-4"],
	"description": "Must comply with recordkeeping rules (FINRA Rule 4511)",
	"severity": "HIGH",
	},
	"supervision": {
	"keywords": ["supervision", "supervisory system", "review and approval"],
	"description": "Should reference supervisory obligations (FINRA Rule 3110)",
	"severity": "MEDIUM",
	},
	"anti_money_laundering": {
	"keywords": ["anti-money laundering", "AML", "suspicious activity", "SAR", "OFAC"],
	"description": "Must reference AML compliance (FINRA Rule 3310)",
	"severity": "HIGH",
	},
	"privacy": {
	"keywords": ["privacy policy", "customer information", "Regulation S-P", "nonpublic personal information"],
	"description": "Must protect customer information (Regulation S-P)",
	"severity": "HIGH",
	},
	},
	},
	}

	# Severity -> (text colour, background colour) as CSS hex strings.
	# render_compliance_html unpacks each pair to style the severity badge of a
	# requirement row; the keys are the severity values used in REGULATIONS.
	RISK_STYLES = {
	"CRITICAL": ("#dc2626", "#fef2f2"),
	"HIGH": ("#ea580c", "#fff7ed"),
	"MEDIUM": ("#ca8a04", "#fefce8"),
	"LOW": ("#16a34a", "#f0fdf4"),
	}


	def _get_sentence_containing(text_lower, keyword_lower, start_idx):
	"""FIX v3.1: Extract the full sentence containing the keyword match."""
	# Find sentence boundaries around the match
	# Look backward for sentence start
	sent_start = start_idx
	for i in range(start_idx - 1, max(0, start_idx - 500), -1):
	if text_lower[i] in '.!?' and i < start_idx - 2:
	sent_start = i + 1
	break
	else:
	sent_start = max(0, start_idx - 500)

	# Look forward for sentence end
	sent_end = start_idx + len(keyword_lower)
	for i in range(sent_end, min(len(text_lower), sent_end + 500)):
	if text_lower[i] in '.!?':
	sent_end = i + 1
	break
	else:
	sent_end = min(len(text_lower), sent_end + 500)

	return text_lower[sent_start:sent_end].strip()


	def _check_negation(text_lower, keyword, window=200):
	    """FIX v3.1: Report whether the first occurrence of ``keyword`` is negated.

	    Two scopes are examined around the first position at which the
	    lower-cased keyword appears in ``text_lower``:

	    1. the enclosing sentence, tested against every compiled negation
	       pattern (higher confidence);
	    2. a fixed window of ``window`` characters on each side of the match,
	       tested against the first four compiled patterns only (lower
	       confidence, catches cross-sentence negation).

	    Returns:
	        True if any pattern matches in either scope; False otherwise, and
	        also False when the keyword does not occur at all.
	    """
	    needle = keyword.lower()
	    position = text_lower.find(needle)
	    if position == -1:
	        return False

	    # Sentence-level scope: all patterns apply.
	    sentence = _get_sentence_containing(text_lower, needle, position)
	    if any(pattern.search(sentence) for pattern in _NEGATION_PATTERNS_COMPILED):
	        return True

	    # Wider scope: only the strong patterns at the head of the list apply.
	    left = max(0, position - window)
	    right = min(len(text_lower), position + len(keyword) + window)
	    neighbourhood = text_lower[left:right]
	    strong_patterns = _NEGATION_PATTERNS_COMPILED[:4]
	    return any(pattern.search(neighbourhood) for pattern in strong_patterns)


	def _get_context(text, keyword, window=100):
	"""Extract context around a keyword match with sentence boundaries."""
	text_lower = text.lower()
	idx = text_lower.find(keyword.lower())
	if idx == -1:
	return ""
	start = max(0, idx - window)
	end = min(len(text), idx + len(keyword) + window)
	context = text[start:end].strip()
	if start > 0:
	context = "..." + context
	if end < len(text):
	context = context + "..."
	return context


	def _keyword_present(text_lower, keyword):
	    """Return True when ``keyword`` occurs in the already lower-cased text.

	    Multi-word phrases and ordinary words are matched as substrings, so
	    inflected forms such as "service providers" still count. Acronym-like
	    keywords (no whitespace and at least two capital letters, e.g. "SAR",
	    "AML", "PHI", "SCCs") must appear as a whole word, optionally followed by
	    a plural "s"; otherwise "necessary" would satisfy "SAR" and "seamless"
	    would satisfy "AML".
	    """
	    needle = keyword.lower()
	    if needle not in text_lower:
	        return False

	    looks_like_acronym = (
	        not any(ch.isspace() for ch in keyword)
	        and sum(1 for ch in keyword if ch.isupper()) >= 2
	    )
	    if not looks_like_acronym:
	        return True

	    whole_word = r"(?<!\w)" + re.escape(needle) + r"s?(?!\w)"
	    return re.search(whole_word, text_lower) is not None


	def check_compliance(text):
	    """Check contract text against all regulatory frameworks with negation handling.

	    For every requirement of every entry in ``REGULATIONS`` the text is
	    searched for the requirement's keywords. Each keyword that is present is
	    classified as negated or affirmed via ``_check_negation`` and a context
	    snippet is collected via ``_get_context``. Both helpers look at the first
	    raw occurrence of the keyword in the text.

	    Requirement status:
	        * ``"PASS"``      - at least one affirmed keyword, none negated
	        * ``"NEGATED"``   - at least one negated keyword, none affirmed
	        * ``"AMBIGUOUS"`` - both affirmed and negated keywords
	        * ``"MISSING"``   - no keyword present

	    Regulation status: ``"COMPLIANT"`` at a pass rate of 80% or more,
	    ``"PARTIAL"`` at 40% or more, otherwise ``"NON-COMPLIANT"``; overridden
	    to ``"WARNING"`` whenever a CRITICAL or HIGH requirement is NEGATED.

	    Args:
	        text: The contract text to analyse.

	    Returns:
	        Dict keyed by regulation name. Each value holds ``description``,
	        ``compliance_rate`` (integer percentage of PASS checks), ``checks``
	        (one dict per requirement with ``requirement``, ``description``,
	        ``severity``, ``status``, ``matched_keywords`` and up to two
	        ``context`` snippets), ``overall_status``, ``negated_count`` and
	        ``ambiguous_count``.
	    """
	    text_lower = text.lower()
	    results = {}

	    for reg_name, reg_data in REGULATIONS.items():
	        checks = []
	        for req_name, req_data in reg_data["requirements"].items():
	            matched = False
	            negated = False
	            matched_keywords = []
	            context_snippets = []

	            for kw in req_data["keywords"]:
	                if not _keyword_present(text_lower, kw):
	                    continue
	                matched_keywords.append(kw)
	                if _check_negation(text_lower, kw):
	                    negated = True
	                else:
	                    matched = True
	                # Keep context for negated hits too, so the report can show
	                # why a requirement was flagged.
	                ctx = _get_context(text, kw)
	                if ctx:
	                    context_snippets.append(ctx)

	            if matched and negated:
	                status = "AMBIGUOUS"
	            elif matched:
	                status = "PASS"
	            elif negated:
	                status = "NEGATED"
	            else:
	                status = "MISSING"

	            checks.append({
	                "requirement": req_name,
	                "description": req_data["description"],
	                "severity": req_data["severity"],
	                "status": status,
	                "matched_keywords": matched_keywords,
	                "context": context_snippets[:2],
	            })

	        statuses = [c["status"] for c in checks]
	        total = len(checks)
	        passed = statuses.count("PASS")
	        compliance_rate = round(passed / total * 100) if total > 0 else 0

	        if compliance_rate >= 80:
	            overall = "COMPLIANT"
	        elif compliance_rate >= 40:
	            overall = "PARTIAL"
	        else:
	            overall = "NON-COMPLIANT"

	        # An explicitly negated high-stakes requirement outranks the pass
	        # rate, whatever it is.
	        if any(
	            c["status"] == "NEGATED" and c["severity"] in ("CRITICAL", "HIGH")
	            for c in checks
	        ):
	            overall = "WARNING"

	        results[reg_name] = {
	            "description": reg_data["description"],
	            "compliance_rate": compliance_rate,
	            "checks": checks,
	            "overall_status": overall,
	            "negated_count": statuses.count("NEGATED"),
	            "ambiguous_count": statuses.count("AMBIGUOUS"),
	        }

	    return results


	def render_compliance_html(results):
	    """Render compliance results as HTML for Gradio.

	    Args:
	        results: Mapping of regulation name to a result dict with the keys
	            ``description``, ``compliance_rate``, ``overall_status`` and
	            ``checks`` (and optionally ``negated_count`` /
	            ``ambiguous_count``), i.e. the structure returned by
	            ``check_compliance``. Each check needs ``description``,
	            ``severity``, ``status`` and ``matched_keywords``; ``context``
	            is optional.

	    Returns:
	        A single HTML string: one card per regulation with a header (name,
	        description, pass rate, overall status, negated/ambiguous counters)
	        followed by one row per requirement check.
	    """
	    # Imported under its own name so it cannot clash with a local called
	    # ``html``; only ``escape`` is needed.
	    from html import escape

	    # Lookup tables are loop-invariant, so build them once.
	    status_colors = {
	        "COMPLIANT": ("#16a34a", "#f0fdf4"),
	        "PARTIAL": ("#ca8a04", "#fefce8"),
	        "NON-COMPLIANT": ("#dc2626", "#fef2f2"),
	        "WARNING": ("#ea580c", "#fff7ed"),
	    }
	    status_icons = {"PASS": "✅", "MISSING": "❌", "NEGATED": "🚫", "AMBIGUOUS": "❓"}
	    status_text_map = {"PASS": "Found", "MISSING": "Missing", "NEGATED": "Negated", "AMBIGUOUS": "Ambiguous"}
	    neutral_colors = ("#6b7280", "#f9fafb")

	    # Collect fragments and join once instead of repeated string +=.
	    parts = ['<div style="font-family:system-ui,sans-serif;">']

	    for reg_name, reg_result in results.items():
	        rate = reg_result["compliance_rate"]
	        status = reg_result["overall_status"]
	        status_color, status_bg = status_colors.get(status, neutral_colors)

	        neg = reg_result.get("negated_count", 0)
	        amb = reg_result.get("ambiguous_count", 0)
	        warnings = ""
	        if neg > 0:
	            warnings += f'<span style="font-size:10px;color:#ea580c;margin-left:8px;">⚠️ {neg} negated</span>'
	        if amb > 0:
	            warnings += f'<span style="font-size:10px;color:#ca8a04;margin-left:8px;">❓ {amb} ambiguous</span>'

	        parts.append(f'''
	        <div style="border:1px solid #e5e7eb;border-radius:10px;margin-bottom:16px;overflow:hidden;">
	        <div style="display:flex;justify-content:space-between;align-items:center;padding:12px 16px;background:{status_bg};border-bottom:1px solid #e5e7eb;">
	        <div>
	        <span style="font-size:16px;font-weight:700;color:#1f2937;">{reg_name}</span>
	        {warnings}
	        <p style="font-size:11px;color:#6b7280;margin:2px 0 0 0;">{reg_result["description"]}</p>
	        </div>
	        <div style="text-align:right;">
	        <div style="font-size:24px;font-weight:700;color:{status_color};">{rate}%</div>
	        <div style="font-size:11px;color:{status_color};font-weight:500;">{status}</div>
	        </div>
	        </div>
	        <div style="padding:8px 16px;">
	        ''')

	        for check in reg_result["checks"]:
	            # Unknown severities fall back to neutral grey rather than
	            # aborting the whole report.
	            color, bg = RISK_STYLES.get(check["severity"], neutral_colors)
	            status_icon = status_icons.get(check["status"], "❓")
	            status_text = status_text_map.get(check["status"], "Unknown")
	            keywords = ", ".join(check["matched_keywords"][:3]) if check["matched_keywords"] else "—"

	            context_html = ""
	            if check.get("context"):
	                # The snippet is raw contract text (untrusted input):
	                # truncate first, then escape &, <, > and quotes so it
	                # cannot inject markup into the page.
	                ctx = escape(check["context"][0][:120])
	                context_html = f'<div style="font-size:10px;color:#6b7280;margin-top:2px;font-style:italic;">"{ctx}"</div>'

	            parts.append(f'''
	            <div style="display:flex;justify-content:space-between;align-items:flex-start;padding:8px 0;border-bottom:1px solid #f3f4f6;">
	            <div style="flex:1;">
	            <div style="font-size:12px;font-weight:500;color:#374151;">{check["description"]}</div>
	            <div style="font-size:10px;color:#9ca3af;margin-top:2px;">Keywords: {keywords}</div>
	            {context_html}
	            </div>
	            <div style="display:flex;align-items:center;gap:6px;margin-left:8px;">
	            <span style="font-size:10px;color:{color};font-weight:600;background:{bg};padding:2px 8px;border-radius:4px;">{check["severity"]}</span>
	            <span style="font-size:13px;" title="{status_text}">{status_icon}</span>
	            </div>
	            </div>
	            ''')

	        # Close the checks container and the regulation card.
	        parts.append('</div></div>')

	    parts.append('</div>')
	    return "".join(parts)