ClauseGuard / compliance.py
gaurv007's picture
v4.2: Update compliance.py
b16b7fa verified
raw
history blame
17.3 kB
"""
ClauseGuard — Compliance Checker v3.1
═════════════════════════════════════
FIXED in v3.1:
‒ FIX: Expanded negation window from 100 to 200 chars to catch cross-sentence negation
‒ FIX: Added sentence-boundary-aware negation detection
‒ FIX: Improved context extraction with sentence boundaries
‒ FIX: Added AMBIGUOUS handling for mixed positive/negative signals
"""
import re
from collections import defaultdict
# Negation patterns that invert compliance meaning.
# Each entry is a regex searched (case-insensitively) in the text surrounding a
# keyword hit; any match means the clause is treated as negating the obligation.
# ORDER MATTERS: _check_negation applies only the first four entries to its
# wider cross-sentence window, so the "strong" patterns must stay at the top.
_NEGATION_PATTERNS = [
# "not require", "does not provide", "do not guarantee", ...
r"(?:does?\s+)?not\s+(?:require|provide|include|offer|grant|guarantee|ensure|maintain|comply|adhere|support|acknowledge)",
# "no obligation", "without warranty", ...
r"(?:no|without)\s+(?:obligation|requirement|guarantee|warranty|commitment|responsibility|duty)",
# Bare word stems (exclude/waive/disclaim/...); unanchored, so they also match
# inside longer words such as "exclusion" or "waiver".
r"(?:exclud|waiv|disclaim|exempt|refus|deny|reject|eliminat|remov|revok)",
# "shall not be required / liable / bound ..."
r"shall\s+not\s+be\s+(?:required|obligated|responsible|liable|bound|subject)",
# "is not responsible / liable ..."
r"is\s+not\s+(?:responsible|liable|required|obligated|bound|subject)",
# "expressly disclaims / excludes / waives / rejects"
r"expressly\s+(?:disclaim|exclud|waiv|reject)",
# "to the maximum/fullest extent permitted ... disclaim" (".*" stays on one
# line because the patterns are compiled without re.DOTALL)
r"to\s+the\s+(?:maximum|fullest)\s+extent\s+(?:permitted|allowed).*(?:disclaim|exclud|waiv)",
# "notwithstanding ... shall not / does not / is not"
r"notwithstanding.*(?:shall\s+not|does\s+not|is\s+not)",
]
# FIX v4.2: Pre-compile negation patterns at module level
# (same order as _NEGATION_PATTERNS, IGNORECASE only).
_NEGATION_PATTERNS_COMPILED = [re.compile(p, re.IGNORECASE) for p in _NEGATION_PATTERNS]
# Regulatory requirement definitions.
#
# Shape: {regulation name: {"description": str,
#                           "requirements": {requirement id: {
#                               "keywords": [phrases searched for in the contract],
#                               "description": human-readable requirement text,
#                               "severity": "CRITICAL" | "HIGH" | "MEDIUM" | "LOW"}}}}
#
# The section sign in the citations is the real "§" character; earlier text
# carried a mis-decoded two-character sequence that leaked into the rendered UI.
REGULATIONS = {
    "GDPR": {
        "description": "EU General Data Protection Regulation (Regulation 2016/679)",
        "requirements": {
            "lawful_basis": {
                "keywords": ["lawful basis", "legal basis", "legitimate interest", "consent", "performance of contract", "legal obligation"],
                "description": "Must specify lawful basis for data processing (Art. 6)",
                "severity": "HIGH",
            },
            "data_subject_rights": {
                "keywords": ["right to access", "right to erasure", "right to be forgotten", "data portability", "rectification", "object to processing"],
                "description": "Must acknowledge data subject rights (Arts. 15-22)",
                "severity": "HIGH",
            },
            "data_breach_notification": {
                "keywords": ["data breach", "breach notification", "notify supervisory authority", "72 hours"],
                "description": "Must include data breach notification obligations (Art. 33)",
                "severity": "MEDIUM",
            },
            "data_protection_officer": {
                "keywords": ["data protection officer", "DPO"],
                "description": "Should reference Data Protection Officer if applicable (Art. 37)",
                "severity": "LOW",
            },
            "cross_border_transfer": {
                "keywords": ["standard contractual clauses", "SCCs", "adequacy decision", "transfer mechanism", "third country"],
                "description": "Must specify transfer safeguards for cross-border data (Arts. 44-49)",
                "severity": "HIGH",
            },
            "privacy_by_design": {
                "keywords": ["privacy by design", "privacy by default", "data minimization", "purpose limitation"],
                "description": "Should reference privacy-by-design principles (Art. 25)",
                "severity": "MEDIUM",
            },
            "data_processing_agreement": {
                "keywords": ["data processing agreement", "DPA", "data processor", "sub-processor"],
                "description": "Must include data processing agreement if sharing data (Art. 28)",
                "severity": "HIGH",
            },
        },
    },
    "CCPA": {
        "description": "California Consumer Privacy Act (Cal. Civ. Code § 1798.100 et seq.)",
        "requirements": {
            "consumer_rights": {
                "keywords": ["right to know", "right to delete", "right to opt out", "right to non-discrimination", "consumer rights"],
                "description": "Must acknowledge California consumer rights",
                "severity": "HIGH",
            },
            "data_categories": {
                "keywords": ["categories of personal information", "personal information categories", "identifiers", "commercial information"],
                "description": "Must disclose categories of personal information collected",
                "severity": "HIGH",
            },
            "sale_of_data": {
                "keywords": ["do not sell my personal information", "opt-out of sale", "sale of personal information"],
                "description": "Must provide opt-out mechanism for data sales",
                "severity": "HIGH",
            },
            "service_providers": {
                "keywords": ["service provider", "third party", "contractor", "business purpose"],
                "description": "Should limit data use to business/service provider purposes",
                "severity": "MEDIUM",
            },
        },
    },
    "SOX": {
        "description": "Sarbanes-Oxley Act (US, 2002)",
        "requirements": {
            "internal_controls": {
                "keywords": ["internal controls", "internal control over financial reporting", "ICFR"],
                "description": "Must reference internal controls over financial reporting (§ 404)",
                "severity": "HIGH",
            },
            "audit_committee": {
                "keywords": ["audit committee", "independent auditor", "PCAOB"],
                "description": "Should reference audit committee oversight",
                "severity": "MEDIUM",
            },
            "whistleblower": {
                "keywords": ["whistleblower", "anonymous reporting", "reporting hotline", "retaliation"],
                "description": "Should protect whistleblower provisions (§ 806)",
                "severity": "HIGH",
            },
            "document_retention": {
                "keywords": ["document retention", "record retention", "retention policy", "preserve records"],
                "description": "Must include document retention obligations (§ 802)",
                "severity": "HIGH",
            },
        },
    },
    "HIPAA": {
        "description": "Health Insurance Portability and Accountability Act (US, 1996)",
        "requirements": {
            "phi_protection": {
                "keywords": ["protected health information", "PHI", "health information", "ePHI"],
                "description": "Must protect PHI and limit uses/disclosures",
                "severity": "CRITICAL",
            },
            "business_associate": {
                "keywords": ["business associate agreement", "BAA", "business associate", "covered entity"],
                "description": "Should reference Business Associate Agreement (§ 164.504(e))",
                "severity": "HIGH",
            },
            "security_safeguards": {
                "keywords": ["administrative safeguards", "technical safeguards", "physical safeguards", "encryption", "access controls"],
                "description": "Must implement security safeguards (§ 164.308-312)",
                "severity": "HIGH",
            },
            "breach_notification": {
                "keywords": ["breach notification", "notification of breach", "unauthorized access"],
                "description": "Must include breach notification obligations (§ 164.400-414)",
                "severity": "HIGH",
            },
        },
    },
    "FINRA": {
        "description": "Financial Industry Regulatory Authority (US)",
        "requirements": {
            "recordkeeping": {
                "keywords": ["recordkeeping", "books and records", "retain records", "SEC Rule 17a-4"],
                "description": "Must comply with recordkeeping rules (FINRA Rule 4511)",
                "severity": "HIGH",
            },
            "supervision": {
                "keywords": ["supervision", "supervisory system", "review and approval"],
                "description": "Should reference supervisory obligations (FINRA Rule 3110)",
                "severity": "MEDIUM",
            },
            "anti_money_laundering": {
                "keywords": ["anti-money laundering", "AML", "suspicious activity", "SAR", "OFAC"],
                "description": "Must reference AML compliance (FINRA Rule 3310)",
                "severity": "HIGH",
            },
            "privacy": {
                "keywords": ["privacy policy", "customer information", "Regulation S-P", "nonpublic personal information"],
                "description": "Must protect customer information (Regulation S-P)",
                "severity": "HIGH",
            },
        },
    },
}
# Severity badge colours as (text colour, background colour) hex pairs,
# looked up by a requirement's "severity" value when rendering HTML.
RISK_STYLES = dict(
    CRITICAL=("#dc2626", "#fef2f2"),
    HIGH=("#ea580c", "#fff7ed"),
    MEDIUM=("#ca8a04", "#fefce8"),
    LOW=("#16a34a", "#f0fdf4"),
)
def _get_sentence_containing(text_lower, keyword_lower, start_idx):
"""FIX v3.1: Extract the full sentence containing the keyword match."""
# Find sentence boundaries around the match
# Look backward for sentence start
sent_start = start_idx
for i in range(start_idx - 1, max(0, start_idx - 500), -1):
if text_lower[i] in '.!?' and i < start_idx - 2:
sent_start = i + 1
break
else:
sent_start = max(0, start_idx - 500)
# Look forward for sentence end
sent_end = start_idx + len(keyword_lower)
for i in range(sent_end, min(len(text_lower), sent_end + 500)):
if text_lower[i] in '.!?':
sent_end = i + 1
break
else:
sent_end = min(len(text_lower), sent_end + 500)
return text_lower[sent_start:sent_end].strip()
def _check_negation(text_lower, keyword, window=200):
"""FIX v3.1: Check if a keyword match is negated β€” uses sentence-aware window."""
idx = text_lower.find(keyword.lower())
if idx == -1:
return False
# Get sentence-aware context (more accurate than fixed window)
sentence = _get_sentence_containing(text_lower, keyword.lower(), idx)
# Also get a wider window for cross-sentence negation
start = max(0, idx - window)
end = min(len(text_lower), idx + len(keyword) + window)
wider_context = text_lower[start:end]
# Check sentence first (higher confidence)
for neg_pat in _NEGATION_PATTERNS_COMPILED:
if neg_pat.search(sentence):
return True
# Then check wider window (lower confidence, still relevant)
for neg_pat in _NEGATION_PATTERNS_COMPILED[:4]: # Only strong negation patterns for wider window
if neg_pat.search(wider_context):
return True
return False
def _get_context(text, keyword, window=100):
"""Extract context around a keyword match with sentence boundaries."""
text_lower = text.lower()
idx = text_lower.find(keyword.lower())
if idx == -1:
return ""
start = max(0, idx - window)
end = min(len(text), idx + len(keyword) + window)
context = text[start:end].strip()
if start > 0:
context = "..." + context
if end < len(text):
context = context + "..."
return context
def _keyword_in_text(text_lower, keyword):
    """Return True when *keyword* occurs in *text_lower* as a word, not a fragment.

    A plain substring test lets short acronyms fire inside unrelated words
    ("SAR" in "necessary", "AML" in "seamless", "DPO" in "endpoint").
    Every keyword must therefore start at a word boundary. Keywords written
    with capitals (acronyms / proper names) must also end at one, allowing an
    optional plural "s"; lower-case phrases keep prefix matching so inflected
    forms such as "consented" or "business associates" are still found.
    """
    needle = re.escape(keyword.lower())
    if keyword != keyword.lower():
        pattern = rf"(?<!\w){needle}s?(?!\w)"
    else:
        pattern = rf"(?<!\w){needle}"
    # re caches compiled patterns internally, so repeated calls stay cheap.
    return re.search(pattern, text_lower) is not None


def check_compliance(text):
    """Check contract text against all regulatory frameworks with negation handling.

    Args:
        text: Raw contract text.

    Returns:
        A dict keyed by regulation name. Each value holds:
        "description", "compliance_rate" (integer percentage of requirements
        with status PASS), "checks" (one dict per requirement with
        "requirement", "description", "severity", "status",
        "matched_keywords" and up to two "context" snippets),
        "overall_status", "negated_count" and "ambiguous_count".

        Requirement status is PASS (keyword found, none negated), NEGATED
        (every found keyword negated), AMBIGUOUS (both kinds found) or
        MISSING (no keyword found). Overall status is COMPLIANT (>= 80%),
        PARTIAL (>= 40%) or NON-COMPLIANT, overridden by WARNING when any
        CRITICAL/HIGH requirement is NEGATED.
    """
    text_lower = text.lower()
    results = {}
    for reg_name, reg_data in REGULATIONS.items():
        checks = []
        for req_name, req_data in reg_data["requirements"].items():
            matched = False
            negated = False
            matched_keywords = []
            context_snippets = []
            for kw in req_data["keywords"]:
                # Word-aware presence test; see _keyword_in_text for why a raw
                # substring check is not enough.
                if not _keyword_in_text(text_lower, kw):
                    continue
                matched_keywords.append(kw)
                # NOTE: the helpers below still anchor on the first raw
                # occurrence of the keyword in the text.
                if _check_negation(text_lower, kw):
                    negated = True
                else:
                    matched = True
                ctx = _get_context(text, kw)
                if ctx:
                    context_snippets.append(ctx)

            if matched and negated:
                status = "AMBIGUOUS"
            elif matched:
                status = "PASS"
            elif negated:
                status = "NEGATED"
            else:
                status = "MISSING"

            checks.append({
                "requirement": req_name,
                "description": req_data["description"],
                "severity": req_data["severity"],
                "status": status,
                "matched_keywords": matched_keywords,
                "context": context_snippets[:2],
            })

        total = len(checks)
        passed = sum(1 for c in checks if c["status"] == "PASS")
        compliance_rate = round(passed / total * 100) if total > 0 else 0
        negated_count = sum(1 for c in checks if c["status"] == "NEGATED")
        ambiguous_count = sum(1 for c in checks if c["status"] == "AMBIGUOUS")

        if compliance_rate >= 80:
            overall = "COMPLIANT"
        elif compliance_rate >= 40:
            overall = "PARTIAL"
        else:
            overall = "NON-COMPLIANT"
        # An explicitly disclaimed high-stakes requirement outranks the score.
        if any(c["status"] == "NEGATED" and c["severity"] in ("CRITICAL", "HIGH") for c in checks):
            overall = "WARNING"

        results[reg_name] = {
            "description": reg_data["description"],
            "compliance_rate": compliance_rate,
            "checks": checks,
            "overall_status": overall,
            "negated_count": negated_count,
            "ambiguous_count": ambiguous_count,
        }
    return results
def render_compliance_html(results):
    """Render compliance results as an HTML fragment for Gradio.

    Args:
        results: Mapping of regulation name to a result dict with
            "compliance_rate", "overall_status", "description", "checks" and
            optionally "negated_count" / "ambiguous_count". Each check needs
            "severity", "status", "description", "matched_keywords" and may
            carry a "context" list of snippets.

    Returns:
        A single HTML string: one card per regulation, one row per check.

    All interpolated text (names, descriptions, keywords, contract snippets)
    is HTML-escaped, because snippets come straight from the analysed
    document. A severity missing from RISK_STYLES falls back to a neutral
    grey badge instead of raising KeyError.
    """
    # Imported locally under a distinct name so the module-level namespace
    # and any existing `html` identifiers are left untouched.
    from html import escape

    def text(value):
        # Text-node escaping only (&, <, >); quotes are harmless there.
        return escape(str(value), quote=False)

    neutral_style = ("#6b7280", "#f9fafb")
    status_colors = {
        "COMPLIANT": ("#16a34a", "#f0fdf4"),
        "PARTIAL": ("#ca8a04", "#fefce8"),
        "NON-COMPLIANT": ("#dc2626", "#fef2f2"),
        "WARNING": ("#ea580c", "#fff7ed"),
    }
    status_icons = {"PASS": "✅", "MISSING": "❌", "NEGATED": "🚫", "AMBIGUOUS": "❓"}
    status_text_map = {"PASS": "Found", "MISSING": "Missing", "NEGATED": "Negated", "AMBIGUOUS": "Ambiguous"}

    # Collect fragments and join once instead of growing a string with +=.
    parts = ['<div style="font-family:system-ui,sans-serif;">']
    for reg_name, reg_result in results.items():
        rate = reg_result["compliance_rate"]
        status = reg_result["overall_status"]
        status_color, status_bg = status_colors.get(status, neutral_style)
        neg = reg_result.get("negated_count", 0)
        amb = reg_result.get("ambiguous_count", 0)

        warnings = ""
        if neg > 0:
            warnings += f'<span style="font-size:10px;color:#ea580c;margin-left:8px;">⚠️ {neg} negated</span>'
        if amb > 0:
            warnings += f'<span style="font-size:10px;color:#ca8a04;margin-left:8px;">❓ {amb} ambiguous</span>'

        safe_name = text(reg_name)
        safe_description = text(reg_result["description"])
        safe_status = text(status)
        parts.append(
            '\n<div style="border:1px solid #e5e7eb;border-radius:10px;margin-bottom:16px;overflow:hidden;">\n'
            f'<div style="display:flex;justify-content:space-between;align-items:center;padding:12px 16px;background:{status_bg};border-bottom:1px solid #e5e7eb;">\n'
            '<div>\n'
            f'<span style="font-size:16px;font-weight:700;color:#1f2937;">{safe_name}</span>\n'
            f'{warnings}\n'
            f'<p style="font-size:11px;color:#6b7280;margin:2px 0 0 0;">{safe_description}</p>\n'
            '</div>\n'
            '<div style="text-align:right;">\n'
            f'<div style="font-size:24px;font-weight:700;color:{status_color};">{rate}%</div>\n'
            f'<div style="font-size:11px;color:{status_color};font-weight:500;">{safe_status}</div>\n'
            '</div>\n'
            '</div>\n'
            '<div style="padding:8px 16px;">\n'
        )

        for check in reg_result["checks"]:
            severity = check["severity"]
            color, bg = RISK_STYLES.get(severity, neutral_style)
            status_icon = status_icons.get(check["status"], "❓")
            status_text = status_text_map.get(check["status"], "Unknown")

            matched = check["matched_keywords"]
            keywords = text(", ".join(matched[:3])) if matched else "—"

            context_html = ""
            if check.get("context"):
                # Truncate first, then escape, so an entity is never cut in half.
                ctx = text(check["context"][0][:120])
                context_html = f'<div style="font-size:10px;color:#6b7280;margin-top:2px;font-style:italic;">"{ctx}"</div>'

            safe_check_description = text(check["description"])
            safe_severity = text(severity)
            parts.append(
                '\n<div style="display:flex;justify-content:space-between;align-items:flex-start;padding:8px 0;border-bottom:1px solid #f3f4f6;">\n'
                '<div style="flex:1;">\n'
                f'<div style="font-size:12px;font-weight:500;color:#374151;">{safe_check_description}</div>\n'
                f'<div style="font-size:10px;color:#9ca3af;margin-top:2px;">Keywords: {keywords}</div>\n'
                f'{context_html}\n'
                '</div>\n'
                '<div style="display:flex;align-items:center;gap:6px;margin-left:8px;">\n'
                f'<span style="font-size:10px;color:{color};font-weight:600;background:{bg};padding:2px 8px;border-radius:4px;">{safe_severity}</span>\n'
                f'<span style="font-size:13px;" title="{status_text}">{status_icon}</span>\n'
                '</div>\n'
                '</div>\n'
            )
        parts.append('</div></div>')
    parts.append('</div>')
    return "".join(parts)