""" ClauseGuard — Compliance Checker v3.1 ═════════════════════════════════════ FIXED in v3.1: • FIX: Expanded negation window from 100 to 200 chars to catch cross-sentence negation • FIX: Added sentence-boundary-aware negation detection • FIX: Improved context extraction with sentence boundaries • FIX: Added AMBIGUOUS handling for mixed positive/negative signals """ import re from collections import defaultdict # Negation patterns that invert compliance meaning _NEGATION_PATTERNS = [ r"(?:does?\s+)?not\s+(?:require|provide|include|offer|grant|guarantee|ensure|maintain|comply|adhere|support|acknowledge)", r"(?:no|without)\s+(?:obligation|requirement|guarantee|warranty|commitment|responsibility|duty)", r"(?:exclud|waiv|disclaim|exempt|refus|deny|reject|eliminat|remov|revok)", r"shall\s+not\s+be\s+(?:required|obligated|responsible|liable|bound|subject)", r"is\s+not\s+(?:responsible|liable|required|obligated|bound|subject)", r"expressly\s+(?:disclaim|exclud|waiv|reject)", r"to\s+the\s+(?:maximum|fullest)\s+extent\s+(?:permitted|allowed).*(?:disclaim|exclud|waiv)", r"notwithstanding.*(?:shall\s+not|does\s+not|is\s+not)", ] # FIX v4.2: Pre-compile negation patterns at module level _NEGATION_PATTERNS_COMPILED = [re.compile(p, re.IGNORECASE) for p in _NEGATION_PATTERNS] # Regulatory requirement definitions REGULATIONS = { "GDPR": { "description": "EU General Data Protection Regulation (Regulation 2016/679)", "requirements": { "lawful_basis": { "keywords": ["lawful basis", "legal basis", "legitimate interest", "consent", "performance of contract", "legal obligation"], "description": "Must specify lawful basis for data processing (Art. 6)", "severity": "HIGH", }, "data_subject_rights": { "keywords": ["right to access", "right to erasure", "right to be forgotten", "data portability", "rectification", "object to processing"], "description": "Must acknowledge data subject rights (Arts. 15-22)", "severity": "HIGH", }, "data_breach_notification": { "keywords": ["data breach", "breach notification", "notify supervisory authority", "72 hours"], "description": "Must include data breach notification obligations (Art. 33)", "severity": "MEDIUM", }, "data_protection_officer": { "keywords": ["data protection officer", "DPO"], "description": "Should reference Data Protection Officer if applicable (Art. 37)", "severity": "LOW", }, "cross_border_transfer": { "keywords": ["standard contractual clauses", "SCCs", "adequacy decision", "transfer mechanism", "third country"], "description": "Must specify transfer safeguards for cross-border data (Arts. 44-49)", "severity": "HIGH", }, "privacy_by_design": { "keywords": ["privacy by design", "privacy by default", "data minimization", "purpose limitation"], "description": "Should reference privacy-by-design principles (Art. 25)", "severity": "MEDIUM", }, "data_processing_agreement": { "keywords": ["data processing agreement", "DPA", "data processor", "sub-processor"], "description": "Must include data processing agreement if sharing data (Art. 28)", "severity": "HIGH", }, }, }, "CCPA": { "description": "California Consumer Privacy Act (Cal. Civ. Code § 1798.100 et seq.)", "requirements": { "consumer_rights": { "keywords": ["right to know", "right to delete", "right to opt out", "right to non-discrimination", "consumer rights"], "description": "Must acknowledge California consumer rights", "severity": "HIGH", }, "data_categories": { "keywords": ["categories of personal information", "personal information categories", "identifiers", "commercial information"], "description": "Must disclose categories of personal information collected", "severity": "HIGH", }, "sale_of_data": { "keywords": ["do not sell my personal information", "opt-out of sale", "sale of personal information"], "description": "Must provide opt-out mechanism for data sales", "severity": "HIGH", }, "service_providers": { "keywords": ["service provider", "third party", "contractor", "business purpose"], "description": "Should limit data use to business/service provider purposes", "severity": "MEDIUM", }, }, }, "SOX": { "description": "Sarbanes-Oxley Act (US, 2002)", "requirements": { "internal_controls": { "keywords": ["internal controls", "internal control over financial reporting", "ICFR"], "description": "Must reference internal controls over financial reporting (§ 404)", "severity": "HIGH", }, "audit_committee": { "keywords": ["audit committee", "independent auditor", "PCAOB"], "description": "Should reference audit committee oversight", "severity": "MEDIUM", }, "whistleblower": { "keywords": ["whistleblower", "anonymous reporting", "reporting hotline", "retaliation"], "description": "Should protect whistleblower provisions (§ 806)", "severity": "HIGH", }, "document_retention": { "keywords": ["document retention", "record retention", "retention policy", "preserve records"], "description": "Must include document retention obligations (§ 802)", "severity": "HIGH", }, }, }, "HIPAA": { "description": "Health Insurance Portability and Accountability Act (US, 1996)", "requirements": { "phi_protection": { "keywords": ["protected health information", "PHI", "health information", "ePHI"], "description": "Must protect PHI and limit uses/disclosures", "severity": "CRITICAL", }, "business_associate": { "keywords": ["business associate agreement", "BAA", "business associate", "covered entity"], "description": "Should reference Business Associate Agreement (§ 164.504(e))", "severity": "HIGH", }, "security_safeguards": { "keywords": ["administrative safeguards", "technical safeguards", "physical safeguards", "encryption", "access controls"], "description": "Must implement security safeguards (§ 164.308-312)", "severity": "HIGH", }, "breach_notification": { "keywords": ["breach notification", "notification of breach", "unauthorized access"], "description": "Must include breach notification obligations (§ 164.400-414)", "severity": "HIGH", }, }, }, "FINRA": { "description": "Financial Industry Regulatory Authority (US)", "requirements": { "recordkeeping": { "keywords": ["recordkeeping", "books and records", "retain records", "SEC Rule 17a-4"], "description": "Must comply with recordkeeping rules (FINRA Rule 4511)", "severity": "HIGH", }, "supervision": { "keywords": ["supervision", "supervisory system", "review and approval"], "description": "Should reference supervisory obligations (FINRA Rule 3110)", "severity": "MEDIUM", }, "anti_money_laundering": { "keywords": ["anti-money laundering", "AML", "suspicious activity", "SAR", "OFAC"], "description": "Must reference AML compliance (FINRA Rule 3310)", "severity": "HIGH", }, "privacy": { "keywords": ["privacy policy", "customer information", "Regulation S-P", "nonpublic personal information"], "description": "Must protect customer information (Regulation S-P)", "severity": "HIGH", }, }, }, } RISK_STYLES = { "CRITICAL": ("#dc2626", "#fef2f2"), "HIGH": ("#ea580c", "#fff7ed"), "MEDIUM": ("#ca8a04", "#fefce8"), "LOW": ("#16a34a", "#f0fdf4"), } def _get_sentence_containing(text_lower, keyword_lower, start_idx): """FIX v3.1: Extract the full sentence containing the keyword match.""" # Find sentence boundaries around the match # Look backward for sentence start sent_start = start_idx for i in range(start_idx - 1, max(0, start_idx - 500), -1): if text_lower[i] in '.!?' and i < start_idx - 2: sent_start = i + 1 break else: sent_start = max(0, start_idx - 500) # Look forward for sentence end sent_end = start_idx + len(keyword_lower) for i in range(sent_end, min(len(text_lower), sent_end + 500)): if text_lower[i] in '.!?': sent_end = i + 1 break else: sent_end = min(len(text_lower), sent_end + 500) return text_lower[sent_start:sent_end].strip() def _check_negation(text_lower, keyword, window=200): """FIX v3.1: Check if a keyword match is negated — uses sentence-aware window.""" idx = text_lower.find(keyword.lower()) if idx == -1: return False # Get sentence-aware context (more accurate than fixed window) sentence = _get_sentence_containing(text_lower, keyword.lower(), idx) # Also get a wider window for cross-sentence negation start = max(0, idx - window) end = min(len(text_lower), idx + len(keyword) + window) wider_context = text_lower[start:end] # Check sentence first (higher confidence) for neg_pat in _NEGATION_PATTERNS_COMPILED: if neg_pat.search(sentence): return True # Then check wider window (lower confidence, still relevant) for neg_pat in _NEGATION_PATTERNS_COMPILED[:4]: # Only strong negation patterns for wider window if neg_pat.search(wider_context): return True return False def _get_context(text, keyword, window=100): """Extract context around a keyword match with sentence boundaries.""" text_lower = text.lower() idx = text_lower.find(keyword.lower()) if idx == -1: return "" start = max(0, idx - window) end = min(len(text), idx + len(keyword) + window) context = text[start:end].strip() if start > 0: context = "..." + context if end < len(text): context = context + "..." return context # FIX v4.3: Regulation applicability gates — only apply regulations relevant to the contract type _REGULATION_GATES = { "SOX": re.compile( r'financial\s+statement|internal\s+control|audit\s+committee|public\s+company|sec\s+filing|pcaob|sarbanes', re.IGNORECASE ), "HIPAA": re.compile( r'protected\s+health|(? 0 else 0 negated_count = sum(1 for c in checks if c["status"] == "NEGATED") ambiguous_count = sum(1 for c in checks if c["status"] == "AMBIGUOUS") if compliance_rate >= 80: overall = "COMPLIANT" elif compliance_rate >= 40: overall = "PARTIAL" else: overall = "NON-COMPLIANT" if any(c["status"] == "NEGATED" and c["severity"] in ("CRITICAL", "HIGH") for c in checks): overall = "WARNING" results[reg_name] = { "description": reg_data["description"], "compliance_rate": compliance_rate, "checks": checks, "overall_status": overall, "negated_count": negated_count, "ambiguous_count": ambiguous_count, } return results def render_compliance_html(results): """Render compliance results as HTML for Gradio.""" html = '
' for reg_name, reg_result in results.items(): rate = reg_result["compliance_rate"] status = reg_result["overall_status"] # FIX v4.3: Handle NOT_APPLICABLE regulations if status == "NOT_APPLICABLE": note = reg_result.get("note", f"{reg_name} not applicable to this contract.") html += f'''
{reg_name}

{reg_result["description"]}

N/A
Not Applicable
{note}
''' continue status_colors = { "COMPLIANT": ("#16a34a", "#f0fdf4"), "PARTIAL": ("#ca8a04", "#fefce8"), "NON-COMPLIANT": ("#dc2626", "#fef2f2"), "WARNING": ("#ea580c", "#fff7ed"), } status_color, status_bg = status_colors.get(status, ("#6b7280", "#f9fafb")) neg = reg_result.get("negated_count", 0) amb = reg_result.get("ambiguous_count", 0) warnings = "" if neg > 0: warnings += f'⚠️ {neg} negated' if amb > 0: warnings += f'❓ {amb} ambiguous' html += f'''
{reg_name} {warnings}

{reg_result["description"]}

{rate}%
{status}
''' for check in reg_result["checks"]: color, bg = RISK_STYLES[check["severity"]] status_icons = {"PASS": "✅", "MISSING": "❌", "NEGATED": "🚫", "AMBIGUOUS": "❓"} status_icon = status_icons.get(check["status"], "❓") status_text_map = {"PASS": "Found", "MISSING": "Missing", "NEGATED": "Negated", "AMBIGUOUS": "Ambiguous"} status_text = status_text_map.get(check["status"], "Unknown") keywords = ", ".join(check["matched_keywords"][:3]) if check["matched_keywords"] else "—" context_html = "" if check.get("context"): ctx = check["context"][0][:120].replace("<", "<").replace(">", ">") context_html = f'
"{ctx}"
' html += f'''
{check["description"]}
Keywords: {keywords}
{context_html}
{check["severity"]} {status_icon}
''' html += '
' html += '
' return html