Spaces:

gaurv007
/

ClauseGuard

Sleeping

File size: 20,939 Bytes

"""
ClauseGuard — Compliance Checker v3.1
═════════════════════════════════════
FIXED in v3.1:
  • FIX: Expanded negation window from 100 to 200 chars to catch cross-sentence negation
  • FIX: Added sentence-boundary-aware negation detection
  • FIX: Improved context extraction with sentence boundaries
  • FIX: Added AMBIGUOUS handling for mixed positive/negative signals
"""

import re
from collections import defaultdict

# Negation patterns that invert compliance meaning.
# Each entry is a regex source string; the compiled forms below are searched by
# _check_negation against lowercased text surrounding a keyword hit.
# NOTE: _check_negation applies only the first four entries (slice [:4]) to its
# wider cross-sentence window, so the ORDER of this list is significant —
# inserting or reordering entries changes which patterns run on that window.
_NEGATION_PATTERNS = [
    # "not require", "does not provide", "do not comply", ...
    r"(?:does?\s+)?not\s+(?:require|provide|include|offer|grant|guarantee|ensure|maintain|comply|adhere|support|acknowledge)",
    # "no obligation", "without warranty", ...
    r"(?:no|without)\s+(?:obligation|requirement|guarantee|warranty|commitment|responsibility|duty)",
    # Bare word stems with no word boundaries: they match anywhere inside a word
    # (e.g. "excluded", "waiver", "removal").
    r"(?:exclud|waiv|disclaim|exempt|refus|deny|reject|eliminat|remov|revok)",
    r"shall\s+not\s+be\s+(?:required|obligated|responsible|liable|bound|subject)",
    r"is\s+not\s+(?:responsible|liable|required|obligated|bound|subject)",
    r"expressly\s+(?:disclaim|exclud|waiv|reject)",
    # The ".*" in the next two patterns does not cross newlines (no DOTALL flag
    # is passed when compiling below).
    r"to\s+the\s+(?:maximum|fullest)\s+extent\s+(?:permitted|allowed).*(?:disclaim|exclud|waiv)",
    r"notwithstanding.*(?:shall\s+not|does\s+not|is\s+not)",
]

# FIX v4.2: Pre-compile negation patterns at module level
# Compiled once at import, case-insensitively, in the same order as the list above.
_NEGATION_PATTERNS_COMPILED = [re.compile(p, re.IGNORECASE) for p in _NEGATION_PATTERNS]

# Regulatory requirement definitions.
#
# Shape: {regulation name: {"description": str,
#                           "requirements": {requirement id: {"keywords": [str, ...],
#                                                             "description": str,
#                                                             "severity": str}}}}
#
# How check_compliance consumes this table:
#   * keywords are lowercased before matching, so their case here is cosmetic;
#   * keywords of 4 characters or fewer (e.g. "DPO", "SAR", "SCCs") must match
#     as whole words, longer keywords match as plain substrings;
#   * GDPR and CCPA are always evaluated, the other regulations only when
#     their _REGULATION_GATES pattern matches the contract text.
# render_compliance_html looks each "severity" up in RISK_STYLES, so severity
# values used here need a matching RISK_STYLES key.
REGULATIONS = {
    "GDPR": {
        "description": "EU General Data Protection Regulation (Regulation 2016/679)",
        "requirements": {
            "lawful_basis": {
                "keywords": ["lawful basis", "legal basis", "legitimate interest", "consent", "performance of contract", "legal obligation"],
                "description": "Must specify lawful basis for data processing (Art. 6)",
                "severity": "HIGH",
            },
            "data_subject_rights": {
                "keywords": ["right to access", "right to erasure", "right to be forgotten", "data portability", "rectification", "object to processing"],
                "description": "Must acknowledge data subject rights (Arts. 15-22)",
                "severity": "HIGH",
            },
            "data_breach_notification": {
                "keywords": ["data breach", "breach notification", "notify supervisory authority", "72 hours"],
                "description": "Must include data breach notification obligations (Art. 33)",
                "severity": "MEDIUM",
            },
            "data_protection_officer": {
                "keywords": ["data protection officer", "DPO"],
                "description": "Should reference Data Protection Officer if applicable (Art. 37)",
                "severity": "LOW",
            },
            "cross_border_transfer": {
                "keywords": ["standard contractual clauses", "SCCs", "adequacy decision", "transfer mechanism", "third country"],
                "description": "Must specify transfer safeguards for cross-border data (Arts. 44-49)",
                "severity": "HIGH",
            },
            "privacy_by_design": {
                "keywords": ["privacy by design", "privacy by default", "data minimization", "purpose limitation"],
                "description": "Should reference privacy-by-design principles (Art. 25)",
                "severity": "MEDIUM",
            },
            "data_processing_agreement": {
                "keywords": ["data processing agreement", "DPA", "data processor", "sub-processor"],
                "description": "Must include data processing agreement if sharing data (Art. 28)",
                "severity": "HIGH",
            },
        },
    },
    "CCPA": {
        "description": "California Consumer Privacy Act (Cal. Civ. Code § 1798.100 et seq.)",
        "requirements": {
            "consumer_rights": {
                "keywords": ["right to know", "right to delete", "right to opt out", "right to non-discrimination", "consumer rights"],
                "description": "Must acknowledge California consumer rights",
                "severity": "HIGH",
            },
            "data_categories": {
                "keywords": ["categories of personal information", "personal information categories", "identifiers", "commercial information"],
                "description": "Must disclose categories of personal information collected",
                "severity": "HIGH",
            },
            "sale_of_data": {
                "keywords": ["do not sell my personal information", "opt-out of sale", "sale of personal information"],
                "description": "Must provide opt-out mechanism for data sales",
                "severity": "HIGH",
            },
            "service_providers": {
                "keywords": ["service provider", "third party", "contractor", "business purpose"],
                "description": "Should limit data use to business/service provider purposes",
                "severity": "MEDIUM",
            },
        },
    },
    # Gated: only evaluated when _REGULATION_GATES["SOX"] matches the contract.
    "SOX": {
        "description": "Sarbanes-Oxley Act (US, 2002)",
        "requirements": {
            "internal_controls": {
                "keywords": ["internal controls", "internal control over financial reporting", "ICFR"],
                "description": "Must reference internal controls over financial reporting (§ 404)",
                "severity": "HIGH",
            },
            "audit_committee": {
                "keywords": ["audit committee", "independent auditor", "PCAOB"],
                "description": "Should reference audit committee oversight",
                "severity": "MEDIUM",
            },
            "whistleblower": {
                "keywords": ["whistleblower", "anonymous reporting", "reporting hotline", "retaliation"],
                "description": "Should protect whistleblower provisions (§ 806)",
                "severity": "HIGH",
            },
            "document_retention": {
                "keywords": ["document retention", "record retention", "retention policy", "preserve records"],
                "description": "Must include document retention obligations (§ 802)",
                "severity": "HIGH",
            },
        },
    },
    # Gated: only evaluated when _REGULATION_GATES["HIPAA"] matches the contract.
    "HIPAA": {
        "description": "Health Insurance Portability and Accountability Act (US, 1996)",
        "requirements": {
            "phi_protection": {
                "keywords": ["protected health information", "PHI", "health information", "ePHI"],
                "description": "Must protect PHI and limit uses/disclosures",
                "severity": "CRITICAL",
            },
            "business_associate": {
                "keywords": ["business associate agreement", "BAA", "business associate", "covered entity"],
                "description": "Should reference Business Associate Agreement (§ 164.504(e))",
                "severity": "HIGH",
            },
            "security_safeguards": {
                "keywords": ["administrative safeguards", "technical safeguards", "physical safeguards", "encryption", "access controls"],
                "description": "Must implement security safeguards (§ 164.308-312)",
                "severity": "HIGH",
            },
            "breach_notification": {
                "keywords": ["breach notification", "notification of breach", "unauthorized access"],
                "description": "Must include breach notification obligations (§ 164.400-414)",
                "severity": "HIGH",
            },
        },
    },
    # Gated: only evaluated when _REGULATION_GATES["FINRA"] matches the contract.
    "FINRA": {
        "description": "Financial Industry Regulatory Authority (US)",
        "requirements": {
            "recordkeeping": {
                "keywords": ["recordkeeping", "books and records", "retain records", "SEC Rule 17a-4"],
                "description": "Must comply with recordkeeping rules (FINRA Rule 4511)",
                "severity": "HIGH",
            },
            "supervision": {
                "keywords": ["supervision", "supervisory system", "review and approval"],
                "description": "Should reference supervisory obligations (FINRA Rule 3110)",
                "severity": "MEDIUM",
            },
            "anti_money_laundering": {
                "keywords": ["anti-money laundering", "AML", "suspicious activity", "SAR", "OFAC"],
                "description": "Must reference AML compliance (FINRA Rule 3310)",
                "severity": "HIGH",
            },
            "privacy": {
                "keywords": ["privacy policy", "customer information", "Regulation S-P", "nonpublic personal information"],
                "description": "Must protect customer information (Regulation S-P)",
                "severity": "HIGH",
            },
        },
    },
}

# Severity label -> (text colour, background colour) as CSS hex strings.
# render_compliance_html unpacks each pair and uses the first value for the
# severity badge's `color` and the second for its `background`.
RISK_STYLES = {
    "CRITICAL": ("#dc2626", "#fef2f2"),
    "HIGH": ("#ea580c", "#fff7ed"),
    "MEDIUM": ("#ca8a04", "#fefce8"),
    "LOW": ("#16a34a", "#f0fdf4"),
}


def _get_sentence_containing(text_lower, keyword_lower, start_idx):
    """FIX v3.1: Extract the full sentence containing the keyword match."""
    # Find sentence boundaries around the match
    # Look backward for sentence start
    sent_start = start_idx
    for i in range(start_idx - 1, max(0, start_idx - 500), -1):
        if text_lower[i] in '.!?' and i < start_idx - 2:
            sent_start = i + 1
            break
    else:
        sent_start = max(0, start_idx - 500)

    # Look forward for sentence end
    sent_end = start_idx + len(keyword_lower)
    for i in range(sent_end, min(len(text_lower), sent_end + 500)):
        if text_lower[i] in '.!?':
            sent_end = i + 1
            break
    else:
        sent_end = min(len(text_lower), sent_end + 500)

    return text_lower[sent_start:sent_end].strip()


def _check_negation(text_lower, keyword, window=200):
    """Report whether the first occurrence of ``keyword`` sits in negating language.

    Two contexts are inspected around the keyword position:

    1. the enclosing sentence (via ``_get_sentence_containing``), tested
       against every compiled negation pattern;
    2. a fixed ``window``-character span on each side, tested only against the
       first four compiled patterns, to catch negation that spills over a
       sentence boundary.

    Args:
        text_lower: Lowercased contract text.
        keyword: Keyword to locate (any case; it is lowercased here).
        window: Number of characters on each side for the wider context.

    Returns:
        True if any applicable negation pattern matches either context,
        False otherwise (including when the keyword is not present).
    """
    kw_lower = keyword.lower()

    # Locate the keyword the same way check_compliance detects it: keywords of
    # four characters or fewer count only as whole words there, so a plain
    # substring search could land on an unrelated word (e.g. "sar" inside
    # "necessary") and judge negation in the wrong place.
    idx = -1
    if 1 <= len(kw_lower) <= 4:
        whole_word = re.search(r'\b' + re.escape(kw_lower) + r'\b', text_lower)
        if whole_word:
            idx = whole_word.start()
    if idx == -1:
        # Longer keywords, or no whole-word hit: fall back to the substring
        # search so direct callers keep getting an answer for substring hits.
        idx = text_lower.find(kw_lower)
    if idx == -1:
        return False

    # Sentence-level context: higher confidence, so every pattern applies.
    sentence = _get_sentence_containing(text_lower, kw_lower, idx)
    if any(neg_pat.search(sentence) for neg_pat in _NEGATION_PATTERNS_COMPILED):
        return True

    # Wider window for cross-sentence negation; restricted to the first four
    # patterns to limit false positives.
    # NOTE(review): the third of those four is the bare-stem pattern
    # ("remov", "reject", ...), which is broad for a window this wide —
    # confirm that including it here is intended.
    start = max(0, idx - window)
    end = min(len(text_lower), idx + len(kw_lower) + window)
    wider_context = text_lower[start:end]
    return any(
        neg_pat.search(wider_context) for neg_pat in _NEGATION_PATTERNS_COMPILED[:4]
    )


def _get_context(text, keyword, window=100):
    """Extract context around a keyword match with sentence boundaries."""
    text_lower = text.lower()
    idx = text_lower.find(keyword.lower())
    if idx == -1:
        return ""
    start = max(0, idx - window)
    end = min(len(text), idx + len(keyword) + window)
    context = text[start:end].strip()
    if start > 0:
        context = "..." + context
    if end < len(text):
        context = context + "..."
    return context


# FIX v4.3: Regulation applicability gates — only apply regulations relevant to the contract type
# Maps a regulation name to a case-insensitive regex. check_compliance searches
# the contract text with each pattern and evaluates the regulation only when
# its pattern matches; regulations without an entry here (GDPR, CCPA) are
# always evaluated.
_REGULATION_GATES = {
    "SOX": re.compile(
        r'financial\s+statement|internal\s+control|audit\s+committee|public\s+company|sec\s+filing|pcaob|sarbanes',
        re.IGNORECASE
    ),
    # The (?<!\w)...(?!\w) guards keep the short acronyms from matching inside
    # longer words.
    "HIPAA": re.compile(
        r'protected\s+health|(?<!\w)phi(?!\w)|health\s+information|medical\s+record|business\s+associate\s+agreement|(?<!\w)baa(?!\w)|hipaa',
        re.IGNORECASE
    ),
    "FINRA": re.compile(
        r'securities|broker[\-\s]?dealer|investment\s+advis|financial\s+industry|(?<!\w)finra(?!\w)|registered\s+representative',
        re.IGNORECASE
    ),
}


def check_compliance(text):
    """Evaluate contract text against each regulation in ``REGULATIONS``.

    Behaviour (FIX v4.3):
      - GDPR and CCPA are always evaluated; SOX, HIPAA and FINRA only when
        their ``_REGULATION_GATES`` pattern matches the contract. Gated-out
        regulations still appear in the result, marked ``NOT_APPLICABLE``
        with a ``compliance_rate`` of -1.
      - Keywords of four characters or fewer must match as whole words;
        longer keywords match as substrings of the lowercased text.
      - Every matched keyword is checked for negation; each requirement ends
        up as PASS, NEGATED, AMBIGUOUS (both signals seen) or MISSING.

    Args:
        text: Raw contract text.

    Returns:
        Dict keyed by regulation name. Each value holds ``description``,
        ``compliance_rate`` (percentage of PASS checks, or -1),
        ``checks`` (one dict per requirement), ``overall_status``,
        ``negated_count`` and ``ambiguous_count``; NOT_APPLICABLE entries
        additionally carry a ``note``.
    """
    lowered = text.lower()

    # Regulations in scope: the two unconditional ones plus every gated
    # regulation whose gate pattern fires on the contract.
    in_scope = {"GDPR", "CCPA"}
    in_scope.update(
        name for name, gate in _REGULATION_GATES.items() if gate.search(text)
    )

    report = {}
    for reg_name, reg_data in REGULATIONS.items():
        if reg_name not in in_scope:
            # Keep a placeholder entry so the renderer can show the regulation as N/A.
            report[reg_name] = {
                "description": reg_data["description"],
                "compliance_rate": -1,  # -1 = not applicable
                "checks": [],
                "overall_status": "NOT_APPLICABLE",
                "negated_count": 0,
                "ambiguous_count": 0,
                "note": f"{reg_name} does not appear applicable to this contract type.",
            }
            continue

        checks = []
        for req_name, req_data in reg_data["requirements"].items():
            hits = []
            snippets = []
            saw_affirmative = False
            saw_negated = False

            for kw in req_data["keywords"]:
                needle = kw.lower()
                if len(needle) <= 4:
                    # Short acronyms (SAR, DPO, PHI, BAA) must stand alone as words.
                    present = re.search(
                        r'\b' + re.escape(needle) + r'\b', lowered, re.IGNORECASE
                    ) is not None
                else:
                    # Longer phrases may match as a substring.
                    present = needle in lowered
                if not present:
                    continue

                hits.append(kw)
                if _check_negation(lowered, kw):
                    saw_negated = True
                else:
                    saw_affirmative = True

                snippet = _get_context(text, kw)
                if snippet:
                    snippets.append(snippet)

            if saw_affirmative:
                status = "AMBIGUOUS" if saw_negated else "PASS"
            else:
                status = "NEGATED" if saw_negated else "MISSING"

            checks.append({
                "requirement": req_name,
                "description": req_data["description"],
                "severity": req_data["severity"],
                "status": status,
                "matched_keywords": hits,
                "context": snippets[:2],
            })

        statuses = [entry["status"] for entry in checks]
        total = len(checks)
        passed = statuses.count("PASS")
        compliance_rate = round(passed / total * 100) if total > 0 else 0

        # A negated CRITICAL/HIGH requirement overrides the rate-based verdict.
        serious_negation = any(
            entry["status"] == "NEGATED" and entry["severity"] in ("CRITICAL", "HIGH")
            for entry in checks
        )
        if serious_negation:
            overall = "WARNING"
        elif compliance_rate >= 80:
            overall = "COMPLIANT"
        elif compliance_rate >= 40:
            overall = "PARTIAL"
        else:
            overall = "NON-COMPLIANT"

        report[reg_name] = {
            "description": reg_data["description"],
            "compliance_rate": compliance_rate,
            "checks": checks,
            "overall_status": overall,
            "negated_count": statuses.count("NEGATED"),
            "ambiguous_count": statuses.count("AMBIGUOUS"),
        }

    return report


def render_compliance_html(results):
    """Render compliance results as HTML for Gradio.

    Produces one card per regulation: a greyed-out "N/A" card for entries
    whose ``overall_status`` is ``NOT_APPLICABLE``, otherwise a header with
    the compliance rate and status followed by one row per requirement check.

    All text taken from ``results`` (names, descriptions, notes, keywords and
    contract excerpts) is HTML-escaped before being inserted, so markup or
    entity-like sequences in contract text are displayed literally.

    Args:
        results: Mapping of regulation name to a result dict in the shape
            returned by ``check_compliance`` (``description``,
            ``compliance_rate``, ``overall_status``, ``checks`` and optionally
            ``negated_count``, ``ambiguous_count``, ``note``).

    Returns:
        A single HTML string wrapped in one outer ``<div>``.
    """
    # Local, aliased import: keeps this block self-contained and avoids any
    # clash with a variable named ``html``.
    from html import escape as _esc

    def _safe(value):
        # Escape any value for use as HTML text content.
        return _esc(str(value))

    # Lookup tables are loop-invariant, so they are built once up front.
    status_colors = {
        "COMPLIANT": ("#16a34a", "#f0fdf4"),
        "PARTIAL": ("#ca8a04", "#fefce8"),
        "NON-COMPLIANT": ("#dc2626", "#fef2f2"),
        "WARNING": ("#ea580c", "#fff7ed"),
    }
    status_icons = {"PASS": "✅", "MISSING": "❌", "NEGATED": "🚫", "AMBIGUOUS": "❓"}
    status_text_map = {"PASS": "Found", "MISSING": "Missing", "NEGATED": "Negated", "AMBIGUOUS": "Ambiguous"}
    neutral_style = ("#6b7280", "#f9fafb")  # grey fallback for unknown status/severity

    # Fragments are collected and joined once at the end.
    parts = ['<div style="font-family:system-ui,sans-serif;">']

    for reg_name, reg_result in results.items():
        rate = reg_result["compliance_rate"]
        status = reg_result["overall_status"]
        safe_name = _safe(reg_name)
        safe_desc = _safe(reg_result["description"])

        # FIX v4.3: Handle NOT_APPLICABLE regulations
        if status == "NOT_APPLICABLE":
            note = _safe(reg_result.get("note", f"{reg_name} not applicable to this contract."))
            parts.append(f'''
            <div style="border:1px solid #e5e7eb;border-radius:10px;margin-bottom:16px;overflow:hidden;opacity:0.6;">
              <div style="display:flex;justify-content:space-between;align-items:center;padding:12px 16px;background:#f9fafb;border-bottom:1px solid #e5e7eb;">
                <div>
                  <span style="font-size:16px;font-weight:700;color:#9ca3af;">{safe_name}</span>
                  <p style="font-size:11px;color:#9ca3af;margin:2px 0 0 0;">{safe_desc}</p>
                </div>
                <div style="text-align:right;">
                  <div style="font-size:12px;font-weight:600;color:#9ca3af;">N/A</div>
                  <div style="font-size:10px;color:#9ca3af;">Not Applicable</div>
                </div>
              </div>
              <div style="padding:10px 16px;font-size:11px;color:#9ca3af;font-style:italic;">
                {note}
              </div>
            </div>
            ''')
            continue

        status_color, status_bg = status_colors.get(status, neutral_style)

        neg = reg_result.get("negated_count", 0)
        amb = reg_result.get("ambiguous_count", 0)
        warnings = ""
        if neg > 0:
            warnings += f'<span style="font-size:10px;color:#ea580c;margin-left:8px;">⚠️ {neg} negated</span>'
        if amb > 0:
            warnings += f'<span style="font-size:10px;color:#ca8a04;margin-left:8px;">❓ {amb} ambiguous</span>'

        parts.append(f'''
        <div style="border:1px solid #e5e7eb;border-radius:10px;margin-bottom:16px;overflow:hidden;">
          <div style="display:flex;justify-content:space-between;align-items:center;padding:12px 16px;background:{status_bg};border-bottom:1px solid #e5e7eb;">
            <div>
              <span style="font-size:16px;font-weight:700;color:#1f2937;">{safe_name}</span>
              {warnings}
              <p style="font-size:11px;color:#6b7280;margin:2px 0 0 0;">{safe_desc}</p>
            </div>
            <div style="text-align:right;">
              <div style="font-size:24px;font-weight:700;color:{status_color};">{rate}%</div>
              <div style="font-size:11px;color:{status_color};font-weight:500;">{_safe(status)}</div>
            </div>
          </div>
          <div style="padding:8px 16px;">
        ''')

        for check in reg_result["checks"]:
            # An unknown severity gets the neutral style instead of raising KeyError.
            color, bg = RISK_STYLES.get(check["severity"], neutral_style)
            status_icon = status_icons.get(check["status"], "❓")
            status_text = status_text_map.get(check["status"], "Unknown")
            if check["matched_keywords"]:
                keywords = _safe(", ".join(check["matched_keywords"][:3]))
            else:
                keywords = "—"

            context_html = ""
            if check.get("context"):
                # Truncate first, then escape, so an entity is never cut in half.
                ctx = _safe(check["context"][0][:120])
                context_html = f'<div style="font-size:10px;color:#6b7280;margin-top:2px;font-style:italic;">"{ctx}"</div>'

            parts.append(f'''
            <div style="display:flex;justify-content:space-between;align-items:flex-start;padding:8px 0;border-bottom:1px solid #f3f4f6;">
              <div style="flex:1;">
                <div style="font-size:12px;font-weight:500;color:#374151;">{_safe(check["description"])}</div>
                <div style="font-size:10px;color:#9ca3af;margin-top:2px;">Keywords: {keywords}</div>
                {context_html}
              </div>
              <div style="display:flex;align-items:center;gap:6px;margin-left:8px;">
                <span style="font-size:10px;color:{color};font-weight:600;background:{bg};padding:2px 8px;border-radius:4px;">{_safe(check["severity"])}</span>
                <span style="font-size:13px;" title="{status_text}">{status_icon}</span>
              </div>
            </div>
            ''')

        # Close the checks container and the regulation card.
        parts.append('</div></div>')

    parts.append('</div>')
    return "".join(parts)