"""
ClauseGuard — Obligation Tracker v3.0
═════════════════════════════════════
FIXED in v3.0:
• Reduced false positives (filter out generic service descriptions)
• Better party extraction with role detection
• Obligation priority scoring
• Context-aware obligation type detection
"""

import re
from collections import defaultdict
from datetime import datetime, timedelta  # not used in this module; kept for importers
from html import escape as _html_escape

# Obligation keywords by category — more specific patterns to reduce false positives
OBLIGATION_PATTERNS = {
    "monetary": [
        r"(?:shall|must|will|agrees? to)\s+pay\s+(?:a\s+)?(?:(?:monthly|annual|quarterly)\s+)?(?:fee|amount|sum|payment)?\s*(?:of\s+)?(?:\$[\d,]+(?:\.\d{2})?)",
        r"(?:fee|payment|compensation|reimburs(?:e|ement))\s+(?:of|in the amount of)\s+\$[\d,]+",
        r"(?:shall|must|will)\s+remit\s+\$[\d,]+",
        r"(?:liquidated damages|penalty)\s+(?:of|in the amount of)\s+\$[\d,]+",
        r"(?:shall|must)\s+(?:pay|reimburse)\s+(?:all|any)\s+(?:outstanding|overdue|unpaid)",
    ],
    "compliance": [
        r"(?:shall|must|will)\s+comply\s+with\s+(?:all\s+)?(?:applicable\s+)?(?:laws|regulations|standards|requirements)",
        r"(?:shall|must|will)\s+(?:adhere|conform)\s+to\s+(?:the|all|applicable)",
        r"(?:shall|must|will)\s+(?:obtain|maintain|procure)\s+(?:all\s+)?(?:necessary|required|applicable)\s+(?:approvals?|permits?|licenses?|certifications?)",
        r"(?:shall|must|will)\s+maintain\s+(?:insurance|coverage|bond|policy)",
        r"(?:shall|must|will)\s+ensure\s+(?:compliance|conformance|adherence)",
    ],
    "reporting": [
        r"(?:shall|must|will)\s+(?:report|disclose)\s+(?:to|any)\s+(?:the|supervisory|regulatory)",
        r"(?:shall|must|will)\s+provide\s+(?:regular|monthly|quarterly|annual|periodic)\s+(?:reports?|updates?|statements?)",
        r"(?:shall|must|will)\s+(?:notify|inform)\s+(?:the other party|promptly|immediately|within)",
        r"(?:shall|must|will)\s+deliver\s+(?:a|an|the)\s+(?:report|statement|notice|certificate)",
        r"(?:shall|must|will)\s+provide\s+(?:SOC|audit|compliance)\s+(?:\d+\s+)?(?:Type\s+)?(?:reports?|certificates?)",
    ],
    "delivery": [
        r"(?:shall|must|will)\s+deliver\s+(?:the|all|any)\s+(?:products?|goods?|materials?|deliverables?|services?)",
        r"(?:shall|must|will)\s+(?:furnish|supply)\s+(?:the|all|any)",
        r"(?:shall|must|will)\s+(?:submit|produce|complete)\s+(?:the|all|any)\s+(?:work|deliverables?|results?)",
        r"(?:delivery|performance)\s+(?:date|schedule|deadline|timeline|milestone)",
    ],
    "termination": [
        r"(?:shall|must|will)\s+(?:return|surrender)\s+(?:all|any)\s+(?:materials?|property|documents?|data|information|equipment)",
        r"(?:shall|must|will)\s+(?:destroy|delete|erase)\s+(?:all|any)\s+(?:copies|data|information|records?|materials?)",
        r"(?:shall|must|will)\s+(?:cease|discontinue)\s+(?:all|any)\s+(?:use|access|activities)",
        r"(?:upon|after|following)\s+termination.*(?:shall|must|will)\s+(?:pay|return|destroy|cease)",
        r"(?:surviving|post-termination)\s+obligations?",
    ],
}

# More restrictive — patterns that DON'T indicate obligations (false positive filters)
_FALSE_POSITIVE_PATTERNS = [
    r"^(?:the|this)\s+(?:agreement|contract|document)\s+(?:shall|will)\s+(?:be|become|remain)",
    r"(?:shall|will)\s+(?:be\s+)?(?:governed|construed|interpreted)",
    r"(?:shall|will)\s+(?:constitute|represent|mean|include)",
    r"(?:shall|will)\s+(?:not\s+)?(?:be\s+)?(?:deemed|considered|construed)",
    r"(?:shall|will)\s+(?:have|possess)\s+(?:the\s+)?(?:right|authority|power)",
    r"(?:shall|will)\s+(?:survive|remain\s+in\s+(?:effect|force))",
]

# Timeframe extraction. Order matters: the first pattern that matches a
# sentence decides the deadline.
TIME_PATTERNS = [
    (r"within\s+(\d+)\s+(day|week|month|year)s?", "relative"),
    (r"no\s+later\s+than\s+(\d+)\s+(day|week|month|year)s?", "relative"),
    (r"within\s+(\d+)\s+business\s+days?", "business_days"),
    (r"by\s+([A-Z][a-z]+\s+\d{1,2},?\s+\d{4})", "absolute"),
    (r"on\s+or\s+before\s+([A-Z][a-z]+\s+\d{1,2},?\s+\d{4})", "absolute"),
    (r"(\d{1,2}/\d{1,2}/\d{2,4})", "absolute_date"),
    (r"(?:promptly|immediately)(?:\s+(?:upon|after|following))?", "immediate"),
]

PARTY_PATTERNS = [
    r"\b(?:Party A|Party B|Disclosing Party|Receiving Party|Licensor|Licensee|Buyer|Seller|Tenant|Landlord|Employer|Employee|Company|Customer|Vendor|Client|Provider|Contractor)\b",
    r"\b[A-Z][A-Za-z0-9\s&]+?(?:Inc\.?|LLC|Ltd\.?|Limited|Corp\.?|Corporation|PLC|GmbH)\b",
]

# Priority scoring for obligation types
_PRIORITY_MAP = {
    "monetary": 3,
    "termination": 3,
    "compliance": 2,
    "reporting": 2,
    "delivery": 1,
}

# When one sentence yields several obligation types, the type with the lowest
# rank here is the one kept by deduplication.
_DEDUP_TYPE_RANK = {
    "termination": 1,
    "compliance": 2,
    "reporting": 3,
    "delivery": 4,
    "monetary": 5,
}

# Approximate day counts used to compare relative deadlines across units.
_DAYS_PER_UNIT = {"day": 1, "week": 7, "month": 30, "year": 365}

# A deadline at most this many days away earns a +1 priority boost.
_URGENT_WITHIN_DAYS = 7

# Sentences outside this length range are ignored (fragments / run-on blobs).
_MIN_SENTENCE_LENGTH = 30
_MAX_SENTENCE_LENGTH = 1000

# Party strings longer than this are rejected (header bleed-through).
_MAX_PARTY_LENGTH = 40

# Descriptions are truncated to this many characters, followed by "...".
_DESCRIPTION_LIMIT = 250

# Number of leading description characters that take part in the dedup key.
_DEDUP_PREFIX_LENGTH = 80

# FIX v4.2: Pre-compile obligation patterns at module level (was recompiling per sentence)
_OBLIGATION_PATTERNS_COMPILED = {
    otype: [re.compile(p, re.IGNORECASE) for p in patterns]
    for otype, patterns in OBLIGATION_PATTERNS.items()
}

# FIX v4.2: Pre-compile false positive patterns
_FALSE_POSITIVE_PATTERNS_COMPILED = [
    re.compile(p, re.IGNORECASE) for p in _FALSE_POSITIVE_PATTERNS
]

# FIX v4.2: Pre-compile time patterns
_TIME_PATTERNS_COMPILED = [
    (re.compile(p, re.IGNORECASE), ptype) for p, ptype in TIME_PATTERNS
]

# FIX v4.2: Pre-compile party patterns
_PARTY_PATTERNS_COMPILED = [re.compile(p) for p in PARTY_PATTERNS]

# Sentence-initial "<role> shall/must/will" patterns. The second tuple item is
# a fixed label to return; None means "return the captured role name".
_DIRECTION_PATTERNS_COMPILED = [
    (re.compile(r"^(?:The\s+)?(Provider|Company|Licensor|Landlord|Employer|Seller|Vendor)\s+(?:shall|must|will)", re.IGNORECASE), None),
    (re.compile(r"^(?:The\s+)?(Customer|Client|Licensee|Tenant|Employee|Buyer)\s+(?:shall|must|will)", re.IGNORECASE), None),
    (re.compile(r"^(?:Each|Both)\s+part(?:y|ies)\s+(?:shall|must|will)", re.IGNORECASE), "Both parties"),
    (re.compile(r"^(?:Neither|No)\s+party\s+(?:shall|may)", re.IGNORECASE), "Neither party"),
]

# Split after sentence-ending punctuation when the next token starts uppercase.
_SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')


def _is_false_positive(sentence):
    """Check if a sentence is a common false positive (definition/interpretation, not obligation)."""
    return any(fp.search(sentence) for fp in _FALSE_POSITIVE_PATTERNS_COMPILED)


def _match_obligation_types(sentence):
    """Return the obligation types whose patterns match, in OBLIGATION_PATTERNS order."""
    return [
        otype
        for otype, patterns in _OBLIGATION_PATTERNS_COMPILED.items()
        if any(pat.search(sentence) for pat in patterns)
    ]


def _extract_party(sentence):
    """Return the obligated party named in the sentence, or "Unknown"."""
    # First try structured direction detection ("The Provider shall ...").
    direction = _detect_obligation_direction(sentence)
    if direction:
        return direction
    # Fallback to pattern matching within the sentence only.
    for pattern in _PARTY_PATTERNS_COMPILED:
        match = pattern.search(sentence)
        if not match:
            continue
        candidate = match.group(0).strip()
        if len(candidate) <= _MAX_PARTY_LENGTH:
            return candidate
    return "Unknown"


def _extract_deadline(sentence):
    """Return ``(label, urgency_days)`` for the first time pattern that matches.

    ``urgency_days`` is an approximate number of days until the deadline,
    ``0`` for "immediately"/"promptly", and ``None`` when the sentence states
    no deadline at all.
    """
    for pattern, ptype in _TIME_PATTERNS_COMPILED:
        match = pattern.search(sentence)
        if not match:
            continue
        if ptype == "relative":
            num, unit = match.group(1), match.group(2)
            # Convert to days so "5 years" is not treated like "5 days".
            days = int(num) * _DAYS_PER_UNIT.get(unit.lower(), 1)
            return f"Within {num} {unit}(s)", days
        if ptype == "business_days":
            num = match.group(1)
            return f"Within {num} business day(s)", int(num)
        if ptype in ("absolute", "absolute_date"):
            # The date is not parsed, so its distance from today is unknown;
            # a fixed calendar date is always scored as urgent (1 day).
            return match.group(1), 1
        if ptype == "immediate":
            return "Immediately", 0
        break
    return "Not specified", None


def _deduplicate(obligations):
    """Collapse entries sharing a description prefix and party.

    Among duplicates, the entry whose type has the lowest ``_DEDUP_TYPE_RANK``
    is kept. The survivor stays at the position of the first entry seen.
    """
    kept = {}
    for ob in obligations:
        key = (ob["description"][:_DEDUP_PREFIX_LENGTH], ob["party"])
        rank = _DEDUP_TYPE_RANK.get(ob["type"], 99)
        current = kept.get(key)
        if current is None or rank < current[0]:
            # Re-assigning an existing key keeps its original dict position.
            kept[key] = (rank, ob)
    return [ob for _, ob in kept.values()]


def extract_obligations(text):
    """Extract obligations from contract text with false positive filtering.

    Args:
        text: Contract text. ``None`` or an empty string yields ``[]``.

    Returns:
        A list of dicts with keys ``type``, ``party``, ``description``,
        ``deadline``, ``full_text`` and ``priority``, deduplicated and then
        sorted by ``priority``, highest first.
    """
    if not text:
        return []

    candidates = []
    for raw_sentence in _SENTENCE_SPLIT_RE.split(text):
        sentence = raw_sentence.strip()
        if not _MIN_SENTENCE_LENGTH <= len(sentence) <= _MAX_SENTENCE_LENGTH:
            continue
        if _is_false_positive(sentence):
            continue

        found_types = _match_obligation_types(sentence)
        if not found_types:
            continue

        party = _extract_party(sentence)
        deadline, urgency_days = _extract_deadline(sentence)
        # None means "no deadline"; 0 means "immediately", which is urgent.
        is_urgent = urgency_days is not None and urgency_days <= _URGENT_WITHIN_DAYS

        description = sentence[:_DESCRIPTION_LIMIT]
        if len(sentence) > _DESCRIPTION_LIMIT:
            description += "..."

        for otype in found_types:
            priority = _PRIORITY_MAP.get(otype, 1)
            if is_urgent:
                priority += 1  # Urgent deadlines get higher priority
            candidates.append({
                "type": otype,
                "party": party,
                "description": description,
                "deadline": deadline,
                "full_text": sentence,
                "priority": priority,
            })

    # Deduplicate BEFORE sorting: dedup can swap in a lower-priority entry,
    # which would otherwise leave the returned list out of order.
    obligations = _deduplicate(candidates)
    obligations.sort(key=lambda ob: ob.get("priority", 0), reverse=True)
    return obligations


def _detect_obligation_direction(sentence):
    """Try to detect who bears the obligation from sentence structure.

    Returns the role as written in the sentence (e.g. "Provider"), the label
    "Both parties" / "Neither party", or ``None`` when no sentence-initial
    pattern matches.
    """
    for pattern, override in _DIRECTION_PATTERNS_COMPILED:
        match = pattern.search(sentence)
        if match:
            return override or match.group(1)
    return None


def render_obligations_html(obligations):
    """Render obligations as HTML cards for Gradio.

    Args:
        obligations: Iterable of dicts as produced by ``extract_obligations``.

    Returns:
        An HTML string: a per-type summary row followed by one section of
        cards per obligation type. All obligation-derived text is HTML-escaped.
    """
    # NOTE(review): the original inline markup/styling was not recoverable from
    # the source this was rebuilt from; the tags and styles below are a minimal
    # reconstruction — confirm against the intended Gradio design.
    if not obligations:
        return '<p style="color:#6b7280;">No obligations detected.</p>'

    # Group by type
    grouped = defaultdict(list)
    for ob in obligations:
        grouped[ob["type"]].append(ob)

    type_icons = {
        "monetary": "💰",
        "compliance": "⚖️",
        "reporting": "📊",
        "delivery": "📦",
        "termination": "🛑",
    }
    type_colors = {
        "monetary": "#22c55e",
        "compliance": "#f59e0b",
        "reporting": "#3b82f6",
        "delivery": "#8b5cf6",
        "termination": "#ef4444",
    }

    parts = ['<div class="obligations">']

    # Summary counts
    parts.append('<div style="display:flex;gap:12px;flex-wrap:wrap;margin-bottom:16px;">')
    for otype in sorted(grouped):
        color = type_colors.get(otype, "#6b7280")
        icon = type_icons.get(otype, "📋")
        parts.append(
            f'<div style="border:1px solid {color};border-radius:8px;padding:8px 12px;text-align:center;">'
            f'<div>{icon}</div>'
            f'<div style="font-weight:700;color:{color};">{len(grouped[otype])}</div>'
            f'<div>{_html_escape(str(otype))}</div>'
            f'</div>'
        )
    parts.append('</div>')

    # Individual cards
    for otype in sorted(grouped):
        color = type_colors.get(otype, "#6b7280")
        icon = type_icons.get(otype, "📋")
        title = _html_escape(str(otype).title())
        parts.append(
            f'<div style="margin-bottom:16px;">'
            f'<h4 style="color:{color};">{icon} {title} Obligations</h4>'
        )
        for ob in grouped[otype]:
            priority = ob.get("priority", 1)
            if priority >= 3:
                priority_badge = ' <span style="color:#ef4444;font-weight:700;">HIGH PRIORITY</span>'
            elif priority >= 2:
                priority_badge = ' <span style="color:#f59e0b;font-weight:700;">MEDIUM</span>'
            else:
                priority_badge = ""
            # Party, deadline and description come from contract text, so they
            # are escaped before being placed in markup.
            party = _html_escape(str(ob["party"]))
            deadline = _html_escape(str(ob["deadline"]))
            description = _html_escape(str(ob["description"]))
            parts.append(
                f'<div style="border-left:3px solid {color};padding:8px 12px;margin:8px 0;">'
                f'<div><strong>{party}</strong>{priority_badge} <span>{deadline}</span></div>'
                f'<div>{description}</div>'
                f'</div>'
            )
        parts.append('</div>')

    parts.append('</div>')
    return "".join(parts)