ClauseGuard / obligations.py
gaurv007's picture
fix(v4.3): obligations.py — bug report fixes (10 issues)
5f38aa2 verified
"""
ClauseGuard — Obligation Tracker v3.0
═════════════════════════════════════
FIXED in v3.0:
• Reduced false positives (filter out generic service descriptions)
• Better party extraction with role detection
• Obligation priority scoring
• Context-aware obligation type detection
"""
import re
from collections import defaultdict
from datetime import datetime, timedelta
# Obligation keywords by category — more specific patterns to reduce false positives.
# Maps an obligation category name to a list of regex source strings. These are
# compiled case-insensitively into _OBLIGATION_PATTERNS_COMPILED further down, and
# extract_obligations tags a sentence with a category when any one of that
# category's patterns matches it.
OBLIGATION_PATTERNS = {
# Payment, remittance, fee or penalty wording tied to a dollar amount, plus
# duties to pay/reimburse outstanding, overdue or unpaid sums.
"monetary": [
r"(?:shall|must|will|agrees? to)\s+pay\s+(?:a\s+)?(?:(?:monthly|annual|quarterly)\s+)?(?:fee|amount|sum|payment)?\s*(?:of\s+)?(?:\$[\d,]+(?:\.\d{2})?)",
r"(?:fee|payment|compensation|reimburs(?:e|ement))\s+(?:of|in the amount of)\s+\$[\d,]+",
r"(?:shall|must|will)\s+remit\s+\$[\d,]+",
r"(?:liquidated damages|penalty)\s+(?:of|in the amount of)\s+\$[\d,]+",
r"(?:shall|must)\s+(?:pay|reimburse)\s+(?:all|any)\s+(?:outstanding|overdue|unpaid)",
],
# Duties to comply with laws/standards, hold approvals/permits/licenses,
# maintain insurance, or ensure compliance.
"compliance": [
r"(?:shall|must|will)\s+comply\s+with\s+(?:all\s+)?(?:applicable\s+)?(?:laws|regulations|standards|requirements)",
r"(?:shall|must|will)\s+(?:adhere|conform)\s+to\s+(?:the|all|applicable)",
r"(?:shall|must|will)\s+(?:obtain|maintain|procure)\s+(?:all\s+)?(?:necessary|required|applicable)\s+(?:approvals?|permits?|licenses?|certifications?)",
r"(?:shall|must|will)\s+maintain\s+(?:insurance|coverage|bond|policy)",
r"(?:shall|must|will)\s+ensure\s+(?:compliance|conformance|adherence)",
],
# Duties to report, disclose, notify, or hand over reports/statements/notices.
"reporting": [
r"(?:shall|must|will)\s+(?:report|disclose)\s+(?:to|any)\s+(?:the|supervisory|regulatory)",
r"(?:shall|must|will)\s+provide\s+(?:regular|monthly|quarterly|annual|periodic)\s+(?:reports?|updates?|statements?)",
r"(?:shall|must|will)\s+(?:notify|inform)\s+(?:the other party|promptly|immediately|within)",
r"(?:shall|must|will)\s+deliver\s+(?:a|an|the)\s+(?:report|statement|notice|certificate)",
r"(?:shall|must|will)\s+provide\s+(?:SOC|audit|compliance)\s+(?:\d+\s+)?(?:Type\s+)?(?:reports?|certificates?)",
],
# Duties to deliver, furnish, supply, submit, produce or complete work.
# The last pattern has no modal verb: it matches scheduling nouns such as
# "delivery date" or "performance milestone" on their own.
"delivery": [
r"(?:shall|must|will)\s+deliver\s+(?:the|all|any)\s+(?:products?|goods?|materials?|deliverables?|services?)",
r"(?:shall|must|will)\s+(?:furnish|supply)\s+(?:the|all|any)",
r"(?:shall|must|will)\s+(?:submit|produce|complete)\s+(?:the|all|any)\s+(?:work|deliverables?|results?)",
r"(?:delivery|performance)\s+(?:date|schedule|deadline|timeline|milestone)",
],
# Duties to return, destroy or stop using things, and duties that follow
# termination. The last pattern matches the phrases "surviving obligation(s)" /
# "post-termination obligation(s)" without requiring a modal verb.
"termination": [
r"(?:shall|must|will)\s+(?:return|surrender)\s+(?:all|any)\s+(?:materials?|property|documents?|data|information|equipment)",
r"(?:shall|must|will)\s+(?:destroy|delete|erase)\s+(?:all|any)\s+(?:copies|data|information|records?|materials?)",
r"(?:shall|must|will)\s+(?:cease|discontinue)\s+(?:all|any)\s+(?:use|access|activities)",
r"(?:upon|after|following)\s+termination.*(?:shall|must|will)\s+(?:pay|return|destroy|cease)",
r"(?:surviving|post-termination)\s+obligations?",
],
}
# More restrictive — patterns that DON'T indicate obligations (false positive filters).
# Regex source strings for definitional/interpretive wording ("shall be governed",
# "shall mean", "shall have the right", ...). They are compiled case-insensitively
# into _FALSE_POSITIVE_PATTERNS_COMPILED, and extract_obligations skips any sentence
# that matches one of them before the obligation patterns are tried.
_FALSE_POSITIVE_PATTERNS = [
# Only this first pattern is anchored to the start of the sentence.
r"^(?:the|this)\s+(?:agreement|contract|document)\s+(?:shall|will)\s+(?:be|become|remain)",
r"(?:shall|will)\s+(?:be\s+)?(?:governed|construed|interpreted)",
r"(?:shall|will)\s+(?:constitute|represent|mean|include)",
r"(?:shall|will)\s+(?:not\s+)?(?:be\s+)?(?:deemed|considered|construed)",
r"(?:shall|will)\s+(?:have|possess)\s+(?:the\s+)?(?:right|authority|power)",
r"(?:shall|will)\s+(?:survive|remain\s+in\s+(?:effect|force))",
]
# Timeframe extraction
# Each entry is a (regex source, kind) pair, compiled case-insensitively into
# _TIME_PATTERNS_COMPILED. extract_obligations tries them in this list order and
# uses the kind to decide how to format the deadline:
#   "relative"       - group 1 = count, group 2 = unit (day/week/month/year)
#   "business_days"  - group 1 = count of business days
#   "absolute"       - group 1 = a written date such as "January 5, 2025"
#   "absolute_date"  - group 1 = a numeric date such as "1/5/2025"
#   "immediate"      - no capture group ("promptly" / "immediately")
TIME_PATTERNS = [
(r"within\s+(\d+)\s+(day|week|month|year)s?", "relative"),
(r"no\s+later\s+than\s+(\d+)\s+(day|week|month|year)s?", "relative"),
(r"within\s+(\d+)\s+business\s+days?", "business_days"),
(r"by\s+([A-Z][a-z]+\s+\d{1,2},?\s+\d{4})", "absolute"),
(r"on\s+or\s+before\s+([A-Z][a-z]+\s+\d{1,2},?\s+\d{4})", "absolute"),
(r"(\d{1,2}/\d{1,2}/\d{2,4})", "absolute_date"),
(r"(?:promptly|immediately)(?:\s+(?:upon|after|following))?", "immediate"),
]
# Regex source strings used to find a party name inside a sentence when the
# sentence-opening role detection finds nothing. Unlike the other tables these
# are compiled WITHOUT re.IGNORECASE (see _PARTY_PATTERNS_COMPILED), so matching
# is case-sensitive.
PARTY_PATTERNS = [
# A fixed list of capitalised contract role names.
r"\b(?:Party A|Party B|Disclosing Party|Receiving Party|Licensor|Licensee|Buyer|Seller|Tenant|Landlord|Employer|Employee|Company|Customer|Vendor|Client|Provider|Contractor)\b",
# A capitalised name that ends in a corporate suffix (Inc, LLC, Ltd, ...).
r"\b[A-Z][A-Za-z0-9\s&]+?(?:Inc\.?|LLC|Ltd\.?|Limited|Corp\.?|Corporation|PLC|GmbH)\b",
]
# Priority scoring for obligation types
_PRIORITY_MAP = {
"monetary": 3,
"termination": 3,
"compliance": 2,
"reporting": 2,
"delivery": 1,
}
# FIX v4.2: every regex table is compiled once, at import time, instead of being
# recompiled for each sentence.

# Category name -> list of compiled, case-insensitive obligation patterns.
_OBLIGATION_PATTERNS_COMPILED = dict(
    (category, [re.compile(source, re.IGNORECASE) for source in sources])
    for category, sources in OBLIGATION_PATTERNS.items()
)

# Compiled, case-insensitive false-positive filters.
_FALSE_POSITIVE_PATTERNS_COMPILED = [
    re.compile(source, re.IGNORECASE) for source in _FALSE_POSITIVE_PATTERNS
]

# (compiled case-insensitive pattern, kind) pairs, in TIME_PATTERNS order.
_TIME_PATTERNS_COMPILED = [
    (re.compile(source, re.IGNORECASE), kind) for source, kind in TIME_PATTERNS
]

# Party patterns are compiled with default flags, i.e. case-sensitively.
_PARTY_PATTERNS_COMPILED = list(map(re.compile, PARTY_PATTERNS))
def _is_false_positive(sentence):
    """Tell whether *sentence* is definitional/interpretive wording, not a duty.

    Returns True as soon as any pattern in ``_FALSE_POSITIVE_PATTERNS_COMPILED``
    is found anywhere in the sentence, and False when none of them match.
    """
    return any(
        pattern.search(sentence) for pattern in _FALSE_POSITIVE_PATTERNS_COMPILED
    )
# Approximate calendar days per unit, so relative deadlines written in different
# units ("5 days", "2 weeks", "1 year") are compared on a single scale.
_DAYS_PER_UNIT = {"day": 1, "week": 7, "month": 30, "year": 365}

# A deadline of at most this many days (and more than zero) earns +1 priority.
_URGENT_WITHIN_DAYS = 7

# When the same sentence/party yields several categories, the category with the
# LOWEST rank here is kept: termination > compliance > reporting > delivery > monetary.
_DEDUP_TYPE_RANK = {"termination": 1, "compliance": 2, "reporting": 3, "delivery": 4, "monetary": 5}


def _find_party(sentence):
    """Return the party bearing the obligation in *sentence*, or "Unknown"."""
    # Prefer the structured "<Role> shall ..." detection at the sentence start.
    direction = _detect_obligation_direction(sentence)
    if direction:
        return direction
    # Fall back to the first party-like name found anywhere in the sentence.
    for pattern in _PARTY_PATTERNS_COMPILED:
        match = pattern.search(sentence)
        if match:
            candidate = match.group(0).strip()
            # Fix 8: reject party strings >40 chars (header bleed-through).
            if len(candidate) <= 40:
                return candidate
    return "Unknown"


def _find_deadline(sentence):
    """Return ``(label, urgency_days)`` for the first time pattern that matches.

    ``urgency_days`` is 0 when no deadline is found and for "immediately" /
    "promptly" wording, 1 for explicit calendar dates, and an approximate day
    count for relative and business-day deadlines.
    """
    for pattern, kind in _TIME_PATTERNS_COMPILED:
        match = pattern.search(sentence)
        if not match:
            continue
        if kind == "relative":
            count, unit = match.group(1), match.group(2)
            # Scale by the unit so "within 5 years" is not scored like "5 days".
            days = int(count) * _DAYS_PER_UNIT.get(unit.lower(), 1)
            return f"Within {count} {unit}(s)", days
        if kind == "business_days":
            count = match.group(1)
            return f"Within {count} business day(s)", int(count)
        if kind in ("absolute", "absolute_date"):
            return match.group(1), 1
        if kind == "immediate":
            # NOTE(review): urgency 0 means immediate deadlines get no priority
            # boost, same as "Not specified". Kept as-is; confirm this is intended.
            return "Immediately", 0
    return "Not specified", 0


def _dedupe_obligations(obligations):
    """Collapse entries sharing the same description prefix and party.

    The key is the first 80 characters of the description plus the party. For
    each key the entry whose type ranks best in ``_DEDUP_TYPE_RANK`` is kept, at
    the position where that key was first seen.
    """
    kept = {}
    for ob in obligations:
        key = (ob["description"][:80], ob["party"])
        current = kept.get(key)
        if current is None:
            kept[key] = ob
        elif _DEDUP_TYPE_RANK.get(ob["type"], 99) < _DEDUP_TYPE_RANK.get(current["type"], 99):
            # Re-assigning an existing key keeps its original insertion position.
            kept[key] = ob
    return list(kept.values())


def extract_obligations(text):
    """Extract obligations from contract text with false positive filtering.

    The text is split into sentences; sentences shorter than 30 or longer than
    1000 characters, and sentences matching a false-positive filter, are
    skipped. Each remaining sentence is checked against every obligation
    category, and one entry per sentence/party is kept after de-duplication.

    Args:
        text: Contract text. ``None`` or an empty string yields an empty list.

    Returns:
        A list of dicts with the keys ``type``, ``party``, ``description``
        (sentence truncated to 250 characters plus "..."), ``deadline``,
        ``full_text`` and ``priority``, sorted by ``priority`` (highest first)
        and, within equal priority, by order of appearance in the text.
    """
    if not text:
        return []

    obligations = []
    # Split after ., ! or ? when followed by whitespace and a capital letter.
    for raw_sentence in re.split(r'(?<=[.!?])\s+(?=[A-Z])', text):
        sentence = raw_sentence.strip()
        if len(sentence) < 30 or len(sentence) > 1000:
            continue
        if _is_false_positive(sentence):
            continue

        # Categories are collected in table order, so results are deterministic.
        found_types = [
            otype
            for otype, patterns in _OBLIGATION_PATTERNS_COMPILED.items()
            if any(pattern.search(sentence) for pattern in patterns)
        ]
        if not found_types:
            continue

        party = _find_party(sentence)
        deadline, urgency_days = _find_deadline(sentence)
        # Urgent deadlines get higher priority.
        boost = 1 if 0 < urgency_days <= _URGENT_WITHIN_DAYS else 0
        description = sentence[:250] + ("..." if len(sentence) > 250 else "")

        for otype in found_types:
            obligations.append({
                "type": otype,
                "party": party,
                "description": description,
                "deadline": deadline,
                "full_text": sentence,
                "priority": _PRIORITY_MAP.get(otype, 1) + boost,
            })

    # FIX v4.3: de-duplicate sentences that produced several types. This runs
    # BEFORE sorting so the surviving entry is ordered by its own priority.
    obligations = _dedupe_obligations(obligations)
    # Sort by priority (highest first); the sort is stable.
    obligations.sort(key=lambda ob: ob.get("priority", 0), reverse=True)
    return obligations
def _detect_obligation_direction(sentence):
"""Try to detect who bears the obligation from sentence structure."""
patterns = [
(r"^(?:The\s+)?(Provider|Company|Licensor|Landlord|Employer|Seller|Vendor)\s+(?:shall|must|will)", None),
(r"^(?:The\s+)?(Customer|Client|Licensee|Tenant|Employee|Buyer)\s+(?:shall|must|will)", None),
(r"^(?:Each|Both)\s+part(?:y|ies)\s+(?:shall|must|will)", "Both parties"),
(r"^(?:Neither|No)\s+party\s+(?:shall|may)", "Neither party"),
]
for pat, override in patterns:
m = re.search(pat, sentence, re.IGNORECASE)
if m:
return override or m.group(1)
return None
def render_obligations_html(obligations):
    """Render obligations as HTML cards for Gradio.

    Args:
        obligations: List of obligation dicts. Each needs the keys ``type``,
            ``party``, ``deadline`` and ``description``; ``priority`` is
            optional and defaults to 1.

    Returns:
        An HTML string. For an empty/None input it is a single "No obligations
        detected." placeholder. Otherwise it holds a grid of per-type counts
        followed by one section per type (types in alphabetical order), each
        listing its obligation cards. Cards with priority >= 3 carry a
        "HIGH PRIORITY" badge and priority 2 a "MEDIUM" badge.
    """
    # Local import: party, deadline and description come from contract text and
    # must be HTML-escaped before being placed into markup.
    from html import escape

    if not obligations:
        return '<div style="padding:16px;color:#6b7280;text-align:center;">No obligations detected.</div>'

    # Group by type
    grouped = defaultdict(list)
    for ob in obligations:
        grouped[ob["type"]].append(ob)

    # Icons are written as escape sequences so the source stays ASCII-safe.
    type_icons = {
        "monetary": "\U0001F4B0",     # money bag
        "compliance": "\u2696\ufe0f",  # balance scale
        "reporting": "\U0001F4CA",    # bar chart
        "delivery": "\U0001F4E6",     # package
        "termination": "\U0001F6D1",  # stop sign
    }
    default_icon = "\U0001F4CB"       # clipboard
    type_colors = {
        "monetary": "#22c55e",
        "compliance": "#f59e0b",
        "reporting": "#3b82f6",
        "delivery": "#8b5cf6",
        "termination": "#ef4444",
    }
    default_color = "#6b7280"

    ordered_types = sorted(grouped)
    parts = ['<div style="font-family:system-ui,sans-serif;">']

    # Summary counts
    parts.append('<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(120px,1fr));gap:8px;margin-bottom:16px;">')
    for otype in ordered_types:
        color = type_colors.get(otype, default_color)
        icon = type_icons.get(otype, default_icon)
        parts.append(
            f'\n<div style="text-align:center;padding:10px;border-radius:8px;background:{color}15;border:1px solid {color}30;">\n'
            f'<div style="font-size:20px;">{icon}</div>\n'
            f'<div style="font-size:20px;font-weight:700;color:{color};">{len(grouped[otype])}</div>\n'
            f'<div style="font-size:11px;color:{color};text-transform:capitalize;">{escape(otype)}</div>\n'
            '</div>\n'
        )
    parts.append('</div>')

    # Individual cards
    for otype in ordered_types:
        color = type_colors.get(otype, default_color)
        icon = type_icons.get(otype, default_icon)
        parts.append(
            f'<h3 style="font-size:14px;color:#374151;margin:16px 0 8px 0;border-bottom:2px solid {color}30;padding-bottom:4px;">{icon} {escape(otype.title())} Obligations</h3>'
        )
        for ob in grouped[otype]:
            priority = ob.get("priority", 1)
            if priority >= 3:
                priority_badge = '<span style="font-size:9px;background:#fef2f2;color:#dc2626;padding:1px 4px;border-radius:3px;margin-left:4px;">HIGH PRIORITY</span>'
            elif priority >= 2:
                priority_badge = '<span style="font-size:9px;background:#fefce8;color:#ca8a04;padding:1px 4px;border-radius:3px;margin-left:4px;">MEDIUM</span>'
            else:
                priority_badge = ""
            # str() guards against non-string values; escape() neutralises markup.
            party = escape(str(ob["party"]))
            deadline = escape(str(ob["deadline"]))
            description = escape(str(ob["description"]))
            parts.append(
                f'\n<div style="border:1px solid #e5e7eb;border-left:4px solid {color};border-radius:6px;padding:10px;margin-bottom:8px;background:#fafafa;">\n'
                '<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:4px;">\n'
                f'<span style="font-size:12px;font-weight:600;color:{color};">{party}{priority_badge}</span>\n'
                f'<span style="font-size:11px;color:#6b7280;background:#f3f4f6;padding:2px 8px;border-radius:4px;">{deadline}</span>\n'
                '</div>\n'
                f'<p style="font-size:12px;color:#4b5563;margin:0;line-height:1.5;">{description}</p>\n'
                '</div>\n'
            )
    parts.append('</div>')
    return "".join(parts)