Spaces:
Sleeping
Sleeping
v3.1: Fix 9-10 — cross-domain detection, higher similarity threshold, contract type gate
Browse files- compare.py +33 -2
compare.py
CHANGED
|
@@ -98,6 +98,28 @@ def compare_contracts(text_a, text_b, clauses_a=None, clauses_b=None):
|
|
| 98 |
if clauses_b is None:
|
| 99 |
clauses_b = _split_clauses(text_b)
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
# Build clause type maps
|
| 102 |
type_map_a = defaultdict(list)
|
| 103 |
type_map_b = defaultdict(list)
|
|
@@ -111,8 +133,9 @@ def compare_contracts(text_a, text_b, clauses_a=None, clauses_b=None):
|
|
| 111 |
matched_b = set()
|
| 112 |
modified = []
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
|
|
|
| 116 |
|
| 117 |
for i, ca in enumerate(clauses_a):
|
| 118 |
best_sim = 0
|
|
@@ -181,12 +204,20 @@ def compare_contracts(text_a, text_b, clauses_a=None, clauses_b=None):
|
|
| 181 |
risk_delta = "Similar risk profiles"
|
| 182 |
risk_winner = "tie"
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
comparison_method = "semantic (sentence embeddings)" if _embedder is not None else "lexical (string matching)"
|
| 185 |
|
| 186 |
return {
|
| 187 |
"alignment_score": round(alignment, 3),
|
| 188 |
"contract_a_clauses": len(clauses_a),
|
| 189 |
"contract_b_clauses": len(clauses_b),
|
|
|
|
|
|
|
|
|
|
| 190 |
"added_clauses": [{"text": c[:200], "type": _extract_clause_type(c)} for c in added[:50]],
|
| 191 |
"removed_clauses": [{"text": c[:200], "type": _extract_clause_type(c)} for c in removed[:50]],
|
| 192 |
"modified_clauses": modified[:50],
|
|
|
|
| 98 |
if clauses_b is None:
|
| 99 |
clauses_b = _split_clauses(text_b)
|
| 100 |
|
| 101 |
+
# Fix 9: Detect contract types and flag cross-domain comparisons
|
| 102 |
+
_CONTRACT_TYPE_KEYWORDS = {
|
| 103 |
+
"employment": ["employee", "employer", "salary", "compensation", "benefits", "vacation", "severance", "at-will"],
|
| 104 |
+
"lease": ["landlord", "tenant", "rent", "premises", "lease", "occupancy", "security deposit", "eviction"],
|
| 105 |
+
"service": ["service provider", "customer", "SLA", "deliverables", "statement of work", "SOW"],
|
| 106 |
+
"nda": ["confidential", "non-disclosure", "disclosing party", "receiving party"],
|
| 107 |
+
"saas": ["subscription", "SaaS", "cloud", "uptime", "API", "data processing"],
|
| 108 |
+
"purchase": ["buyer", "seller", "purchase order", "goods", "shipment", "delivery"],
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
def _detect_contract_type(text):
|
| 112 |
+
text_lower = text.lower()
|
| 113 |
+
scores = {}
|
| 114 |
+
for ctype, keywords in _CONTRACT_TYPE_KEYWORDS.items():
|
| 115 |
+
scores[ctype] = sum(1 for kw in keywords if kw.lower() in text_lower)
|
| 116 |
+
best = max(scores, key=scores.get)
|
| 117 |
+
return best if scores[best] >= 2 else "general"
|
| 118 |
+
|
| 119 |
+
type_a = _detect_contract_type(text_a)
|
| 120 |
+
type_b = _detect_contract_type(text_b)
|
| 121 |
+
is_cross_domain = type_a != type_b and type_a != "general" and type_b != "general"
|
| 122 |
+
|
| 123 |
# Build clause type maps
|
| 124 |
type_map_a = defaultdict(list)
|
| 125 |
type_map_b = defaultdict(list)
|
|
|
|
| 133 |
matched_b = set()
|
| 134 |
modified = []
|
| 135 |
|
| 136 |
+
# Fix 10: Raise thresholds to reject false "modified" matches
|
| 137 |
+
SIMILARITY_THRESHOLD = 0.75 # was 0.70 — too many false matches
|
| 138 |
+
MODIFIED_THRESHOLD = 0.55 # was 0.40 — "Good Reason" ≠ "Force Majeure"
|
| 139 |
|
| 140 |
for i, ca in enumerate(clauses_a):
|
| 141 |
best_sim = 0
|
|
|
|
| 204 |
risk_delta = "Similar risk profiles"
|
| 205 |
risk_winner = "tie"
|
| 206 |
|
| 207 |
+
# Fix 9: Cross-domain warning
|
| 208 |
+
if is_cross_domain:
|
| 209 |
+
risk_delta = f"Cross-domain comparison ({type_a} vs {type_b}) — risk delta not meaningful across different contract types"
|
| 210 |
+
risk_winner = "cross-domain"
|
| 211 |
+
|
| 212 |
comparison_method = "semantic (sentence embeddings)" if _embedder is not None else "lexical (string matching)"
|
| 213 |
|
| 214 |
return {
|
| 215 |
"alignment_score": round(alignment, 3),
|
| 216 |
"contract_a_clauses": len(clauses_a),
|
| 217 |
"contract_b_clauses": len(clauses_b),
|
| 218 |
+
"contract_a_type": type_a,
|
| 219 |
+
"contract_b_type": type_b,
|
| 220 |
+
"is_cross_domain": is_cross_domain,
|
| 221 |
"added_clauses": [{"text": c[:200], "type": _extract_clause_type(c)} for c in added[:50]],
|
| 222 |
"removed_clauses": [{"text": c[:200], "type": _extract_clause_type(c)} for c in removed[:50]],
|
| 223 |
"modified_clauses": modified[:50],
|