gaurv007 committed on
Commit
c6e0514
·
verified ·
1 Parent(s): 3116f23

v3.1: Fix 9-10 — cross-domain detection, higher similarity threshold, contract type gate

Browse files
Files changed (1) hide show
  1. compare.py +33 -2
compare.py CHANGED
@@ -98,6 +98,28 @@ def compare_contracts(text_a, text_b, clauses_a=None, clauses_b=None):
98
  if clauses_b is None:
99
  clauses_b = _split_clauses(text_b)
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  # Build clause type maps
102
  type_map_a = defaultdict(list)
103
  type_map_b = defaultdict(list)
@@ -111,8 +133,9 @@ def compare_contracts(text_a, text_b, clauses_a=None, clauses_b=None):
111
  matched_b = set()
112
  modified = []
113
 
114
- SIMILARITY_THRESHOLD = 0.70
115
- MODIFIED_THRESHOLD = 0.40
 
116
 
117
  for i, ca in enumerate(clauses_a):
118
  best_sim = 0
@@ -181,12 +204,20 @@ def compare_contracts(text_a, text_b, clauses_a=None, clauses_b=None):
181
  risk_delta = "Similar risk profiles"
182
  risk_winner = "tie"
183
 
 
 
 
 
 
184
  comparison_method = "semantic (sentence embeddings)" if _embedder is not None else "lexical (string matching)"
185
 
186
  return {
187
  "alignment_score": round(alignment, 3),
188
  "contract_a_clauses": len(clauses_a),
189
  "contract_b_clauses": len(clauses_b),
 
 
 
190
  "added_clauses": [{"text": c[:200], "type": _extract_clause_type(c)} for c in added[:50]],
191
  "removed_clauses": [{"text": c[:200], "type": _extract_clause_type(c)} for c in removed[:50]],
192
  "modified_clauses": modified[:50],
 
98
  if clauses_b is None:
99
  clauses_b = _split_clauses(text_b)
100
 
101
+ # Fix 9: Detect contract types and flag cross-domain comparisons
102
+ _CONTRACT_TYPE_KEYWORDS = {
103
+ "employment": ["employee", "employer", "salary", "compensation", "benefits", "vacation", "severance", "at-will"],
104
+ "lease": ["landlord", "tenant", "rent", "premises", "lease", "occupancy", "security deposit", "eviction"],
105
+ "service": ["service provider", "customer", "SLA", "deliverables", "statement of work", "SOW"],
106
+ "nda": ["confidential", "non-disclosure", "disclosing party", "receiving party"],
107
+ "saas": ["subscription", "SaaS", "cloud", "uptime", "API", "data processing"],
108
+ "purchase": ["buyer", "seller", "purchase order", "goods", "shipment", "delivery"],
109
+ }
110
+
111
+ def _detect_contract_type(text):
112
+ text_lower = text.lower()
113
+ scores = {}
114
+ for ctype, keywords in _CONTRACT_TYPE_KEYWORDS.items():
115
+ scores[ctype] = sum(1 for kw in keywords if kw.lower() in text_lower)
116
+ best = max(scores, key=scores.get)
117
+ return best if scores[best] >= 2 else "general"
118
+
119
+ type_a = _detect_contract_type(text_a)
120
+ type_b = _detect_contract_type(text_b)
121
+ is_cross_domain = type_a != type_b and type_a != "general" and type_b != "general"
122
+
123
  # Build clause type maps
124
  type_map_a = defaultdict(list)
125
  type_map_b = defaultdict(list)
 
133
  matched_b = set()
134
  modified = []
135
 
136
+ # Fix 10: Raise thresholds to reject false "modified" matches
137
+ SIMILARITY_THRESHOLD = 0.75 # was 0.70 — too many false matches
138
+ MODIFIED_THRESHOLD = 0.55 # was 0.40 — "Good Reason" ≠ "Force Majeure"
139
 
140
  for i, ca in enumerate(clauses_a):
141
  best_sim = 0
 
204
  risk_delta = "Similar risk profiles"
205
  risk_winner = "tie"
206
 
207
+ # Fix 9: Cross-domain warning
208
+ if is_cross_domain:
209
+ risk_delta = f"Cross-domain comparison ({type_a} vs {type_b}) — risk delta not meaningful across different contract types"
210
+ risk_winner = "cross-domain"
211
+
212
  comparison_method = "semantic (sentence embeddings)" if _embedder is not None else "lexical (string matching)"
213
 
214
  return {
215
  "alignment_score": round(alignment, 3),
216
  "contract_a_clauses": len(clauses_a),
217
  "contract_b_clauses": len(clauses_b),
218
+ "contract_a_type": type_a,
219
+ "contract_b_type": type_b,
220
+ "is_cross_domain": is_cross_domain,
221
  "added_clauses": [{"text": c[:200], "type": _extract_clause_type(c)} for c in added[:50]],
222
  "removed_clauses": [{"text": c[:200], "type": _extract_clause_type(c)} for c in removed[:50]],
223
  "modified_clauses": modified[:50],