gaurv007 committed on
Commit
e696558
·
verified ·
1 Parent(s): bec2e2c

v3.0: Upload actual compare.py content

Browse files
Files changed (1) hide show
  1. compare.py +287 -1
compare.py CHANGED
@@ -1 +1,287 @@
1
- /app/clauseguard/compare.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ClauseGuard — Contract Comparison Engine v3.0
3
+ ═════════════════════════════════════════════
4
+ FIXED in v3.0:
5
+ • Semantic similarity using sentence embeddings (when available)
6
+ • Better clause type detection with legal taxonomy
7
+ • Improved diff visualization
8
+ • Fallback to SequenceMatcher when embeddings unavailable
9
+ """
10
+
11
+ import re
12
+ from difflib import SequenceMatcher
13
+ from collections import defaultdict
14
+
15
+ # Try to load sentence-transformers for semantic comparison
16
+ _HAS_EMBEDDINGS = False
17
+ _embedder = None
18
+
19
+ try:
20
+ from sentence_transformers import SentenceTransformer, util
21
+ _HAS_EMBEDDINGS = True
22
+ except ImportError:
23
+ pass
24
+
25
+
26
def _load_embedder():
    """Lazily create the shared sentence-embedding model.

    Does nothing when the optional ``sentence_transformers`` import failed
    or when ``_embedder`` already holds a model. If constructing the model
    raises, the error is printed and ``_embedder`` stays ``None`` so that
    callers keep using the string-matching fallback.
    """
    global _embedder
    if not _HAS_EMBEDDINGS or _embedder is not None:
        return
    try:
        _embedder = SentenceTransformer("all-MiniLM-L6-v2")
        print("[ClauseGuard] Sentence embeddings loaded for comparison")
    except Exception as exc:
        print(f"[ClauseGuard] Embeddings not available: {exc}")
34
+
35
+
36
+ def _normalize_clause(text):
37
+ """Normalize clause text for comparison."""
38
+ text = text.lower()
39
+ text = re.sub(r'[^a-z0-9\s]', ' ', text)
40
+ text = re.sub(r'\s+', ' ', text).strip()
41
+ return text
42
+
43
+
44
def _clause_similarity(a, b):
    """Score how alike two clause strings are.

    When the module-level ``_embedder`` is set, the first 512 characters of
    each clause are encoded and their cosine similarity is returned, clamped
    between 0 and 1. If no embedder is loaded, or the embedding path raises,
    the score is the ``SequenceMatcher`` ratio of the normalized clauses.
    """
    if _embedder is not None:
        try:
            vec_a = _embedder.encode(a[:512], convert_to_tensor=True)
            vec_b = _embedder.encode(b[:512], convert_to_tensor=True)
            cosine = util.cos_sim(vec_a, vec_b).item()
            return max(0, min(1, cosine))
        except Exception:
            # Best effort: any embedding failure drops to lexical matching.
            pass
    # Lexical fallback on normalized text.
    matcher = SequenceMatcher(None, _normalize_clause(a), _normalize_clause(b))
    return matcher.ratio()
56
+
57
+
58
+ def _extract_clause_type(clause_text):
59
+ """Clause type detection with legal taxonomy."""
60
+ text_lower = clause_text.lower()
61
+ type_keywords = {
62
+ "governing law": ["govern", "law of", "jurisdiction of", "applicable law"],
63
+ "termination": ["terminat", "cancel", "expir"],
64
+ "indemnification": ["indemnif", "hold harmless", "defend and indemnify"],
65
+ "confidentiality": ["confidential", "non-disclosure", "nda", "proprietary"],
66
+ "liability": ["liability", "liable", "damages", "limitation of"],
67
+ "payment": ["payment", "fee", "price", "compensat", "invoice", "remit"],
68
+ "intellectual property": ["intellectual property", "ip rights", "copyright", "patent", "trademark"],
69
+ "warranty": ["warrant", "guarantee", "representation"],
70
+ "force majeure": ["force majeure", "act of god", "beyond control"],
71
+ "arbitration": ["arbitrat", "mediation", "dispute resolution"],
72
+ "assignment": ["assign", "transfer of rights"],
73
+ "non-compete": ["non-compete", "not compete", "competition"],
74
+ "renewal": ["renew", "extend", "automatic renewal"],
75
+ "effective date": ["effective date", "commencement"],
76
+ "insurance": ["insurance", "coverage", "policy of insurance"],
77
+ "audit": ["audit", "inspection", "examination of records"],
78
+ "data protection": ["data protection", "privacy", "personal data", "gdpr", "ccpa"],
79
+ "notice": ["notice", "notification", "written notice"],
80
+ }
81
+ for ctype, keywords in type_keywords.items():
82
+ if any(kw in text_lower for kw in keywords):
83
+ return ctype
84
+ return "general"
85
+
86
+
87
def compare_contracts(text_a, text_b, clauses_a=None, clauses_b=None):
    """Compare two contracts clause by clause and summarise the differences.

    Args:
        text_a: Full text of contract A; must be non-empty.
        text_b: Full text of contract B; must be non-empty.
        clauses_a: Optional pre-split clauses of contract A. When ``None``
            the text is split with ``_split_clauses``.
        clauses_b: Optional pre-split clauses of contract B, handled the
            same way.

    Returns:
        ``{"error": "Both contracts required"}`` when either text is empty.
        Otherwise a dict holding the alignment score, the clause counts,
        the added / removed / modified clause lists (each capped at 50
        entries, clause text capped at 200 characters), the risk verdict,
        the comparison method used and per-type clause counts for both
        contracts.
    """
    if not (text_a and text_b):
        return {"error": "Both contracts required"}

    # Attempt to enable semantic comparison before scoring anything.
    _load_embedder()

    if clauses_a is None:
        clauses_a = _split_clauses(text_a)
    if clauses_b is None:
        clauses_b = _split_clauses(text_b)

    def _type_counts(clauses):
        # Number of clauses per detected type, in first-seen order.
        counts = {}
        for clause in clauses:
            label = _extract_clause_type(clause)
            counts[label] = counts.get(label, 0) + 1
        return counts

    type_counts_a = _type_counts(clauses_a)
    type_counts_b = _type_counts(clauses_b)

    SIMILARITY_THRESHOLD = 0.70
    MODIFIED_THRESHOLD = 0.40

    matched_a = set()
    matched_b = set()
    modified = []

    # Greedy pairing: each clause of A claims its most similar clause of B
    # that has not been claimed yet.
    for idx_a, clause_a in enumerate(clauses_a):
        top_score = 0
        top_idx = -1
        for idx_b, clause_b in enumerate(clauses_b):
            if idx_b in matched_b:
                continue
            score = _clause_similarity(clause_a, clause_b)
            if score > top_score:
                top_score = score
                top_idx = idx_b

        if top_score < MODIFIED_THRESHOLD:
            continue

        # A score at or above the threshold is positive, so top_idx is a
        # valid index into clauses_b from here on.
        matched_a.add(idx_a)
        matched_b.add(top_idx)

        if top_score >= SIMILARITY_THRESHOLD:
            if top_score >= 0.95:
                # Near-identical clauses are matched but not reported.
                continue
            kind = "modified"
        else:
            kind = "partial"

        modified.append({
            "type": kind,
            "similarity": round(top_score, 3),
            "clause_a": clause_a[:200],
            "clause_b": clauses_b[top_idx][:200],
            "clause_type": _extract_clause_type(clause_a),
        })

    removed = [clause for pos, clause in enumerate(clauses_a) if pos not in matched_a]
    added = [clause for pos, clause in enumerate(clauses_b) if pos not in matched_b]

    # Alignment: share of A's clauses that found a partner, relative to the
    # longer contract.
    total_pairs = max(len(clauses_a), len(clauses_b))
    alignment = len(matched_a) / total_pairs if total_pairs > 0 else 0.0

    # Risk delta: count how many distinct risk keywords each text contains.
    risk_keywords = ["unlimited", "unilateral", "waive", "arbitration", "indemnif",
                     "not liable", "no warranty", "sole discretion", "terminate",
                     "non-compete", "liquidated damages", "uncapped"]
    lowered_a = text_a.lower()
    lowered_b = text_b.lower()
    risk_a = sum(1 for kw in risk_keywords if kw in lowered_a)
    risk_b = sum(1 for kw in risk_keywords if kw in lowered_b)
    gap = risk_a - risk_b

    if gap > 2:
        risk_delta, risk_winner = "Contract A is significantly riskier", "B"
    elif gap < -2:
        risk_delta, risk_winner = "Contract B is significantly riskier", "A"
    elif gap > 0:
        risk_delta, risk_winner = "Contract A is slightly riskier", "B"
    elif gap < 0:
        risk_delta, risk_winner = "Contract B is slightly riskier", "A"
    else:
        risk_delta, risk_winner = "Similar risk profiles", "tie"

    if _embedder is not None:
        comparison_method = "semantic (sentence embeddings)"
    else:
        comparison_method = "lexical (string matching)"

    def _summaries(clauses):
        # Truncated text plus detected type for up to 50 clauses.
        return [{"text": c[:200], "type": _extract_clause_type(c)} for c in clauses[:50]]

    return {
        "alignment_score": round(alignment, 3),
        "contract_a_clauses": len(clauses_a),
        "contract_b_clauses": len(clauses_b),
        "added_clauses": _summaries(added),
        "removed_clauses": _summaries(removed),
        "modified_clauses": modified[:50],
        "risk_delta": risk_delta,
        "risk_winner": risk_winner,
        "comparison_method": comparison_method,
        "type_map_a": type_counts_a,
        "type_map_b": type_counts_b,
    }
199
+
200
+
201
+ def _split_clauses(text):
202
+ """Split text into clauses."""
203
+ text = re.sub(r'\n{3,}', '\n\n', text.strip())
204
+ # Try section-based splitting first
205
+ section_splits = re.split(
206
+ r'(?:\n\n)(?=\d+[.)]\s|\([a-z]\)\s|(?:Section|Article|Clause)\s+\d+)',
207
+ text
208
+ )
209
+ if len(section_splits) >= 3:
210
+ return [p.strip() for p in section_splits if len(p.strip()) > 30]
211
+ # Fallback to paragraph/sentence splitting
212
+ parts = re.split(
213
+ r'(?<=[.!?])\s+(?=[A-Z0-9(])|(?:\n\n)',
214
+ text
215
+ )
216
+ return [p.strip() for p in parts if len(p.strip()) > 30]
217
+
218
+
219
def render_comparison_html(result):
    """Render a ``compare_contracts`` result as an HTML fragment for Gradio.

    All text that originates from the contracts or the result dict (clause
    snippets, clause types, risk verdict, comparison method, error message)
    is HTML-escaped before insertion, so markup inside a contract cannot
    inject elements or scripts into the page.

    Args:
        result: Dict returned by ``compare_contracts``. Either it contains
            an ``"error"`` key, or it contains the clause counts, alignment
            score, risk fields and the added / removed / modified lists.

    Returns:
        An HTML string. At most 20 modified, 15 added and 15 removed
        clauses are shown, each snippet limited to 150 characters.

    Raises:
        KeyError: If a non-error result is missing an expected key.
    """
    # Imported locally so the function is self-contained; the local alias
    # also avoids clashing with any variable named ``html``.
    from html import escape

    def _snippet(text, limit=150):
        # Escaped preview; the ellipsis is added only when text was cut.
        text = str(text)
        shown = escape(text[:limit])
        return shown + "..." if len(text) > limit else shown

    if "error" in result:
        return f'<p style="color:#dc2626;">{escape(str(result["error"]))}</p>'

    method = escape(str(result.get("comparison_method", "unknown")))

    # Red palette when one contract is riskier, green when they tie.
    if result["risk_winner"] != "tie":
        risk_bg, risk_border, risk_fg = "#fef2f2", "#fecaca", "#dc2626"
    else:
        risk_bg, risk_border, risk_fg = "#f0fdf4", "#bbf7d0", "#16a34a"

    section_open = (
        '<div style="margin-bottom:16px;">'
        '<h3 style="font-size:14px;color:#374151;margin-bottom:8px;">'
    )

    parts = [f'''
<div style="font-family:system-ui,sans-serif;">
<div style="font-size:10px;color:#6b7280;text-align:center;margin-bottom:12px;">Comparison method: {method}</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;margin-bottom:16px;">
<div style="padding:12px;border-radius:8px;background:#eff6ff;border:1px solid #bfdbfe;text-align:center;">
<div style="font-size:24px;font-weight:700;color:#1d4ed8;">{result["contract_a_clauses"]}</div>
<div style="font-size:12px;color:#3b82f6;">Clauses in Contract A</div>
</div>
<div style="padding:12px;border-radius:8px;background:#fefce8;border:1px solid #fde68a;text-align:center;">
<div style="font-size:24px;font-weight:700;color:#a16207;">{result["contract_b_clauses"]}</div>
<div style="font-size:12px;color:#ca8a04;">Clauses in Contract B</div>
</div>
</div>

<div style="padding:12px;border-radius:8px;background:#f9fafb;border:1px solid #e5e7eb;margin-bottom:16px;text-align:center;">
<div style="font-size:28px;font-weight:700;color:#374151;">{result["alignment_score"]*100:.1f}%</div>
<div style="font-size:12px;color:#6b7280;">Alignment Score</div>
</div>

<div style="padding:12px;border-radius:8px;background:{risk_bg};border:1px solid {risk_border};margin-bottom:16px;text-align:center;">
<span style="font-size:14px;font-weight:600;color:{risk_fg};">⚖️ {escape(str(result["risk_delta"]))}</span>
</div>
''']

    # Modified clauses: side-by-side A / B previews.
    if result["modified_clauses"]:
        parts.append(section_open + '📝 Modified Clauses</h3>')
        for m in result["modified_clauses"][:20]:
            parts.append(f'''
<div style="border:1px solid #e5e7eb;border-radius:6px;padding:10px;margin-bottom:8px;">
<div style="font-size:11px;color:#6b7280;margin-bottom:4px;">{escape(str(m["clause_type"]).upper())} · Similarity: {m["similarity"]*100:.0f}%</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:8px;">
<div style="background:#fef2f2;padding:6px;border-radius:4px;font-size:12px;color:#991b1b;">{_snippet(m["clause_a"])}</div>
<div style="background:#f0fdf4;padding:6px;border-radius:4px;font-size:12px;color:#166534;">{_snippet(m["clause_b"])}</div>
</div>
</div>
''')
        parts.append('</div>')

    # Clauses present only in contract B.
    if result["added_clauses"]:
        parts.append(section_open + '➕ Added in Contract B</h3>')
        for a in result["added_clauses"][:15]:
            parts.append(
                '<div style="background:#f0fdf4;padding:8px;border-radius:4px;font-size:12px;'
                'color:#166534;margin-bottom:4px;border-left:3px solid #22c55e;">'
                f'<b>{escape(str(a["type"]).upper())}</b> · {_snippet(a["text"])}</div>'
            )
        parts.append('</div>')

    # Clauses present only in contract A.
    if result["removed_clauses"]:
        parts.append(section_open + '➖ Removed from Contract A</h3>')
        for r in result["removed_clauses"][:15]:
            parts.append(
                '<div style="background:#fef2f2;padding:8px;border-radius:4px;font-size:12px;'
                'color:#991b1b;margin-bottom:4px;border-left:3px solid #ef4444;">'
                f'<b>{escape(str(r["type"]).upper())}</b> · {_snippet(r["text"])}</div>'
            )
        parts.append('</div>')

    parts.append('</div>')
    return "".join(parts)