gaurv007 committed on
Commit
48d87da
·
verified ·
1 Parent(s): 80d5e3c

v3.0: Fix comparison engine - add semantic similarity via sentence-transformers

Browse files
Files changed (1) hide show
  1. compare.py +1 -229
compare.py CHANGED
@@ -1,229 +1 @@
1
- """
2
- ClauseGuard — Contract Comparison Engine
3
- ═══════════════════════════════════════
4
- Compare two contracts side-by-side:
5
- • Clause-level diff (added/removed/modified clauses)
6
- • Risk delta (which contract is more favorable)
7
- • Alignment score (similarity between documents)
8
- """
9
-
10
- import re
11
- from difflib import SequenceMatcher
12
- from collections import defaultdict
13
-
14
- def _normalize_clause(text):
15
- """Normalize clause text for comparison."""
16
- text = text.lower()
17
- text = re.sub(r'[^a-z0-9\s]', ' ', text)
18
- text = re.sub(r'\s+', ' ', text).strip()
19
- return text
20
-
21
- def _clause_similarity(a, b):
22
- """Compute similarity between two clauses."""
23
- return SequenceMatcher(None, _normalize_clause(a), _normalize_clause(b)).ratio()
24
-
25
- def _extract_clause_type(clause_text):
26
- """Heuristic clause type detection for alignment."""
27
- text_lower = clause_text.lower()
28
- type_keywords = {
29
- "governing law": ["govern", "law", "jurisdiction"],
30
- "termination": ["terminat", "cancel", "end"],
31
- "indemnification": ["indemnif", "hold harmless"],
32
- "confidentiality": ["confidential", "non-disclosure"],
33
- "liability": ["liability", "liable", "damages"],
34
- "payment": ["payment", "fee", "price", "compensat"],
35
- "intellectual property": ["intellectual", "ip", "copyright", "patent"],
36
- "warranty": ["warrant", "guarantee"],
37
- "force majeure": ["force majeure", "act of god"],
38
- "arbitration": ["arbitrat", "mediation"],
39
- "assignment": ["assign", "transfer"],
40
- "non-compete": ["compete", "competition"],
41
- "renewal": ["renew", "extend"],
42
- "effective date": ["effective date", "commencement"],
43
- }
44
- for ctype, keywords in type_keywords.items():
45
- if any(kw in text_lower for kw in keywords):
46
- return ctype
47
- return "general"
48
-
49
- def compare_contracts(text_a, text_b, clauses_a=None, clauses_b=None):
50
- """
51
- Compare two contract texts and return structural diff.
52
-
53
- Returns dict with:
54
- - alignment_score: float 0-1
55
- - added_clauses: clauses in B not in A
56
- - removed_clauses: clauses in A not in B
57
- - modified_clauses: clauses that are similar but different
58
- - risk_delta: which contract is riskier
59
- - clause_type_map: clauses grouped by type for both docs
60
- """
61
- if not text_a or not text_b:
62
- return {"error": "Both contracts required"}
63
-
64
- # Split into clauses if not provided
65
- if clauses_a is None:
66
- clauses_a = _split_clauses(text_a)
67
- if clauses_b is None:
68
- clauses_b = _split_clauses(text_b)
69
-
70
- # Build clause type maps
71
- type_map_a = defaultdict(list)
72
- type_map_b = defaultdict(list)
73
- for c in clauses_a:
74
- type_map_a[_extract_clause_type(c)].append(c)
75
- for c in clauses_b:
76
- type_map_b[_extract_clause_type(c)].append(c)
77
-
78
- # Find matches
79
- matched_a = set()
80
- matched_b = set()
81
- modified = []
82
-
83
- SIMILARITY_THRESHOLD = 0.75
84
- MODIFIED_THRESHOLD = 0.45
85
-
86
- for i, ca in enumerate(clauses_a):
87
- best_sim = 0
88
- best_j = -1
89
- for j, cb in enumerate(clauses_b):
90
- if j in matched_b:
91
- continue
92
- sim = _clause_similarity(ca, cb)
93
- if sim > best_sim:
94
- best_sim = sim
95
- best_j = j
96
-
97
- if best_sim >= SIMILARITY_THRESHOLD:
98
- matched_a.add(i)
99
- matched_b.add(best_j)
100
- if best_sim < 0.95:
101
- modified.append({
102
- "type": "modified",
103
- "similarity": round(best_sim, 3),
104
- "clause_a": ca[:200],
105
- "clause_b": clauses_b[best_j][:200],
106
- "clause_type": _extract_clause_type(ca),
107
- })
108
- elif best_sim >= MODIFIED_THRESHOLD:
109
- modified.append({
110
- "type": "partial",
111
- "similarity": round(best_sim, 3),
112
- "clause_a": ca[:200],
113
- "clause_b": clauses_b[best_j][:200] if best_j >= 0 else "",
114
- "clause_type": _extract_clause_type(ca),
115
- })
116
-
117
- removed = [clauses_a[i] for i in range(len(clauses_a)) if i not in matched_a]
118
- added = [clauses_b[j] for j in range(len(clauses_b)) if j not in matched_b]
119
-
120
- # Compute alignment score
121
- total_pairs = max(len(clauses_a), len(clauses_b))
122
- if total_pairs > 0:
123
- alignment = len(matched_a) / total_pairs
124
- else:
125
- alignment = 0.0
126
-
127
- # Risk delta: compare length and presence of risk keywords
128
- risk_keywords = ["unlimited", "unilateral", "waive", "arbitration", "indemnif",
129
- "not liable", "no warranty", "sole discretion"]
130
- risk_a = sum(1 for kw in risk_keywords if kw in text_a.lower())
131
- risk_b = sum(1 for kw in risk_keywords if kw in text_b.lower())
132
-
133
- if risk_a > risk_b + 2:
134
- risk_delta = "Contract A is significantly riskier"
135
- risk_winner = "B"
136
- elif risk_b > risk_a + 2:
137
- risk_delta = "Contract B is significantly riskier"
138
- risk_winner = "A"
139
- else:
140
- risk_delta = "Similar risk profiles"
141
- risk_winner = "tie"
142
-
143
- return {
144
- "alignment_score": round(alignment, 3),
145
- "contract_a_clauses": len(clauses_a),
146
- "contract_b_clauses": len(clauses_b),
147
- "added_clauses": [{"text": c[:200], "type": _extract_clause_type(c)} for c in added[:50]],
148
- "removed_clauses": [{"text": c[:200], "type": _extract_clause_type(c)} for c in removed[:50]],
149
- "modified_clauses": modified[:50],
150
- "risk_delta": risk_delta,
151
- "risk_winner": risk_winner,
152
- "type_map_a": {k: len(v) for k, v in type_map_a.items()},
153
- "type_map_b": {k: len(v) for k, v in type_map_b.items()},
154
- }
155
-
156
- def _split_clauses(text):
157
- """Split text into clauses."""
158
- text = re.sub(r'\n{3,}', '\n\n', text.strip())
159
- parts = re.split(
160
- r'(?<=[.!?])\s+(?=[A-Z0-9(])|(?:\n\n)(?=\d+[.)]\s|\([a-z]\)\s|[A-Z][A-Z\s]{2,})',
161
- text
162
- )
163
- return [p.strip() for p in parts if len(p.strip()) > 30]
164
-
165
- def render_comparison_html(result):
166
- """Render comparison results as HTML for Gradio."""
167
- if "error" in result:
168
- return f'<p style="color:#dc2626;">{result["error"]}</p>'
169
-
170
- html = f'''
171
- <div style="font-family:system-ui,sans-serif;">
172
- <div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;margin-bottom:16px;">
173
- <div style="padding:12px;border-radius:8px;background:#eff6ff;border:1px solid #bfdbfe;text-align:center;">
174
- <div style="font-size:24px;font-weight:700;color:#1d4ed8;">{result["contract_a_clauses"]}</div>
175
- <div style="font-size:12px;color:#3b82f6;">Clauses in Contract A</div>
176
- </div>
177
- <div style="padding:12px;border-radius:8px;background:#fefce8;border:1px solid #fde68a;text-align:center;">
178
- <div style="font-size:24px;font-weight:700;color:#a16207;">{result["contract_b_clauses"]}</div>
179
- <div style="font-size:12px;color:#ca8a04;">Clauses in Contract B</div>
180
- </div>
181
- </div>
182
-
183
- <div style="padding:12px;border-radius:8px;background:#f9fafb;border:1px solid #e5e7eb;margin-bottom:16px;text-align:center;">
184
- <div style="font-size:28px;font-weight:700;color:#374151;">{result["alignment_score"]*100:.1f}%</div>
185
- <div style="font-size:12px;color:#6b7280;">Alignment Score</div>
186
- </div>
187
-
188
- <div style="padding:12px;border-radius:8px;background:{
189
- "#fef2f2" if result["risk_winner"] != "tie" else "#f0fdf4"
190
- };border:1px solid {
191
- "#fecaca" if result["risk_winner"] != "tie" else "#bbf7d0"
192
- };margin-bottom:16px;text-align:center;">
193
- <span style="font-size:14px;font-weight:600;color:{
194
- "#dc2626" if result["risk_winner"] != "tie" else "#16a34a"
195
- };">βš–οΈ {result["risk_delta"]}</span>
196
- </div>
197
- '''
198
-
199
- # Modified clauses
200
- if result["modified_clauses"]:
201
- html += '<div style="margin-bottom:16px;"><h3 style="font-size:14px;color:#374151;margin-bottom:8px;">πŸ“ Modified Clauses</h3>'
202
- for m in result["modified_clauses"][:20]:
203
- html += f'''
204
- <div style="border:1px solid #e5e7eb;border-radius:6px;padding:10px;margin-bottom:8px;">
205
- <div style="font-size:11px;color:#6b7280;margin-bottom:4px;">{m["clause_type"].upper()} Β· Similarity: {m["similarity"]*100:.0f}%</div>
206
- <div style="display:grid;grid-template-columns:1fr 1fr;gap:8px;">
207
- <div style="background:#fef2f2;padding:6px;border-radius:4px;font-size:12px;color:#991b1b;">{m["clause_a"][:150]}...</div>
208
- <div style="background:#f0fdf4;padding:6px;border-radius:4px;font-size:12px;color:#166534;">{m["clause_b"][:150]}...</div>
209
- </div>
210
- </div>
211
- '''
212
- html += '</div>'
213
-
214
- # Added clauses
215
- if result["added_clauses"]:
216
- html += '<div style="margin-bottom:16px;"><h3 style="font-size:14px;color:#374151;margin-bottom:8px;">βž• Added in Contract B</h3>'
217
- for a in result["added_clauses"][:15]:
218
- html += f'<div style="background:#f0fdf4;padding:8px;border-radius:4px;font-size:12px;color:#166534;margin-bottom:4px;border-left:3px solid #22c55e;"><b>{a["type"].upper()}</b> Β· {a["text"][:150]}...</div>'
219
- html += '</div>'
220
-
221
- # Removed clauses
222
- if result["removed_clauses"]:
223
- html += '<div style="margin-bottom:16px;"><h3 style="font-size:14px;color:#374151;margin-bottom:8px;">βž– Removed from Contract A</h3>'
224
- for r in result["removed_clauses"][:15]:
225
- html += f'<div style="background:#fef2f2;padding:8px;border-radius:4px;font-size:12px;color:#991b1b;margin-bottom:4px;border-left:3px solid #ef4444;"><b>{r["type"].upper()}</b> Β· {r["text"][:150]}...</div>'
226
- html += '</div>'
227
-
228
- html += '</div>'
229
- return html
 
1
+ /app/clauseguard/compare.py