gaurv007 committed on
Commit
a116282
·
verified ·
1 Parent(s): e8d10a0

Upload compare.py

Browse files
Files changed (1) hide show
  1. compare.py +229 -0
compare.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ClauseGuard — Contract Comparison Engine
3
+ ═══════════════════════════════════════
4
+ Compare two contracts side-by-side:
5
+ • Clause-level diff (added/removed/modified clauses)
6
+ • Risk delta (which contract is more favorable)
7
+ • Alignment score (similarity between documents)
8
+ """
9
+
10
+ import re
11
+ from difflib import SequenceMatcher
12
+ from collections import defaultdict
13
+
14
def _normalize_clause(text):
    """Return a canonical form of a clause for text comparison.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` or
    whitespace is replaced by a space, runs of whitespace are collapsed to a
    single space, and leading/trailing whitespace is removed.
    """
    lowered = text.lower()
    # Punctuation (and any non-ASCII letter) becomes a separator, not nothing,
    # so "3.1" normalizes to "3 1" rather than "31".
    separated = re.sub(r'[^a-z0-9\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', separated).strip()
20
+
21
def _clause_similarity(a, b):
    """Return a similarity ratio in [0, 1] between two clauses.

    Both clauses are normalized with ``_normalize_clause`` and compared
    character by character with ``difflib.SequenceMatcher``.

    Args:
        a: First clause text.
        b: Second clause text.

    Returns:
        ``SequenceMatcher.ratio()`` of the normalized texts; 1.0 means the
        normalized clauses are identical.
    """
    # autojunk must be off: on sequences of 200+ items the default heuristic
    # treats every frequently occurring item as junk. For character-level
    # comparison of prose that is nearly every letter, so two long clauses
    # that differ only in their first word could score 0.0.
    matcher = SequenceMatcher(
        None,
        _normalize_clause(a),
        _normalize_clause(b),
        autojunk=False,
    )
    return matcher.ratio()
24
+
25
# Ordered (clause type, keywords) table. The first type with a matching
# keyword wins, so the order of the rows is significant.
_CLAUSE_TYPE_KEYWORDS = (
    ("governing law", ("govern", "law", "jurisdiction")),
    ("termination", ("terminat", "cancel", "end")),
    ("indemnification", ("indemnif", "hold harmless")),
    ("confidentiality", ("confidential", "non-disclosure")),
    ("liability", ("liability", "liable", "damages")),
    ("payment", ("payment", "fee", "price", "compensat")),
    ("intellectual property", ("intellectual", "ip", "copyright", "patent")),
    ("warranty", ("warrant", "guarantee")),
    ("force majeure", ("force majeure", "act of god")),
    ("arbitration", ("arbitrat", "mediation")),
    ("assignment", ("assign", "transfer")),
    ("non-compete", ("compete", "competition")),
    ("renewal", ("renew", "extend")),
    ("effective date", ("effective date", "commencement")),
)


def _clause_keyword_regex(keyword):
    """Return the regex fragment used to detect one clause-type keyword."""
    # Multi-word keywords tolerate any whitespace run (e.g. a line break).
    fragment = r"\s+".join(re.escape(word) for word in keyword.split())
    if len(keyword) <= 3:
        # Very short keywords ("ip", "end", "law", "fee") are whole words, not
        # stems: as raw substrings they hit "relationship", "extend",
        # "clawback", "feedback", ... and mislabel the clause.
        return rf"\b{fragment}(?:s|ed|ing)?\b"
    # Longer keywords are stems ("terminat", "indemnif") and stay substrings.
    return fragment


# One compiled alternation per clause type, built once at import time.
_CLAUSE_TYPE_PATTERNS = tuple(
    (ctype, re.compile("|".join(_clause_keyword_regex(kw) for kw in keywords)))
    for ctype, keywords in _CLAUSE_TYPE_KEYWORDS
)


def _extract_clause_type(clause_text):
    """Heuristically classify a clause by keyword, for alignment.

    The clause is lower-cased and tested against each clause type's keywords
    in table order; the first type with a hit is returned.

    Args:
        clause_text: Text of a single clause.

    Returns:
        One of the clause type names in ``_CLAUSE_TYPE_KEYWORDS``, or
        ``"general"`` when no keyword matches.
    """
    text_lower = clause_text.lower()
    for ctype, pattern in _CLAUSE_TYPE_PATTERNS:
        if pattern.search(text_lower):
            return ctype
    return "general"
48
+
49
def compare_contracts(text_a, text_b, clauses_a=None, clauses_b=None):
    """
    Compare two contract texts and return a clause-level structural diff.

    Each clause of contract A is greedily paired with the most similar
    not-yet-matched clause of contract B (in A's clause order).

    Args:
        text_a: Full text of contract A.
        text_b: Full text of contract B.
        clauses_a: Optional pre-split clauses of contract A; when None the
            text is split with ``_split_clauses``.
        clauses_b: Optional pre-split clauses of contract B; same default.

    Returns:
        ``{"error": "Both contracts required"}`` when either text is empty,
        otherwise a dict with:
        - alignment_score: matched clauses / max clause count, rounded to 3 dp
        - contract_a_clauses / contract_b_clauses: clause counts
        - added_clauses: up to 50 unmatched clauses of B (text cut to 200 chars)
        - removed_clauses: up to 50 unmatched clauses of A (text cut to 200 chars)
        - modified_clauses: up to 50 pairs that are similar but not identical;
          ``type`` is "modified" (matched pair) or "partial" (weak resemblance)
        - risk_delta: human-readable verdict on which contract is riskier
        - risk_winner: "A", "B" or "tie" (the less risky contract)
        - type_map_a / type_map_b: clause count per detected clause type
    """
    if not text_a or not text_b:
        return {"error": "Both contracts required"}

    # Split into clauses if not provided
    if clauses_a is None:
        clauses_a = _split_clauses(text_a)
    if clauses_b is None:
        clauses_b = _split_clauses(text_b)

    # Classify every clause once; the types are reused for the per-type
    # counts and for every entry reported below.
    types_a = [_extract_clause_type(c) for c in clauses_a]
    types_b = [_extract_clause_type(c) for c in clauses_b]
    type_counts_a = defaultdict(int)
    type_counts_b = defaultdict(int)
    for ctype in types_a:
        type_counts_a[ctype] += 1
    for ctype in types_b:
        type_counts_b[ctype] += 1

    SIMILARITY_THRESHOLD = 0.75   # at or above: the clauses are the same clause
    MODIFIED_THRESHOLD = 0.45     # at or above: worth reporting as a partial match
    NEAR_IDENTICAL = 0.95         # at or above: matched and not reported as modified

    matched_a = set()
    matched_b = set()
    modified = []

    for i, ca in enumerate(clauses_a):
        best_sim = 0
        best_j = -1
        for j, cb in enumerate(clauses_b):
            if j in matched_b:
                continue
            sim = _clause_similarity(ca, cb)
            if sim > best_sim:
                best_sim = sim
                best_j = j

        if best_sim >= SIMILARITY_THRESHOLD:
            matched_a.add(i)
            matched_b.add(best_j)
            if best_sim >= NEAR_IDENTICAL:
                continue
            kind = "modified"
        elif best_sim >= MODIFIED_THRESHOLD:
            # Partial pairs are reported but NOT marked as matched, so both
            # clauses still show up under removed/added as well.
            kind = "partial"
        else:
            continue

        # best_sim > 0 here, so best_j always points at a real clause of B.
        modified.append({
            "type": kind,
            "similarity": round(best_sim, 3),
            "clause_a": ca[:200],
            "clause_b": clauses_b[best_j][:200],
            "clause_type": types_a[i],
        })

    removed = [
        {"text": clause[:200], "type": types_a[i]}
        for i, clause in enumerate(clauses_a)
        if i not in matched_a
    ]
    added = [
        {"text": clause[:200], "type": types_b[j]}
        for j, clause in enumerate(clauses_b)
        if j not in matched_b
    ]

    # Alignment score: share of clauses matched, relative to the longer contract
    total_pairs = max(len(clauses_a), len(clauses_b))
    alignment = len(matched_a) / total_pairs if total_pairs > 0 else 0.0

    # Risk delta: number of distinct risk keywords present in each full text
    risk_keywords = ["unlimited", "unilateral", "waive", "arbitration", "indemnif",
                     "not liable", "no warranty", "sole discretion"]
    lowered_a = text_a.lower()
    lowered_b = text_b.lower()
    risk_a = sum(1 for kw in risk_keywords if kw in lowered_a)
    risk_b = sum(1 for kw in risk_keywords if kw in lowered_b)

    # A contract is only called riskier when it leads by more than two keywords.
    if risk_a > risk_b + 2:
        risk_delta = "Contract A is significantly riskier"
        risk_winner = "B"
    elif risk_b > risk_a + 2:
        risk_delta = "Contract B is significantly riskier"
        risk_winner = "A"
    else:
        risk_delta = "Similar risk profiles"
        risk_winner = "tie"

    return {
        "alignment_score": round(alignment, 3),
        "contract_a_clauses": len(clauses_a),
        "contract_b_clauses": len(clauses_b),
        "added_clauses": added[:50],
        "removed_clauses": removed[:50],
        "modified_clauses": modified[:50],
        "risk_delta": risk_delta,
        "risk_winner": risk_winner,
        "type_map_a": dict(type_counts_a),
        "type_map_b": dict(type_counts_b),
    }
155
+
156
def _split_clauses(text):
    """Break contract text into a list of clause strings.

    Runs of three or more newlines are first collapsed to a blank line. The
    text is then cut either after sentence punctuation (``.``, ``!``, ``?``)
    followed by whitespace and an uppercase letter, digit or ``(``, or at a
    blank line followed by a numbered item (``1.`` / ``1)``), a lettered item
    (``(a)``) or a run of capitals. Fragments of 30 characters or fewer after
    stripping are dropped.
    """
    collapsed = re.sub(r'\n{3,}', '\n\n', text.strip())
    boundary = r'(?<=[.!?])\s+(?=[A-Z0-9(])|(?:\n\n)(?=\d+[.)]\s|\([a-z]\)\s|[A-Z][A-Z\s]{2,})'

    clauses = []
    for fragment in re.split(boundary, collapsed):
        fragment = fragment.strip()
        if len(fragment) > 30:
            clauses.append(fragment)
    return clauses
164
+
165
def render_comparison_html(result):
    """Render a ``compare_contracts`` result dict as an HTML fragment for Gradio.

    All text taken from ``result`` (clause text, clause types, the risk
    verdict and the error message) is HTML-escaped before insertion, so
    markup inside a contract is displayed literally instead of being
    interpreted by the browser.

    Args:
        result: Either ``{"error": message}`` or a dict with the keys
            ``contract_a_clauses``, ``contract_b_clauses``,
            ``alignment_score``, ``risk_winner``, ``risk_delta``,
            ``modified_clauses``, ``added_clauses`` and ``removed_clauses``.

    Returns:
        An HTML string. At most 20 modified, 15 added and 15 removed clauses
        are shown, each cut to 150 characters.
    """
    # Function-scope import: the local name must not clash with any "html"
    # variable, and the module's top-level imports stay untouched.
    from html import escape

    def safe(value):
        """Escape an arbitrary value for use as HTML text."""
        return escape(str(value))

    def preview(text, limit=150):
        """Escape a clause preview, adding "..." only when it was cut."""
        text = str(text)
        # Truncate before escaping so an entity is never cut in half.
        suffix = "..." if len(text) > limit else ""
        return escape(text[:limit]) + suffix

    if "error" in result:
        return f'<p style="color:#dc2626;">{safe(result["error"])}</p>'

    # Green banner when the risk profiles are similar, red otherwise.
    is_tie = result["risk_winner"] == "tie"
    risk_background = "#f0fdf4" if is_tie else "#fef2f2"
    risk_border = "#bbf7d0" if is_tie else "#fecaca"
    risk_color = "#16a34a" if is_tie else "#dc2626"

    parts = [f'''
<div style="font-family:system-ui,sans-serif;">
<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;margin-bottom:16px;">
<div style="padding:12px;border-radius:8px;background:#eff6ff;border:1px solid #bfdbfe;text-align:center;">
<div style="font-size:24px;font-weight:700;color:#1d4ed8;">{safe(result["contract_a_clauses"])}</div>
<div style="font-size:12px;color:#3b82f6;">Clauses in Contract A</div>
</div>
<div style="padding:12px;border-radius:8px;background:#fefce8;border:1px solid #fde68a;text-align:center;">
<div style="font-size:24px;font-weight:700;color:#a16207;">{safe(result["contract_b_clauses"])}</div>
<div style="font-size:12px;color:#ca8a04;">Clauses in Contract B</div>
</div>
</div>

<div style="padding:12px;border-radius:8px;background:#f9fafb;border:1px solid #e5e7eb;margin-bottom:16px;text-align:center;">
<div style="font-size:28px;font-weight:700;color:#374151;">{result["alignment_score"]*100:.1f}%</div>
<div style="font-size:12px;color:#6b7280;">Alignment Score</div>
</div>

<div style="padding:12px;border-radius:8px;background:{risk_background};border:1px solid {risk_border};margin-bottom:16px;text-align:center;">
<span style="font-size:14px;font-weight:600;color:{risk_color};">⚖️ {safe(result["risk_delta"])}</span>
</div>
''']

    # Modified clauses
    if result["modified_clauses"]:
        parts.append('<div style="margin-bottom:16px;"><h3 style="font-size:14px;color:#374151;margin-bottom:8px;">📝 Modified Clauses</h3>')
        for m in result["modified_clauses"][:20]:
            parts.append(f'''
<div style="border:1px solid #e5e7eb;border-radius:6px;padding:10px;margin-bottom:8px;">
<div style="font-size:11px;color:#6b7280;margin-bottom:4px;">{safe(m["clause_type"].upper())} · Similarity: {m["similarity"]*100:.0f}%</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:8px;">
<div style="background:#fef2f2;padding:6px;border-radius:4px;font-size:12px;color:#991b1b;">{preview(m["clause_a"])}</div>
<div style="background:#f0fdf4;padding:6px;border-radius:4px;font-size:12px;color:#166534;">{preview(m["clause_b"])}</div>
</div>
</div>
''')
        parts.append('</div>')

    # Added clauses
    if result["added_clauses"]:
        parts.append('<div style="margin-bottom:16px;"><h3 style="font-size:14px;color:#374151;margin-bottom:8px;">➕ Added in Contract B</h3>')
        for a in result["added_clauses"][:15]:
            parts.append(f'<div style="background:#f0fdf4;padding:8px;border-radius:4px;font-size:12px;color:#166534;margin-bottom:4px;border-left:3px solid #22c55e;"><b>{safe(a["type"].upper())}</b> · {preview(a["text"])}</div>')
        parts.append('</div>')

    # Removed clauses
    if result["removed_clauses"]:
        parts.append('<div style="margin-bottom:16px;"><h3 style="font-size:14px;color:#374151;margin-bottom:8px;">➖ Removed from Contract A</h3>')
        for r in result["removed_clauses"][:15]:
            parts.append(f'<div style="background:#fef2f2;padding:8px;border-radius:4px;font-size:12px;color:#991b1b;margin-bottom:4px;border-left:3px solid #ef4444;"><b>{safe(r["type"].upper())}</b> · {preview(r["text"])}</div>')
        parts.append('</div>')

    parts.append('</div>')
    return "".join(parts)