gaurv007 committed on
Commit
f81766f
·
verified ·
1 Parent(s): e3f2df1

Wire trained Legal-BERT model (76% F1) into live demo — replaces regex

Browse files
Files changed (2) hide show
  1. app.py +79 -40
  2. requirements.txt +3 -0
app.py CHANGED
@@ -1,33 +1,74 @@
1
  """
2
  ClauseGuard — AI Fine Print Scanner
 
3
  """
4
 
5
  import gradio as gr
6
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- NUM_LABELS = 8
9
- LABELS = [
10
- ("Limitation of liability", "HIGH", "Company avoids responsibility for damages or losses."),
11
- ("Unilateral termination", "HIGH", "They can close your account without reason."),
12
- ("Unilateral change", "MEDIUM", "Terms can change without your consent."),
13
- ("Content removal", "MEDIUM", "Your content can be deleted without notice."),
14
- ("Contract by using", "LOW", "You agree just by visiting or using the site."),
15
- ("Choice of law", "MEDIUM", "Foreign law applies instead of your local protections."),
16
- ("Jurisdiction", "MEDIUM", "Disputes handled in their preferred court, not yours."),
17
- ("Arbitration", "HIGH", "You waive your right to sue in court."),
18
- ]
19
-
20
  PATTERNS = {
21
- 0: [r"not liable", r"shall not be (liable|responsible)", r"in no event.*liable", r"limitation of liability", r"without warranty", r"disclaim"],
22
- 1: [r"terminat.*at any time", r"suspend.*account.*without", r"we may (terminat|suspend|discontinu)", r"right to (terminat|suspend)"],
23
- 2: [r"sole discretion", r"reserves? the right to (modify|change|update|amend)", r"at any time.*without (prior )?notice", r"we may (modify|change|update)"],
24
- 3: [r"remove.*content.*without", r"right to remove", r"we may.*remove"],
25
- 4: [r"by (using|accessing).*you agree", r"continued use.*constitutes? acceptance"],
26
- 5: [r"governed by.*laws? of", r"shall be governed", r"laws of the state of"],
27
- 6: [r"exclusive jurisdiction", r"courts? of.*(california|delaware|new york|ireland|england)", r"submit to.*jurisdiction"],
28
- 7: [r"arbitrat", r"binding arbitration", r"waive.*right.*court", r"class action waiver"],
29
  }
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def split_clauses(text):
32
  text = re.sub(r'\n{2,}', '\n', text.strip())
33
  parts = re.split(r'(?<=[.!?])\s+(?=[A-Z0-9(])|(?:\n)(?=\d+[.)]\s|\([a-z]\)\s)', text)
@@ -45,17 +86,11 @@ def analyze(text):
45
  sev_counts = {"HIGH": 0, "MEDIUM": 0, "LOW": 0}
46
 
47
  for clause in clauses:
48
- clause_lower = clause.lower()
49
- hits = []
50
- for lid, pats in PATTERNS.items():
51
- for p in pats:
52
- if re.search(p, clause_lower):
53
- name, sev, desc = LABELS[lid]
54
- hits.append({"name": name, "severity": sev, "desc": desc})
55
- sev_counts[sev] += 1
56
- break
57
  if hits:
58
  flagged.append({"text": clause, "hits": hits})
 
 
59
 
60
  total = len(clauses)
61
  risk = min(100, round((sev_counts["HIGH"] * 20 + sev_counts["MEDIUM"] * 10 + sev_counts["LOW"] * 5) / max(1, total) * 100))
@@ -66,7 +101,9 @@ def analyze(text):
66
  elif risk >= 10: grade = "B"
67
  else: grade = "A"
68
 
69
- # Summary
 
 
70
  summary = f"""<div style="font-family:system-ui,sans-serif;">
71
  <div style="border:1px solid #e4e4e7;border-radius:8px;padding:20px;margin-bottom:16px;">
72
  <div style="display:flex;justify-content:space-between;align-items:baseline;">
@@ -80,27 +117,28 @@ def analyze(text):
80
  'background:#f0fdf4;color:#15803d;'
81
  }">Grade {grade}</span>
82
  </div>
83
- <p style="margin-top:8px;font-size:12px;color:#a1a1aa;">{total} clauses scanned · {len(flagged)} flagged · {sev_counts['HIGH']} high · {sev_counts['MEDIUM']} medium · {sev_counts['LOW']} low</p>
84
  </div>"""
85
 
86
  if not flagged:
87
  summary += '<div style="border:1px solid #e4e4e7;border-radius:8px;padding:24px;text-align:center;"><p style="font-size:14px;color:#71717a;">No unfair clauses found.</p></div>'
88
  else:
89
- for i, item in enumerate(flagged):
90
  max_sev = max(item["hits"], key=lambda h: {"HIGH":3,"MEDIUM":2,"LOW":1}[h["severity"]])["severity"]
91
- border_color = {"HIGH":"#fca5a5","MEDIUM":"#fcd34d","LOW":"#93c5fd"}[max_sev]
92
 
93
  tags = ""
94
  for h in item["hits"]:
95
- tag_style = {"HIGH":"background:#fef2f2;color:#b91c1c;border:1px solid #fecaca;",
96
- "MEDIUM":"background:#fffbeb;color:#a16207;border:1px solid #fde68a;",
97
- "LOW":"background:#eff6ff;color:#1d4ed8;border:1px solid #bfdbfe;"}[h["severity"]]
98
- tags += f'<span style="{tag_style}font-size:11px;font-weight:500;padding:1px 8px;border-radius:3px;margin-right:4px;">{h["name"]}</span>'
 
99
 
100
  descs = "".join(f'<p style="font-size:12px;color:#71717a;margin-top:4px;">{h["desc"]}</p>' for h in item["hits"])
101
- preview = item["text"][:180] + ("..." if len(item["text"]) > 180 else "")
102
 
103
- summary += f'''<div style="border:1px solid #e4e4e7;border-left:3px solid {border_color};border-radius:8px;padding:14px;margin-bottom:8px;">
104
  <p style="font-size:13px;color:#3f3f46;line-height:1.6;">{preview}</p>
105
  <div style="margin-top:8px;">{tags}</div>
106
  {descs}
@@ -109,6 +147,7 @@ def analyze(text):
109
  summary += "</div>"
110
  return summary, ""
111
 
 
112
  SPOTIFY = """By using the Spotify Service, you agree to be bound by these Terms of Use.
113
 
114
  Spotify may, in its sole discretion, modify or update these Terms of Service at any time without prior notice. Your continued use of the Service after any such changes constitutes your acceptance of the new Terms of Service.
@@ -153,7 +192,7 @@ with demo:
153
  scan_btn.click(fn=analyze, inputs=[text_input], outputs=[results_html, hidden])
154
  clear_btn.click(fn=lambda: ("", "", ""), outputs=[text_input, results_html, hidden])
155
 
156
- gr.HTML('<p style="font-family:system-ui,sans-serif;font-size:11px;color:#a1a1aa;text-align:center;padding:16px 0;border-top:1px solid #f4f4f5;margin-top:16px;">Not legal advice. Based on CLAUDETTE taxonomy. <a href="https://huggingface.co/datasets/coastalcph/lex_glue" style="color:#71717a;">Dataset</a></p>')
157
 
158
  if __name__ == "__main__":
159
  demo.launch()
 
1
  """
2
  ClauseGuard — AI Fine Print Scanner
3
+ Uses Legal-BERT fine-tuned on CLAUDETTE/LexGLUE unfair_tos (8 categories).
4
  """
5
 
6
  import gradio as gr
7
  import re
8
+ import numpy as np
9
+
10
+ # ─── Load ML model ───
11
+ MODEL_ID = "gaurv007/clauseguard-legal-bert"
12
+ ml_pipeline = None
13
+
14
+ try:
15
+ from transformers import pipeline
16
+ ml_pipeline = pipeline("text-classification", model=MODEL_ID, top_k=None, device=-1)
17
+ print(f"Loaded model: {MODEL_ID}")
18
+ except Exception as e:
19
+ print(f"Model load failed ({e}), using regex fallback")
20
+
21
+ # ─── Label metadata ───
22
+ LABELS = {
23
+ "Limitation of liability": ("HIGH", "Company avoids responsibility for damages or losses."),
24
+ "Unilateral termination": ("HIGH", "They can close your account without reason."),
25
+ "Unilateral change": ("MEDIUM", "Terms can change without your consent."),
26
+ "Content removal": ("MEDIUM", "Your content can be deleted without notice."),
27
+ "Contract by using": ("LOW", "You agree just by visiting or using the site."),
28
+ "Choice of law": ("MEDIUM", "Foreign law applies instead of your local protections."),
29
+ "Jurisdiction": ("MEDIUM", "Disputes handled in their preferred court, not yours."),
30
+ "Arbitration": ("HIGH", "You waive your right to sue in court."),
31
+ }
32
 
33
+ # ─── Regex fallback ───
 
 
 
 
 
 
 
 
 
 
 
34
  PATTERNS = {
35
+ "Limitation of liability": [r"not liable", r"shall not be (liable|responsible)", r"in no event.*liable", r"limitation of liability", r"without warranty", r"disclaim"],
36
+ "Unilateral termination": [r"terminat.*at any time", r"suspend.*account.*without", r"we may (terminat|suspend|discontinu)", r"right to (terminat|suspend)"],
37
+ "Unilateral change": [r"sole discretion", r"reserves? the right to (modify|change|update|amend)", r"at any time.*without (prior )?notice", r"we may (modify|change|update)"],
38
+ "Content removal": [r"remove.*content.*without", r"right to remove", r"we may.*remove"],
39
+ "Contract by using": [r"by (using|accessing).*you agree", r"continued use.*constitutes? acceptance"],
40
+ "Choice of law": [r"governed by.*laws? of", r"shall be governed", r"laws of the state of"],
41
+ "Jurisdiction": [r"exclusive jurisdiction", r"courts? of.*(california|delaware|new york|ireland|england)", r"submit to.*jurisdiction"],
42
+ "Arbitration": [r"arbitrat", r"binding arbitration", r"waive.*right.*court", r"class action waiver"],
43
  }
44
 
45
+ def classify_ml(text):
46
+ """Classify using the trained Legal-BERT model."""
47
+ if not ml_pipeline:
48
+ return classify_regex(text)
49
+ try:
50
+ preds = ml_pipeline(text, truncation=True, max_length=512)
51
+ results = []
52
+ for p in preds[0] if isinstance(preds[0], list) else preds:
53
+ if p["score"] > 0.5 and p["label"] in LABELS:
54
+ sev, desc = LABELS[p["label"]]
55
+ results.append({"name": p["label"], "severity": sev, "desc": desc, "confidence": round(p["score"], 2)})
56
+ return results
57
+ except Exception:
58
+ return classify_regex(text)
59
+
60
+ def classify_regex(text):
61
+ """Fallback regex classifier."""
62
+ results = []
63
+ text_lower = text.lower()
64
+ for name, pats in PATTERNS.items():
65
+ for p in pats:
66
+ if re.search(p, text_lower):
67
+ sev, desc = LABELS[name]
68
+ results.append({"name": name, "severity": sev, "desc": desc, "confidence": 0.7})
69
+ break
70
+ return results
71
+
72
  def split_clauses(text):
73
  text = re.sub(r'\n{2,}', '\n', text.strip())
74
  parts = re.split(r'(?<=[.!?])\s+(?=[A-Z0-9(])|(?:\n)(?=\d+[.)]\s|\([a-z]\)\s)', text)
 
86
  sev_counts = {"HIGH": 0, "MEDIUM": 0, "LOW": 0}
87
 
88
  for clause in clauses:
89
+ hits = classify_ml(clause)
 
 
 
 
 
 
 
 
90
  if hits:
91
  flagged.append({"text": clause, "hits": hits})
92
+ for h in hits:
93
+ sev_counts[h["severity"]] += 1
94
 
95
  total = len(clauses)
96
  risk = min(100, round((sev_counts["HIGH"] * 20 + sev_counts["MEDIUM"] * 10 + sev_counts["LOW"] * 5) / max(1, total) * 100))
 
101
  elif risk >= 10: grade = "B"
102
  else: grade = "A"
103
 
104
+ engine = "Legal-BERT" if ml_pipeline else "Pattern matching"
105
+
106
+ # Build HTML
107
  summary = f"""<div style="font-family:system-ui,sans-serif;">
108
  <div style="border:1px solid #e4e4e7;border-radius:8px;padding:20px;margin-bottom:16px;">
109
  <div style="display:flex;justify-content:space-between;align-items:baseline;">
 
117
  'background:#f0fdf4;color:#15803d;'
118
  }">Grade {grade}</span>
119
  </div>
120
+ <p style="margin-top:8px;font-size:12px;color:#a1a1aa;">{total} clauses · {len(flagged)} flagged · {sev_counts['HIGH']} high · {sev_counts['MEDIUM']} medium · {sev_counts['LOW']} low · Engine: {engine}</p>
121
  </div>"""
122
 
123
  if not flagged:
124
  summary += '<div style="border:1px solid #e4e4e7;border-radius:8px;padding:24px;text-align:center;"><p style="font-size:14px;color:#71717a;">No unfair clauses found.</p></div>'
125
  else:
126
+ for item in flagged:
127
  max_sev = max(item["hits"], key=lambda h: {"HIGH":3,"MEDIUM":2,"LOW":1}[h["severity"]])["severity"]
128
+ border = {"HIGH":"#fca5a5","MEDIUM":"#fcd34d","LOW":"#93c5fd"}[max_sev]
129
 
130
  tags = ""
131
  for h in item["hits"]:
132
+ ts = {"HIGH":"background:#fef2f2;color:#b91c1c;border:1px solid #fecaca;",
133
+ "MEDIUM":"background:#fffbeb;color:#a16207;border:1px solid #fde68a;",
134
+ "LOW":"background:#eff6ff;color:#1d4ed8;border:1px solid #bfdbfe;"}[h["severity"]]
135
+ conf = f' ({h["confidence"]})' if h.get("confidence") and ml_pipeline else ""
136
+ tags += f'<span style="{ts}font-size:11px;font-weight:500;padding:1px 8px;border-radius:3px;margin-right:4px;">{h["name"]}{conf}</span>'
137
 
138
  descs = "".join(f'<p style="font-size:12px;color:#71717a;margin-top:4px;">{h["desc"]}</p>' for h in item["hits"])
139
+ preview = item["text"][:200] + ("..." if len(item["text"]) > 200 else "")
140
 
141
+ summary += f'''<div style="border:1px solid #e4e4e7;border-left:3px solid {border};border-radius:8px;padding:14px;margin-bottom:8px;">
142
  <p style="font-size:13px;color:#3f3f46;line-height:1.6;">{preview}</p>
143
  <div style="margin-top:8px;">{tags}</div>
144
  {descs}
 
147
  summary += "</div>"
148
  return summary, ""
149
 
150
+
151
  SPOTIFY = """By using the Spotify Service, you agree to be bound by these Terms of Use.
152
 
153
  Spotify may, in its sole discretion, modify or update these Terms of Service at any time without prior notice. Your continued use of the Service after any such changes constitutes your acceptance of the new Terms of Service.
 
192
  scan_btn.click(fn=analyze, inputs=[text_input], outputs=[results_html, hidden])
193
  clear_btn.click(fn=lambda: ("", "", ""), outputs=[text_input, results_html, hidden])
194
 
195
+ gr.HTML('<p style="font-family:system-ui,sans-serif;font-size:11px;color:#a1a1aa;text-align:center;padding:16px 0;border-top:1px solid #f4f4f5;margin-top:16px;">Not legal advice. Model: Legal-BERT fine-tuned on CLAUDETTE. <a href="https://huggingface.co/gaurv007/clauseguard-legal-bert" style="color:#71717a;">Model</a> · <a href="https://huggingface.co/datasets/coastalcph/lex_glue" style="color:#71717a;">Dataset</a></p>')
196
 
197
  if __name__ == "__main__":
198
  demo.launch()
requirements.txt CHANGED
@@ -1 +1,4 @@
1
  gradio>=5.0
 
 
 
 
1
  gradio>=5.0
2
+ transformers>=5.0
3
+ torch
4
+ numpy