gaurv007 committed on
Commit
f4ccb3e
·
1 Parent(s): 79c33ca

🔧 v4.2: Critical bug fixes + performance optimizations (7 bugs, 4 perf improvements) (#3)

Browse files

- v4.2: Update obligations.py (11d6a4f623ebb141d0aa0499471a49d4d8cb11cf)
- v4.2: Update app.py (a61dcf1b6d7c3cb55fc63affd32944cbc9e8d4bd)
- v4.2: Update extension/background.js (15e2d6a4cd62ce71491354cbf1321b4885abbf98)
- v4.2: Update compare.py (9bd2e1c8bd97232894e32b59cd6c7ef89fba9bfc)
- v4.2: Update compliance.py (b16b7fae913f907d196c1fdc1a89b7b152a161e7)
- v4.2: Update api/main.py (376db46363624c3463364ad7b30549ad4851670e)
- v4.2: Update README.md (3c7bc996f7c39c2769899fc46658148e838457f0)

Files changed (7) hide show
  1. README.md +13 -2
  2. api/main.py +18 -11
  3. app.py +99 -64
  4. compare.py +1 -1
  5. compliance.py +7 -4
  6. extension/background.js +12 -5
  7. obligations.py +23 -8
README.md CHANGED
@@ -10,11 +10,22 @@ app_file: app.py
10
  pinned: false
11
  ---
12
 
13
- # 🛡️ ClauseGuard v4.0 — World's Best Open-Source Legal Contract Analysis
14
 
15
  **ClauseGuard** is the most comprehensive open-source AI-powered legal contract analysis tool. It analyzes contracts using state-of-the-art legal NLP models and provides actionable risk assessments, Q&A chatbot, clause redlining, and OCR for scanned PDFs.
16
 
17
- ## 🆕 What's New in v4.0
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  | Feature | Description |
20
  |---------|-------------|
 
10
  pinned: false
11
  ---
12
 
13
+ # 🛡️ ClauseGuard v4.2 — World's Best Open-Source Legal Contract Analysis
14
 
15
  **ClauseGuard** is the most comprehensive open-source AI-powered legal contract analysis tool. It analyzes contracts using state-of-the-art legal NLP models and provides actionable risk assessments, Q&A chatbot, clause redlining, and OCR for scanned PDFs.
16
 
17
+ ## 🆕 What's New in v4.2
18
+
19
+ | Feature | Description |
20
+ |---------|-------------|
21
+ | **🔧 NLI Fix** | Fixed contradiction detection — now uses `CrossEncoder.predict()` instead of broken `pipeline("text-classification")` dict input. Contradictions actually work now. |
22
+ | **🔒 Thread Safety** | `BoundedCache` now uses `threading.RLock` to prevent race conditions under concurrent Gradio requests |
23
+ | **⚡ Pre-compiled Regex** | All regex patterns (clause classification, obligations, compliance negation) pre-compiled at module level — eliminates thousands of redundant compilations |
24
+ | **🔗 Extension Fix** | Chrome extension risk formula now matches backend (diminishing returns, not normalized by doc length). Fixed API_BASE URL. |
25
+ | **🏷️ Label Coverage** | Added missing regex-only labels (Indemnification, Confidentiality, Force Majeure, Penalties) to RISK_MAP and DESC_MAP |
26
+ | **🛡️ Security** | API CORS localhost origins now require explicit opt-in via `CORS_ALLOW_LOCALHOST=true` env var |
27
+
28
+ ### Previous: v4.0
29
 
30
  | Feature | Description |
31
  |---------|-------------|
api/main.py CHANGED
@@ -58,8 +58,9 @@ HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
58
  SAULLM_ENDPOINT = os.environ.get("SAULLM_ENDPOINT", "")
59
  MAX_TEXT_LENGTH = int(os.environ.get("MAX_TEXT_LENGTH", "200000"))
60
 
61
- # ─── FIX v4.1: Sliding window rate limiter with proper IP extraction ───
62
  _rate_limits: dict[str, list[float]] = {}
 
63
  RATE_LIMIT_REQUESTS = 30
64
  RATE_LIMIT_WINDOW = 60 # seconds
65
 
@@ -71,8 +72,17 @@ def _get_client_ip(request: Request) -> str:
71
  return request.client.host if request.client else "unknown"
72
 
73
  def _check_rate_limit(client_ip: str) -> bool:
74
- """Sliding window rate limiter."""
 
75
  now = time.time()
 
 
 
 
 
 
 
 
76
  if client_ip not in _rate_limits:
77
  _rate_limits[client_ip] = []
78
 
@@ -85,13 +95,6 @@ def _check_rate_limit(client_ip: str) -> bool:
85
  return False
86
 
87
  _rate_limits[client_ip].append(now)
88
-
89
- # Periodic cleanup of stale IPs (every 100 requests)
90
- if len(_rate_limits) > 1000:
91
- stale = [ip for ip, ts in _rate_limits.items() if not ts or now - ts[-1] > RATE_LIMIT_WINDOW * 2]
92
- for ip in stale:
93
- del _rate_limits[ip]
94
-
95
  return True
96
 
97
  # ─── Supabase helper ───
@@ -193,11 +196,15 @@ async def lifespan(app: FastAPI):
193
 
194
  app = FastAPI(title="ClauseGuard API", version="4.1.0", lifespan=lifespan)
195
 
 
 
196
  ALLOWED_ORIGINS = [
197
  "https://clauseguardweb.netlify.app",
198
- "http://localhost:3000",
199
- "http://localhost:3001",
200
  ]
 
 
 
 
201
  app.add_middleware(
202
  CORSMiddleware,
203
  allow_origins=ALLOWED_ORIGINS,
 
58
  SAULLM_ENDPOINT = os.environ.get("SAULLM_ENDPOINT", "")
59
  MAX_TEXT_LENGTH = int(os.environ.get("MAX_TEXT_LENGTH", "200000"))
60
 
61
+ # ─── FIX v4.2: Improved sliding window rate limiter with periodic cleanup ───
62
  _rate_limits: dict[str, list[float]] = {}
63
+ _rate_limits_last_cleanup: float = 0.0
64
  RATE_LIMIT_REQUESTS = 30
65
  RATE_LIMIT_WINDOW = 60 # seconds
66
 
 
72
  return request.client.host if request.client else "unknown"
73
 
74
  def _check_rate_limit(client_ip: str) -> bool:
75
+ """Sliding window rate limiter with periodic stale-IP cleanup."""
76
+ global _rate_limits_last_cleanup
77
  now = time.time()
78
+
79
+ # FIX v4.2: Periodic cleanup every 60s regardless of dict size
80
+ if now - _rate_limits_last_cleanup > 60:
81
+ stale = [ip for ip, ts in _rate_limits.items() if not ts or now - ts[-1] > RATE_LIMIT_WINDOW * 2]
82
+ for ip in stale:
83
+ del _rate_limits[ip]
84
+ _rate_limits_last_cleanup = now
85
+
86
  if client_ip not in _rate_limits:
87
  _rate_limits[client_ip] = []
88
 
 
95
  return False
96
 
97
  _rate_limits[client_ip].append(now)
 
 
 
 
 
 
 
98
  return True
99
 
100
  # ─── Supabase helper ───
 
196
 
197
  app = FastAPI(title="ClauseGuard API", version="4.1.0", lifespan=lifespan)
198
 
199
+ # FIX v4.2: CORS origins configurable via env var; localhost only in dev
200
+ _extra_origins = os.environ.get("CORS_EXTRA_ORIGINS", "").split(",")
201
  ALLOWED_ORIGINS = [
202
  "https://clauseguardweb.netlify.app",
 
 
203
  ]
204
+ # Only add localhost origins if explicitly enabled via env
205
+ if os.environ.get("CORS_ALLOW_LOCALHOST", "").lower() == "true":
206
+ ALLOWED_ORIGINS.extend(["http://localhost:3000", "http://localhost:3001"])
207
+ ALLOWED_ORIGINS.extend([o.strip() for o in _extra_origins if o.strip()])
208
  app.add_middleware(
209
  CORSMiddleware,
210
  allow_origins=ALLOWED_ORIGINS,
app.py CHANGED
@@ -1,6 +1,15 @@
1
  """
2
- ClauseGuard — World's Best Legal Contract Analysis Tool (v4.1)
3
  ═══════════════════════════════════════════════════════════════
 
 
 
 
 
 
 
 
 
4
  Fixes in v4.1:
5
  • FIX: Bounded LRU caches (chunk_cache, prediction_cache) — no more memory leaks
6
  • FIX: NLI input format — pass (text_a, text_b) tuple, not [SEP]-concatenated string
@@ -44,6 +53,7 @@ import io
44
  import uuid
45
  import tempfile
46
  import hashlib
 
47
  from collections import defaultdict, OrderedDict
48
  from datetime import datetime
49
  from functools import lru_cache
@@ -80,6 +90,14 @@ try:
80
  except Exception:
81
  pass
82
 
 
 
 
 
 
 
 
 
83
  # ── Import submodules ───────────────────────────────────────────────
84
  from compare import compare_contracts, render_comparison_html
85
  from obligations import extract_obligations, render_obligations_html
@@ -142,7 +160,12 @@ _UNFAIR_LABELS = [
142
  "Jurisdiction", "Arbitration"
143
  ]
144
 
145
- _ALL_LABELS = CUAD_LABELS + _UNFAIR_LABELS
 
 
 
 
 
146
 
147
  RISK_MAP = {
148
  # Critical
@@ -198,6 +221,11 @@ RISK_MAP = {
198
  "Other": "LOW",
199
  "ROFR/ROFO/ROFN": "LOW",
200
  "Contract by using": "LOW",
 
 
 
 
 
201
  }
202
 
203
  DESC_MAP = {label: label.replace("_", " ") for label in _ALL_LABELS}
@@ -238,6 +266,11 @@ DESC_MAP.update({
238
  "Irrevocable or Perpetual License": "License that cannot be revoked or lasts indefinitely.",
239
  "Unlimited/All-You-Can-Eat License": "License with no usage limits.",
240
  "Notice Period to Terminate Renewal": "Required notice period before automatic renewal.",
 
 
 
 
 
241
  })
242
 
243
  RISK_WEIGHTS = {"CRITICAL": 40, "HIGH": 20, "MEDIUM": 10, "LOW": 3}
@@ -267,31 +300,39 @@ for _i in range(41):
267
  # ═══════════════════════════════════════════════════════════════════════
268
 
269
  class BoundedCache:
270
- """Thread-safe bounded LRU cache using OrderedDict."""
 
 
 
271
  def __init__(self, maxsize=1000):
272
  self._cache = OrderedDict()
273
  self._maxsize = maxsize
 
274
 
275
  def get(self, key, default=None):
276
- if key in self._cache:
277
- self._cache.move_to_end(key)
278
- return self._cache[key]
279
- return default
 
280
 
281
  def put(self, key, value):
282
- if key in self._cache:
283
- self._cache.move_to_end(key)
284
- self._cache[key] = value
285
- else:
286
- if len(self._cache) >= self._maxsize:
287
- self._cache.popitem(last=False)
288
- self._cache[key] = value
 
289
 
290
  def __contains__(self, key):
291
- return key in self._cache
 
292
 
293
  def __len__(self):
294
- return len(self._cache)
 
295
 
296
 
297
  # ═══════════════════════════════════════════════════════════════════════
@@ -301,7 +342,7 @@ class BoundedCache:
301
  cuad_tokenizer = None
302
  cuad_model = None
303
  ner_pipeline = None
304
- nli_pipeline = None
305
  _model_status = {"cuad": "not_loaded", "ner": "not_loaded", "nli": "not_loaded"}
306
 
307
  def _load_cuad_model():
@@ -349,20 +390,16 @@ def _load_ner_model():
349
  _model_status["ner"] = f"failed: {e}"
350
 
351
  def _load_nli_model():
352
- global nli_pipeline, _model_status, _HAS_NLI_MODEL
353
- if not _HAS_TORCH:
354
- _model_status["nli"] = "unavailable"
355
  return
356
  try:
357
- print("[ClauseGuard] Loading NLI model: cross-encoder/nli-deberta-v3-base")
358
- nli_pipeline = pipeline(
359
- "text-classification",
360
- model="cross-encoder/nli-deberta-v3-base",
361
- device=-1,
362
- )
363
  _HAS_NLI_MODEL = True
364
  _model_status["nli"] = "loaded"
365
- print("[ClauseGuard] NLI model loaded successfully")
366
  except Exception as e:
367
  print(f"[ClauseGuard] NLI model load failed (using heuristic fallback): {e}")
368
  _model_status["nli"] = f"failed: {e}"
@@ -430,6 +467,18 @@ def parse_document(file_path):
430
 
431
  _chunk_cache = BoundedCache(maxsize=500)
432
 
 
 
 
 
 
 
 
 
 
 
 
 
433
  def split_clauses(text):
434
  """Deterministic, structure-aware clause splitting.
435
  Same input ALWAYS produces same output. Normalized text is hashed
@@ -443,18 +492,7 @@ def split_clauses(text):
443
  text = re.sub(r'\n{3,}', '\n\n', text.strip())
444
 
445
  # First try to detect numbered sections (1., 2., 3.1, (a), etc.)
446
- section_pattern = re.compile(
447
- r'(?:^|\n\n)'
448
- r'(?='
449
- r'\d+(?:\.\d+)*[.)]\s' # 1. 2. 3.1. 3.1)
450
- r'|[A-Z]{2,}[A-Z\s]*\n' # ALL CAPS HEADERS
451
- r'|\([a-z]\)\s' # (a) (b) (c)
452
- r'|(?:Section|Article|Clause)\s+\d+' # Section 1, Article 2
453
- r')',
454
- re.MULTILINE
455
- )
456
-
457
- positions = [m.start() for m in section_pattern.finditer(text)]
458
 
459
  if len(positions) >= 3:
460
  clauses = []
@@ -688,14 +726,19 @@ _REGEX_PATTERNS = {
688
  "Price Restriction": [r"price.*(?:restriction|limitation|ceiling|cap|floor)", r"(?:shall|may).*not.*(?:increase|raise|exceed).*price"],
689
  }
690
 
 
 
 
 
 
691
  def _classify_regex(text):
692
  """Regex fallback — returns pattern match, NOT fake confidence."""
693
  text_lower = text.lower()
694
  results = []
695
  seen = set()
696
- for label, patterns in _REGEX_PATTERNS.items():
697
  for pat in patterns:
698
- if re.search(pat, text_lower):
699
  if label not in seen:
700
  risk = RISK_MAP.get(label, "MEDIUM")
701
  results.append({
@@ -816,29 +859,21 @@ def _extract_entities_regex(text):
816
  # ═══════════════════════════════════════════════════════════════════════
817
 
818
  def _run_nli(text_a, text_b):
819
- """Run NLI pipeline with correct input format for cross-encoder.
820
- FIX v4.1: cross-encoder expects {'text': a, 'text_pair': b} or a dict,
821
- but the HF pipeline for text-classification with cross-encoder accepts
822
- a dict input: {"text": text_a, "text_pair": text_b}.
823
- The simplest correct way is to pass them as a list of dicts."""
824
  try:
825
- # The cross-encoder/nli-deberta-v3-base pipeline expects two texts.
826
- # Passing as a dict with text and text_pair is the correct format.
827
- result = nli_pipeline(
828
- {"text": text_a[:256], "text_pair": text_b[:256]},
829
- truncation=True,
830
- )
831
- return result
832
- except Exception:
833
- # Some pipeline versions accept positional (text, text_pair) as tuple
834
- try:
835
- return nli_pipeline(
836
- text_a[:256],
837
- text_pair=text_b[:256],
838
- truncation=True,
839
- )
840
- except Exception:
841
- return None
842
 
843
 
844
  def detect_contradictions(clause_results, raw_text=""):
@@ -857,7 +892,7 @@ def detect_contradictions(clause_results, raw_text=""):
857
  clause_texts_by_label[cr["label"]].append(cr.get("text", ""))
858
 
859
  # ── 1. Semantic NLI (if model available) ──
860
- if _HAS_NLI_MODEL and nli_pipeline is not None:
861
  conflict_pairs = [
862
  ("Uncapped Liability", "Cap on Liability",
863
  "Liability cannot be both uncapped and capped simultaneously."),
 
1
  """
2
+ ClauseGuard — World's Best Legal Contract Analysis Tool (v4.2)
3
  ═══════════════════════════════════════════════════════════════
4
+ Fixes in v4.2:
5
+ • FIX: NLI now uses CrossEncoder.predict() — contradictions actually work
6
+ • FIX: BoundedCache uses threading.RLock — no more race conditions
7
+ • FIX: Pre-compiled ALL regex patterns at module level (perf)
8
+ • FIX: Added missing regex labels to RISK_MAP/DESC_MAP
9
+ • FIX: Extension risk formula matches backend
10
+ • FIX: Extension API_BASE URL corrected
11
+ • FIX: API CORS localhost requires explicit opt-in
12
+
13
  Fixes in v4.1:
14
  • FIX: Bounded LRU caches (chunk_cache, prediction_cache) — no more memory leaks
15
  • FIX: NLI input format — pass (text_a, text_b) tuple, not [SEP]-concatenated string
 
53
  import uuid
54
  import tempfile
55
  import hashlib
56
+ import threading
57
  from collections import defaultdict, OrderedDict
58
  from datetime import datetime
59
  from functools import lru_cache
 
90
  except Exception:
91
  pass
92
 
93
+ # ── CrossEncoder for NLI (soft-fail) ──────────────────────────────────
94
+ _HAS_CROSS_ENCODER = False
95
+ try:
96
+ from sentence_transformers import CrossEncoder as _CrossEncoder
97
+ _HAS_CROSS_ENCODER = True
98
+ except ImportError:
99
+ pass
100
+
101
  # ── Import submodules ───────────────────────────────────────────────
102
  from compare import compare_contracts, render_comparison_html
103
  from obligations import extract_obligations, render_obligations_html
 
160
  "Jurisdiction", "Arbitration"
161
  ]
162
 
163
+ # FIX v4.2: Include regex-only labels that aren't in CUAD or Unfair lists
164
+ _EXTRA_REGEX_LABELS = [
165
+ "Indemnification", "Confidentiality", "Force Majeure", "Penalties"
166
+ ]
167
+
168
+ _ALL_LABELS = CUAD_LABELS + _UNFAIR_LABELS + _EXTRA_REGEX_LABELS
169
 
170
  RISK_MAP = {
171
  # Critical
 
221
  "Other": "LOW",
222
  "ROFR/ROFO/ROFN": "LOW",
223
  "Contract by using": "LOW",
224
+ # FIX v4.2: Added regex-only labels that were missing from RISK_MAP
225
+ "Indemnification": "HIGH",
226
+ "Confidentiality": "MEDIUM",
227
+ "Force Majeure": "LOW",
228
+ "Penalties": "HIGH",
229
  }
230
 
231
  DESC_MAP = {label: label.replace("_", " ") for label in _ALL_LABELS}
 
266
  "Irrevocable or Perpetual License": "License that cannot be revoked or lasts indefinitely.",
267
  "Unlimited/All-You-Can-Eat License": "License with no usage limits.",
268
  "Notice Period to Terminate Renewal": "Required notice period before automatic renewal.",
269
+ # FIX v4.2: Added descriptions for regex-only labels
270
+ "Indemnification": "Obligation to compensate the other party for losses or damages.",
271
+ "Confidentiality": "Restrictions on sharing proprietary or sensitive information.",
272
+ "Force Majeure": "Excuses performance due to extraordinary events beyond control.",
273
+ "Penalties": "Financial penalties for breach or late performance.",
274
  })
275
 
276
  RISK_WEIGHTS = {"CRITICAL": 40, "HIGH": 20, "MEDIUM": 10, "LOW": 3}
 
300
  # ═══════════════════════════════════════════════════════════════════════
301
 
302
class BoundedCache:
    """Bounded LRU cache, safe for concurrent use.

    Backed by an OrderedDict (insertion order doubles as recency order) and
    guarded by a threading.RLock: the compound OrderedDict operations
    (membership test + setitem + move_to_end + popitem) are not atomic even
    under the GIL, so every public method takes the lock.
    """

    def __init__(self, maxsize=1000):
        self._cache = OrderedDict()
        self._maxsize = maxsize
        self._lock = threading.RLock()

    def get(self, key, default=None):
        """Return the cached value for *key* (marking it most-recent), else *default*."""
        with self._lock:
            try:
                value = self._cache[key]
            except KeyError:
                return default
            # Hit: refresh recency by moving the entry to the MRU end.
            self._cache.move_to_end(key)
            return value

    def put(self, key, value):
        """Insert or update *key*, evicting the least-recently-used entry if full."""
        with self._lock:
            # Eviction is only needed when inserting a brand-new key at capacity.
            if key not in self._cache and len(self._cache) >= self._maxsize:
                self._cache.popitem(last=False)  # drop LRU (front of the OrderedDict)
            self._cache[key] = value
            self._cache.move_to_end(key)  # new or updated entry becomes MRU

    def __contains__(self, key):
        with self._lock:
            return key in self._cache

    def __len__(self):
        with self._lock:
            return len(self._cache)
336
 
337
 
338
  # ═══════════════════════════════════════════════════════════════════════
 
342
  cuad_tokenizer = None
343
  cuad_model = None
344
  ner_pipeline = None
345
+ nli_model = None # FIX v4.2: CrossEncoder instead of pipeline
346
  _model_status = {"cuad": "not_loaded", "ner": "not_loaded", "nli": "not_loaded"}
347
 
348
  def _load_cuad_model():
 
390
  _model_status["ner"] = f"failed: {e}"
391
 
392
def _load_nli_model():
    """Load the NLI CrossEncoder, updating module globals and `_model_status`.

    Soft-fails: if sentence-transformers is missing or loading raises, the
    status string records why and the caller falls back to heuristics.
    """
    global nli_model, _model_status, _HAS_NLI_MODEL
    if not _HAS_CROSS_ENCODER:
        # sentence-transformers was never imported; leave nli_model as None.
        _model_status["nli"] = "unavailable (sentence-transformers not installed)"
        return
    try:
        print("[ClauseGuard] Loading NLI model: cross-encoder/nli-deberta-v3-base (CrossEncoder)")
        nli_model = _CrossEncoder("cross-encoder/nli-deberta-v3-base")
        _HAS_NLI_MODEL = True
        _model_status["nli"] = "loaded"
        print("[ClauseGuard] NLI CrossEncoder loaded successfully")
    except Exception as exc:
        print(f"[ClauseGuard] NLI model load failed (using heuristic fallback): {exc}")
        _model_status["nli"] = f"failed: {exc}"
 
467
 
468
  _chunk_cache = BoundedCache(maxsize=500)
469
 
470
+ # FIX v4.2: Pre-compile section pattern at module level (was recompiling per call)
471
+ _SECTION_PATTERN = re.compile(
472
+ r'(?:^|\n\n)'
473
+ r'(?='
474
+ r'\d+(?:\.\d+)*[.)]\s' # 1. 2. 3.1. 3.1)
475
+ r'|[A-Z]{2,}[A-Z\s]*\n' # ALL CAPS HEADERS
476
+ r'|\([a-z]\)\s' # (a) (b) (c)
477
+ r'|(?:Section|Article|Clause)\s+\d+' # Section 1, Article 2
478
+ r')',
479
+ re.MULTILINE
480
+ )
481
+
482
  def split_clauses(text):
483
  """Deterministic, structure-aware clause splitting.
484
  Same input ALWAYS produces same output. Normalized text is hashed
 
492
  text = re.sub(r'\n{3,}', '\n\n', text.strip())
493
 
494
  # First try to detect numbered sections (1., 2., 3.1, (a), etc.)
495
+ positions = [m.start() for m in _SECTION_PATTERN.finditer(text)]
 
 
 
 
 
 
 
 
 
 
 
496
 
497
  if len(positions) >= 3:
498
  clauses = []
 
726
  "Price Restriction": [r"price.*(?:restriction|limitation|ceiling|cap|floor)", r"(?:shall|may).*not.*(?:increase|raise|exceed).*price"],
727
  }
728
 
729
+ # FIX v4.2: Pre-compile regex patterns at module level (was recompiling per call)
730
+ _REGEX_PATTERNS_COMPILED = {}
731
+ for _label, _pats in _REGEX_PATTERNS.items():
732
+ _REGEX_PATTERNS_COMPILED[_label] = [re.compile(p, re.IGNORECASE) for p in _pats]
733
+
734
  def _classify_regex(text):
735
  """Regex fallback — returns pattern match, NOT fake confidence."""
736
  text_lower = text.lower()
737
  results = []
738
  seen = set()
739
+ for label, patterns in _REGEX_PATTERNS_COMPILED.items():
740
  for pat in patterns:
741
+ if pat.search(text_lower):
742
  if label not in seen:
743
  risk = RISK_MAP.get(label, "MEDIUM")
744
  results.append({
 
859
  # ═══════════════════════════════════════════════════════════════════════
860
 
861
  def _run_nli(text_a, text_b):
862
+ """Run NLI using CrossEncoder with correct input format.
863
+ FIX v4.2: Use sentence_transformers.CrossEncoder.predict() which accepts
864
+ a list of (text_a, text_b) tuples. Returns scores for [contradiction, entailment, neutral].
865
+ The old code used pipeline("text-classification") with dict input, which was broken."""
 
866
  try:
867
+ # CrossEncoder.predict returns numpy array of shape (n_pairs, 3)
868
+ # Columns: [contradiction, entailment, neutral]
869
+ scores = nli_model.predict([(text_a[:256], text_b[:256])])
870
+ label_mapping = ["contradiction", "entailment", "neutral"]
871
+ top_idx = int(scores[0].argmax())
872
+ top_score = float(scores[0][top_idx])
873
+ return [{"label": label_mapping[top_idx], "score": top_score}]
874
+ except Exception as e:
875
+ print(f"[ClauseGuard] NLI inference error: {e}")
876
+ return None
 
 
 
 
 
 
 
877
 
878
 
879
  def detect_contradictions(clause_results, raw_text=""):
 
892
  clause_texts_by_label[cr["label"]].append(cr.get("text", ""))
893
 
894
  # ── 1. Semantic NLI (if model available) ──
895
+ if _HAS_NLI_MODEL and nli_model is not None:
896
  conflict_pairs = [
897
  ("Uncapped Liability", "Cap on Liability",
898
  "Liability cannot be both uncapped and capped simultaneously."),
compare.py CHANGED
@@ -28,7 +28,7 @@ def _load_embedder():
28
  global _embedder
29
  if _HAS_EMBEDDINGS and _embedder is None:
30
  try:
31
- _embedder = SentenceTransformer("all-MiniLM-L6-v2")
32
  print("[ClauseGuard] Sentence embeddings loaded for comparison")
33
  except Exception as e:
34
  print(f"[ClauseGuard] Embeddings not available: {e}")
 
28
  global _embedder
29
  if _HAS_EMBEDDINGS and _embedder is None:
30
  try:
31
+ _embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
32
  print("[ClauseGuard] Sentence embeddings loaded for comparison")
33
  except Exception as e:
34
  print(f"[ClauseGuard] Embeddings not available: {e}")
compliance.py CHANGED
@@ -23,6 +23,9 @@ _NEGATION_PATTERNS = [
23
  r"notwithstanding.*(?:shall\s+not|does\s+not|is\s+not)",
24
  ]
25
 
 
 
 
26
  # Regulatory requirement definitions
27
  REGULATIONS = {
28
  "GDPR": {
@@ -214,13 +217,13 @@ def _check_negation(text_lower, keyword, window=200):
214
  wider_context = text_lower[start:end]
215
 
216
  # Check sentence first (higher confidence)
217
- for neg_pat in _NEGATION_PATTERNS:
218
- if re.search(neg_pat, sentence, re.IGNORECASE):
219
  return True
220
 
221
  # Then check wider window (lower confidence, still relevant)
222
- for neg_pat in _NEGATION_PATTERNS[:4]: # Only strong negation patterns for wider window
223
- if re.search(neg_pat, wider_context, re.IGNORECASE):
224
  return True
225
 
226
  return False
 
23
  r"notwithstanding.*(?:shall\s+not|does\s+not|is\s+not)",
24
  ]
25
 
26
+ # FIX v4.2: Pre-compile negation patterns at module level
27
+ _NEGATION_PATTERNS_COMPILED = [re.compile(p, re.IGNORECASE) for p in _NEGATION_PATTERNS]
28
+
29
  # Regulatory requirement definitions
30
  REGULATIONS = {
31
  "GDPR": {
 
217
  wider_context = text_lower[start:end]
218
 
219
  # Check sentence first (higher confidence)
220
+ for neg_pat in _NEGATION_PATTERNS_COMPILED:
221
+ if neg_pat.search(sentence):
222
  return True
223
 
224
  # Then check wider window (lower confidence, still relevant)
225
+ for neg_pat in _NEGATION_PATTERNS_COMPILED[:4]: # Only strong negation patterns for wider window
226
+ if neg_pat.search(wider_context):
227
  return True
228
 
229
  return False
extension/background.js CHANGED
@@ -4,7 +4,8 @@
4
  * FIXED: Error handling and retry logic
5
  */
6
 
7
- const API_BASE = "https://gaurv007-clauseguard-api.hf.space";
 
8
  const FREE_SCANS_PER_MONTH = 10;
9
  const API_TIMEOUT_MS = 45000;
10
 
@@ -181,13 +182,19 @@ function localAnalyze(text) {
181
  });
182
 
183
  const flagged = results.filter(r => r.categories.length > 0);
184
- const sev = { HIGH: 0, MEDIUM: 0, LOW: 0 };
185
- flagged.forEach(r => r.categories.forEach(c => sev[c.severity]++));
186
- const risk = Math.min(100, Math.round((sev.HIGH*20 + sev.MEDIUM*10 + sev.LOW*5) / Math.max(1, clauses.length) * 100));
 
 
 
 
 
 
187
 
188
  return {
189
  risk_score: risk,
190
- grade: risk >= 60 ? "F" : risk >= 40 ? "D" : risk >= 20 ? "C" : risk >= 10 ? "B" : "A",
191
  total_clauses: clauses.length, flagged_count: flagged.length, results,
192
  };
193
  }
 
4
  * FIXED: Error handling and retry logic
5
  */
6
 
7
+ // FIX v4.2: Corrected API_BASE URL to match the actual Gradio Space
8
+ const API_BASE = "https://gaurv007-clauseguard.hf.space";
9
  const FREE_SCANS_PER_MONTH = 10;
10
  const API_TIMEOUT_MS = 45000;
11
 
 
182
  });
183
 
184
  const flagged = results.filter(r => r.categories.length > 0);
185
+ const sev = { CRITICAL: 0, HIGH: 0, MEDIUM: 0, LOW: 0 };
186
+ flagged.forEach(r => r.categories.forEach(c => {
187
+ if (sev.hasOwnProperty(c.severity)) sev[c.severity]++;
188
+ else sev.MEDIUM++; // default for unknown severity
189
+ }));
190
+ // FIX v4.2: Use the same diminishing-returns formula as the backend (app.py)
191
+ // instead of normalizing by clause count (which gave different scores)
192
+ const weighted = sev.CRITICAL*40 + sev.HIGH*20 + sev.MEDIUM*10 + sev.LOW*3;
193
+ const risk = Math.min(100, Math.round(100 * (1 - (1 / (1 + weighted / 30)))));
194
 
195
  return {
196
  risk_score: risk,
197
+ grade: risk >= 70 ? "F" : risk >= 50 ? "D" : risk >= 30 ? "C" : risk >= 15 ? "B" : "A",
198
  total_clauses: clauses.length, flagged_count: flagged.length, results,
199
  };
200
  }
obligations.py CHANGED
@@ -85,11 +85,26 @@ _PRIORITY_MAP = {
85
  "delivery": 1,
86
  }
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  def _is_false_positive(sentence):
90
  """Check if a sentence is a common false positive (definition/interpretation, not obligation)."""
91
- for fp in _FALSE_POSITIVE_PATTERNS:
92
- if re.search(fp, sentence, re.IGNORECASE):
93
  return True
94
  return False
95
 
@@ -111,9 +126,9 @@ def extract_obligations(text):
111
  continue
112
 
113
  found_types = set()
114
- for otype, patterns in OBLIGATION_PATTERNS.items():
115
  for pat in patterns:
116
- if re.search(pat, sentence, re.IGNORECASE):
117
  found_types.add(otype)
118
  break
119
 
@@ -128,8 +143,8 @@ def extract_obligations(text):
128
  party = obligation_direction
129
  else:
130
  # Fallback to pattern matching within the sentence
131
- for pp in PARTY_PATTERNS:
132
- m = re.search(pp, sentence)
133
  if m:
134
  candidate = m.group(0).strip()
135
  # Fix 8: Reject party strings >40 chars (header bleed-through)
@@ -140,8 +155,8 @@ def extract_obligations(text):
140
  # Extract timeframe
141
  deadline = "Not specified"
142
  deadline_urgency = 0
143
- for pat, ptype in TIME_PATTERNS:
144
- m = re.search(pat, sentence, re.IGNORECASE)
145
  if m:
146
  if ptype == "relative":
147
  num = m.group(1)
 
85
  "delivery": 1,
86
  }
87
 
88
+ # FIX v4.2: Pre-compile obligation patterns at module level (was recompiling per sentence)
89
+ _OBLIGATION_PATTERNS_COMPILED = {
90
+ otype: [re.compile(p, re.IGNORECASE) for p in patterns]
91
+ for otype, patterns in OBLIGATION_PATTERNS.items()
92
+ }
93
+
94
+ # FIX v4.2: Pre-compile false positive patterns
95
+ _FALSE_POSITIVE_PATTERNS_COMPILED = [re.compile(p, re.IGNORECASE) for p in _FALSE_POSITIVE_PATTERNS]
96
+
97
+ # FIX v4.2: Pre-compile time patterns
98
+ _TIME_PATTERNS_COMPILED = [(re.compile(p, re.IGNORECASE), ptype) for p, ptype in TIME_PATTERNS]
99
+
100
+ # FIX v4.2: Pre-compile party patterns
101
+ _PARTY_PATTERNS_COMPILED = [re.compile(p) for p in PARTY_PATTERNS]
102
+
103
 
104
  def _is_false_positive(sentence):
105
  """Check if a sentence is a common false positive (definition/interpretation, not obligation)."""
106
+ for fp in _FALSE_POSITIVE_PATTERNS_COMPILED:
107
+ if fp.search(sentence):
108
  return True
109
  return False
110
 
 
126
  continue
127
 
128
  found_types = set()
129
+ for otype, patterns in _OBLIGATION_PATTERNS_COMPILED.items():
130
  for pat in patterns:
131
+ if pat.search(sentence):
132
  found_types.add(otype)
133
  break
134
 
 
143
  party = obligation_direction
144
  else:
145
  # Fallback to pattern matching within the sentence
146
+ for pp in _PARTY_PATTERNS_COMPILED:
147
+ m = pp.search(sentence)
148
  if m:
149
  candidate = m.group(0).strip()
150
  # Fix 8: Reject party strings >40 chars (header bleed-through)
 
155
  # Extract timeframe
156
  deadline = "Not specified"
157
  deadline_urgency = 0
158
+ for pat, ptype in _TIME_PATTERNS_COMPILED:
159
+ m = pat.search(sentence)
160
  if m:
161
  if ptype == "relative":
162
  num = m.group(1)