Spaces:

gaurv007
/

ClauseGuard

Running

gaurv007 committed 12 days ago

Commit

f4ccb3e

1 Parent(s): 79c33ca

🔧 v4.2: Critical bug fixes + performance optimizations (7 bugs, 4 perf improvements) (#3)

- v4.2: Update obligations.py (11d6a4f623ebb141d0aa0499471a49d4d8cb11cf)
- v4.2: Update app.py (a61dcf1b6d7c3cb55fc63affd32944cbc9e8d4bd)
- v4.2: Update extension/background.js (15e2d6a4cd62ce71491354cbf1321b4885abbf98)
- v4.2: Update compare.py (9bd2e1c8bd97232894e32b59cd6c7ef89fba9bfc)
- v4.2: Update compliance.py (b16b7fae913f907d196c1fdc1a89b7b152a161e7)
- v4.2: Update api/main.py (376db46363624c3463364ad7b30549ad4851670e)
- v4.2: Update README.md (3c7bc996f7c39c2769899fc46658148e838457f0)

Files changed (7) hide show

README.md +13 -2
api/main.py +18 -11
app.py +99 -64
compare.py +1 -1
compliance.py +7 -4
extension/background.js +12 -5
obligations.py +23 -8

README.md CHANGED Viewed

@@ -10,11 +10,22 @@ app_file: app.py
 pinned: false
 ---
-# 🛡️ ClauseGuard v4.0 — World's Best Open-Source Legal Contract Analysis
 **ClauseGuard** is the most comprehensive open-source AI-powered legal contract analysis tool. It analyzes contracts using state-of-the-art legal NLP models and provides actionable risk assessments, Q&A chatbot, clause redlining, and OCR for scanned PDFs.
-## 🆕 What's New in v4.0
 | Feature | Description |
 |---------|-------------|

 pinned: false
 ---
+# 🛡️ ClauseGuard v4.2 — World's Best Open-Source Legal Contract Analysis
 **ClauseGuard** is the most comprehensive open-source AI-powered legal contract analysis tool. It analyzes contracts using state-of-the-art legal NLP models and provides actionable risk assessments, Q&A chatbot, clause redlining, and OCR for scanned PDFs.
+## 🆕 What's New in v4.2
+| Feature | Description |
+|---------|-------------|
+| **🔧 NLI Fix** | Fixed contradiction detection — now uses `CrossEncoder.predict()` instead of broken `pipeline("text-classification")` dict input. Contradictions actually work now. |
+| **🔒 Thread Safety** | `BoundedCache` now uses `threading.RLock` to prevent race conditions under concurrent Gradio requests |
+| **⚡ Pre-compiled Regex** | All regex patterns (clause classification, obligations, compliance negation) are pre-compiled once at module level — removes thousands of per-call pattern lookups in Python's `re` cache (and recompilation whenever that cache overflows) |
+| **🔗 Extension Fix** | Chrome extension risk formula now matches backend (diminishing returns, not normalized by doc length). Fixed API_BASE URL. |
+| **🏷️ Label Coverage** | Added missing regex-only labels (Indemnification, Confidentiality, Force Majeure, Penalties) to RISK_MAP and DESC_MAP |
+| **🛡️ Security** | API CORS localhost origins now require explicit opt-in via `CORS_ALLOW_LOCALHOST=true` env var; additional origins can be supplied as a comma-separated list in `CORS_EXTRA_ORIGINS` |
+### Previous: v4.0
 | Feature | Description |
 |---------|-------------|

api/main.py CHANGED Viewed

@@ -58,8 +58,9 @@ HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
 SAULLM_ENDPOINT = os.environ.get("SAULLM_ENDPOINT", "")
 MAX_TEXT_LENGTH = int(os.environ.get("MAX_TEXT_LENGTH", "200000"))
-# ─── FIX v4.1: Sliding window rate limiter with proper IP extraction ───
 _rate_limits: dict[str, list[float]] = {}
 RATE_LIMIT_REQUESTS = 30
 RATE_LIMIT_WINDOW = 60  # seconds
@@ -71,8 +72,17 @@ def _get_client_ip(request: Request) -> str:
     return request.client.host if request.client else "unknown"
 def _check_rate_limit(client_ip: str) -> bool:
-    """Sliding window rate limiter."""
     now = time.time()
     if client_ip not in _rate_limits:
         _rate_limits[client_ip] = []
@@ -85,13 +95,6 @@ def _check_rate_limit(client_ip: str) -> bool:
         return False
     _rate_limits[client_ip].append(now)
-    # Periodic cleanup of stale IPs (every 100 requests)
-    if len(_rate_limits) > 1000:
-        stale = [ip for ip, ts in _rate_limits.items() if not ts or now - ts[-1] > RATE_LIMIT_WINDOW * 2]
-        for ip in stale:
-            del _rate_limits[ip]
     return True
 # ─── Supabase helper ───
@@ -193,11 +196,15 @@ async def lifespan(app: FastAPI):
 app = FastAPI(title="ClauseGuard API", version="4.1.0", lifespan=lifespan)
 ALLOWED_ORIGINS = [
     "https://clauseguardweb.netlify.app",
-    "http://localhost:3000",
-    "http://localhost:3001",
 ]
 app.add_middleware(
     CORSMiddleware,
     allow_origins=ALLOWED_ORIGINS,

 SAULLM_ENDPOINT = os.environ.get("SAULLM_ENDPOINT", "")
 MAX_TEXT_LENGTH = int(os.environ.get("MAX_TEXT_LENGTH", "200000"))
+# ─── FIX v4.2: Improved sliding window rate limiter with periodic cleanup ───
 _rate_limits: dict[str, list[float]] = {}
+_rate_limits_last_cleanup: float = 0.0
 RATE_LIMIT_REQUESTS = 30
 RATE_LIMIT_WINDOW = 60  # seconds
     return request.client.host if request.client else "unknown"
 def _check_rate_limit(client_ip: str) -> bool:
+    """Sliding window rate limiter with periodic stale-IP cleanup."""
+    global _rate_limits_last_cleanup
     now = time.time()
+    # FIX v4.2: Periodic cleanup every 60s regardless of dict size
+    if now - _rate_limits_last_cleanup > 60:
+        stale = [ip for ip, ts in _rate_limits.items() if not ts or now - ts[-1] > RATE_LIMIT_WINDOW * 2]
+        for ip in stale:
+            del _rate_limits[ip]
+        _rate_limits_last_cleanup = now
     if client_ip not in _rate_limits:
         _rate_limits[client_ip] = []
         return False
     _rate_limits[client_ip].append(now)
     return True
 # ─── Supabase helper ───
 app = FastAPI(title="ClauseGuard API", version="4.1.0", lifespan=lifespan)
+# FIX v4.2: extra CORS origins come from CORS_EXTRA_ORIGINS (comma-separated);
+# localhost origins are added only when CORS_ALLOW_LOCALHOST=true
+_extra_origins = os.environ.get("CORS_EXTRA_ORIGINS", "").split(",")
 ALLOWED_ORIGINS = [
     "https://clauseguardweb.netlify.app",
 ]
+# Only add localhost origins if explicitly enabled via env
+if os.environ.get("CORS_ALLOW_LOCALHOST", "").lower() == "true":
+    ALLOWED_ORIGINS.extend(["http://localhost:3000", "http://localhost:3001"])
+ALLOWED_ORIGINS.extend([o.strip() for o in _extra_origins if o.strip()])
 app.add_middleware(
     CORSMiddleware,
     allow_origins=ALLOWED_ORIGINS,

app.py CHANGED Viewed

@@ -1,6 +1,15 @@
 """
-ClauseGuard — World's Best Legal Contract Analysis Tool (v4.1)
 ═══════════════════════════════════════════════════════════════
 Fixes in v4.1:
   • FIX: Bounded LRU caches (chunk_cache, prediction_cache) — no more memory leaks
   • FIX: NLI input format — pass (text_a, text_b) tuple, not [SEP]-concatenated string
@@ -44,6 +53,7 @@ import io
 import uuid
 import tempfile
 import hashlib
 from collections import defaultdict, OrderedDict
 from datetime import datetime
 from functools import lru_cache
@@ -80,6 +90,14 @@ try:
 except Exception:
     pass
 # ── Import submodules ───────────────────────────────────────────────
 from compare import compare_contracts, render_comparison_html
 from obligations import extract_obligations, render_obligations_html
@@ -142,7 +160,12 @@ _UNFAIR_LABELS = [
     "Jurisdiction", "Arbitration"
 ]
-_ALL_LABELS = CUAD_LABELS + _UNFAIR_LABELS
 RISK_MAP = {
     # Critical
@@ -198,6 +221,11 @@ RISK_MAP = {
     "Other": "LOW",
     "ROFR/ROFO/ROFN": "LOW",
     "Contract by using": "LOW",
 }
 DESC_MAP = {label: label.replace("_", " ") for label in _ALL_LABELS}
@@ -238,6 +266,11 @@ DESC_MAP.update({
     "Irrevocable or Perpetual License": "License that cannot be revoked or lasts indefinitely.",
     "Unlimited/All-You-Can-Eat License": "License with no usage limits.",
     "Notice Period to Terminate Renewal": "Required notice period before automatic renewal.",
 })
 RISK_WEIGHTS = {"CRITICAL": 40, "HIGH": 20, "MEDIUM": 10, "LOW": 3}
@@ -267,31 +300,39 @@ for _i in range(41):
 # ═══════════════════════════════════════════════════════════════════════
 class BoundedCache:
-    """Thread-safe bounded LRU cache using OrderedDict."""
     def __init__(self, maxsize=1000):
         self._cache = OrderedDict()
         self._maxsize = maxsize
     def get(self, key, default=None):
-        if key in self._cache:
-            self._cache.move_to_end(key)
-            return self._cache[key]
-        return default
     def put(self, key, value):
-        if key in self._cache:
-            self._cache.move_to_end(key)
-            self._cache[key] = value
-        else:
-            if len(self._cache) >= self._maxsize:
-                self._cache.popitem(last=False)
-            self._cache[key] = value
     def __contains__(self, key):
-        return key in self._cache
     def __len__(self):
-        return len(self._cache)
 # ═══════════════════════════════════════════════════════════════════════
@@ -301,7 +342,7 @@ class BoundedCache:
 cuad_tokenizer = None
 cuad_model = None
 ner_pipeline = None
-nli_pipeline = None
 _model_status = {"cuad": "not_loaded", "ner": "not_loaded", "nli": "not_loaded"}
 def _load_cuad_model():
@@ -349,20 +390,16 @@ def _load_ner_model():
         _model_status["ner"] = f"failed: {e}"
 def _load_nli_model():
-    global nli_pipeline, _model_status, _HAS_NLI_MODEL
-    if not _HAS_TORCH:
-        _model_status["nli"] = "unavailable"
         return
     try:
-        print("[ClauseGuard] Loading NLI model: cross-encoder/nli-deberta-v3-base")
-        nli_pipeline = pipeline(
-            "text-classification",
-            model="cross-encoder/nli-deberta-v3-base",
-            device=-1,
-        )
         _HAS_NLI_MODEL = True
         _model_status["nli"] = "loaded"
-        print("[ClauseGuard] NLI model loaded successfully")
     except Exception as e:
         print(f"[ClauseGuard] NLI model load failed (using heuristic fallback): {e}")
         _model_status["nli"] = f"failed: {e}"
@@ -430,6 +467,18 @@ def parse_document(file_path):
 _chunk_cache = BoundedCache(maxsize=500)
 def split_clauses(text):
     """Deterministic, structure-aware clause splitting.
     Same input ALWAYS produces same output. Normalized text is hashed
@@ -443,18 +492,7 @@ def split_clauses(text):
     text = re.sub(r'\n{3,}', '\n\n', text.strip())
     # First try to detect numbered sections (1., 2., 3.1, (a), etc.)
-    section_pattern = re.compile(
-        r'(?:^|\n\n)'
-        r'(?='
-        r'\d+(?:\.\d+)*[.)]\s'   # 1. 2. 3.1. 3.1)
-        r'|[A-Z]{2,}[A-Z\s]*\n'  # ALL CAPS HEADERS
-        r'|\([a-z]\)\s'           # (a) (b) (c)
-        r'|(?:Section|Article|Clause)\s+\d+'  # Section 1, Article 2
-        r')',
-        re.MULTILINE
-    )
-    positions = [m.start() for m in section_pattern.finditer(text)]
     if len(positions) >= 3:
         clauses = []
@@ -688,14 +726,19 @@ _REGEX_PATTERNS = {
     "Price Restriction": [r"price.*(?:restriction|limitation|ceiling|cap|floor)", r"(?:shall|may).*not.*(?:increase|raise|exceed).*price"],
 }
 def _classify_regex(text):
     """Regex fallback — returns pattern match, NOT fake confidence."""
     text_lower = text.lower()
     results = []
     seen = set()
-    for label, patterns in _REGEX_PATTERNS.items():
         for pat in patterns:
-            if re.search(pat, text_lower):
                 if label not in seen:
                     risk = RISK_MAP.get(label, "MEDIUM")
                     results.append({
@@ -816,29 +859,21 @@ def _extract_entities_regex(text):
 # ═══════════════════════════════════════════════════════════════════════
 def _run_nli(text_a, text_b):
-    """Run NLI pipeline with correct input format for cross-encoder.
-    FIX v4.1: cross-encoder expects {'text': a, 'text_pair': b} or a dict,
-    but the HF pipeline for text-classification with cross-encoder accepts
-    a dict input: {"text": text_a, "text_pair": text_b}.
-    The simplest correct way is to pass them as a list of dicts."""
     try:
-        # The cross-encoder/nli-deberta-v3-base pipeline expects two texts.
-        # Passing as a dict with text and text_pair is the correct format.
-        result = nli_pipeline(
-            {"text": text_a[:256], "text_pair": text_b[:256]},
-            truncation=True,
-        )
-        return result
-    except Exception:
-        # Some pipeline versions accept positional (text, text_pair) as tuple
-        try:
-            return nli_pipeline(
-                text_a[:256],
-                text_pair=text_b[:256],
-                truncation=True,
-            )
-        except Exception:
-            return None
 def detect_contradictions(clause_results, raw_text=""):
@@ -857,7 +892,7 @@ def detect_contradictions(clause_results, raw_text=""):
         clause_texts_by_label[cr["label"]].append(cr.get("text", ""))
     # ── 1. Semantic NLI (if model available) ──
-    if _HAS_NLI_MODEL and nli_pipeline is not None:
         conflict_pairs = [
             ("Uncapped Liability", "Cap on Liability",
              "Liability cannot be both uncapped and capped simultaneously."),

 """
+ClauseGuard — World's Best Legal Contract Analysis Tool (v4.2)
 ═══════════════════════════════════════════════════════════════
+Fixes in v4.2:
+  • FIX: NLI now uses CrossEncoder.predict() — contradictions actually work
+  • FIX: BoundedCache uses threading.RLock — no more race conditions
+  • FIX: Pre-compiled ALL regex patterns at module level (perf)
+  • FIX: Added missing regex labels to RISK_MAP/DESC_MAP
+  • FIX: Extension risk formula matches backend
+  • FIX: Extension API_BASE URL corrected
+  • FIX: API CORS localhost requires explicit opt-in
 Fixes in v4.1:
   • FIX: Bounded LRU caches (chunk_cache, prediction_cache) — no more memory leaks
   • FIX: NLI input format — pass (text_a, text_b) tuple, not [SEP]-concatenated string
 import uuid
 import tempfile
 import hashlib
+import threading
 from collections import defaultdict, OrderedDict
 from datetime import datetime
 from functools import lru_cache
 except Exception:
     pass
+# ── CrossEncoder for NLI (soft-fail) ──────────────────────────────────
+_HAS_CROSS_ENCODER = False
+try:
+    from sentence_transformers import CrossEncoder as _CrossEncoder
+    _HAS_CROSS_ENCODER = True
+except ImportError:
+    pass
 # ── Import submodules ───────────────────────────────────────────────
 from compare import compare_contracts, render_comparison_html
 from obligations import extract_obligations, render_obligations_html
     "Jurisdiction", "Arbitration"
 ]
+# FIX v4.2: Include regex-only labels that aren't in CUAD or Unfair lists
+_EXTRA_REGEX_LABELS = [
+    "Indemnification", "Confidentiality", "Force Majeure", "Penalties"
+]
+_ALL_LABELS = CUAD_LABELS + _UNFAIR_LABELS + _EXTRA_REGEX_LABELS
 RISK_MAP = {
     # Critical
     "Other": "LOW",
     "ROFR/ROFO/ROFN": "LOW",
     "Contract by using": "LOW",
+    # FIX v4.2: Added regex-only labels that were missing from RISK_MAP
+    "Indemnification": "HIGH",
+    "Confidentiality": "MEDIUM",
+    "Force Majeure": "LOW",
+    "Penalties": "HIGH",
 }
 DESC_MAP = {label: label.replace("_", " ") for label in _ALL_LABELS}
     "Irrevocable or Perpetual License": "License that cannot be revoked or lasts indefinitely.",
     "Unlimited/All-You-Can-Eat License": "License with no usage limits.",
     "Notice Period to Terminate Renewal": "Required notice period before automatic renewal.",
+    # FIX v4.2: Added descriptions for regex-only labels
+    "Indemnification": "Obligation to compensate the other party for losses or damages.",
+    "Confidentiality": "Restrictions on sharing proprietary or sensitive information.",
+    "Force Majeure": "Excuses performance due to extraordinary events beyond control.",
+    "Penalties": "Financial penalties for breach or late performance.",
 })
 RISK_WEIGHTS = {"CRITICAL": 40, "HIGH": 20, "MEDIUM": 10, "LOW": 3}
 # ═══════════════════════════════════════════════════════════════════════
 class BoundedCache:
+    """Thread-safe bounded LRU cache using OrderedDict + RLock.
+    FIX v4.2: Added threading.RLock to prevent race conditions under
+    Gradio's concurrent request handling. OrderedDict compound operations
+    (contains + setitem + move_to_end + popitem) are NOT atomic even with GIL."""
     def __init__(self, maxsize=1000):
         self._cache = OrderedDict()
         self._maxsize = maxsize
+        self._lock = threading.RLock()
     def get(self, key, default=None):
+        with self._lock:
+            if key in self._cache:
+                self._cache.move_to_end(key)
+                return self._cache[key]
+            return default
     def put(self, key, value):
+        with self._lock:
+            if key in self._cache:
+                self._cache.move_to_end(key)
+                self._cache[key] = value
+            else:
+                if len(self._cache) >= self._maxsize:
+                    self._cache.popitem(last=False)
+                self._cache[key] = value
     def __contains__(self, key):
+        with self._lock:
+            return key in self._cache
     def __len__(self):
+        with self._lock:
+            return len(self._cache)
 # ═══════════════════════════════════════════════════════════════════════
 cuad_tokenizer = None
 cuad_model = None
 ner_pipeline = None
+nli_model = None  # FIX v4.2: CrossEncoder instead of pipeline
 _model_status = {"cuad": "not_loaded", "ner": "not_loaded", "nli": "not_loaded"}
 def _load_cuad_model():
         _model_status["ner"] = f"failed: {e}"
 def _load_nli_model():
+    global nli_model, _model_status, _HAS_NLI_MODEL
+    if not _HAS_CROSS_ENCODER:
+        _model_status["nli"] = "unavailable (sentence-transformers not installed)"
         return
     try:
+        print("[ClauseGuard] Loading NLI model: cross-encoder/nli-deberta-v3-base (CrossEncoder)")
+        nli_model = _CrossEncoder("cross-encoder/nli-deberta-v3-base")
         _HAS_NLI_MODEL = True
         _model_status["nli"] = "loaded"
+        print("[ClauseGuard] NLI CrossEncoder loaded successfully")
     except Exception as e:
         print(f"[ClauseGuard] NLI model load failed (using heuristic fallback): {e}")
         _model_status["nli"] = f"failed: {e}"
 _chunk_cache = BoundedCache(maxsize=500)
+# FIX v4.2: Pre-compile section pattern at module level (previously rebuilt via re.compile() on every call)
+_SECTION_PATTERN = re.compile(
+    r'(?:^|\n\n)'
+    r'(?='
+    r'\d+(?:\.\d+)*[.)]\s'   # 1. 2. 3.1. 3.1)
+    r'|[A-Z]{2,}[A-Z\s]*\n'  # ALL CAPS HEADERS
+    r'|\([a-z]\)\s'           # (a) (b) (c)
+    r'|(?:Section|Article|Clause)\s+\d+'  # Section 1, Article 2
+    r')',
+    re.MULTILINE
+)
 def split_clauses(text):
     """Deterministic, structure-aware clause splitting.
     Same input ALWAYS produces same output. Normalized text is hashed
     text = re.sub(r'\n{3,}', '\n\n', text.strip())
     # First try to detect numbered sections (1., 2., 3.1, (a), etc.)
+    positions = [m.start() for m in _SECTION_PATTERN.finditer(text)]
     if len(positions) >= 3:
         clauses = []
     "Price Restriction": [r"price.*(?:restriction|limitation|ceiling|cap|floor)", r"(?:shall|may).*not.*(?:increase|raise|exceed).*price"],
 }
+# FIX v4.2: Pre-compile regex patterns at module level (previously each re.search() went through re's pattern cache on every call)
+_REGEX_PATTERNS_COMPILED = {}
+for _label, _pats in _REGEX_PATTERNS.items():
+    _REGEX_PATTERNS_COMPILED[_label] = [re.compile(p, re.IGNORECASE) for p in _pats]
 def _classify_regex(text):
     """Regex fallback — returns pattern match, NOT fake confidence."""
     text_lower = text.lower()
     results = []
     seen = set()
+    for label, patterns in _REGEX_PATTERNS_COMPILED.items():
         for pat in patterns:
+            if pat.search(text_lower):
                 if label not in seen:
                     risk = RISK_MAP.get(label, "MEDIUM")
                     results.append({
 # ═══════════════════════════════════════════════════════════════════════
 def _run_nli(text_a, text_b):
+    """Run NLI using CrossEncoder with correct input format.
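+    FIX v4.2: Use sentence_transformers.CrossEncoder.predict(), which accepts
+    a list of (text_a, text_b) tuples and yields one score per class in the order
+    [contradiction, entailment, neutral]. Each text is cut to its first 256 characters.
+    Returns a one-element list [{"label": ..., "score": ...}] holding the top-scoring
+    class, or None if inference raises.
+    NOTE(review): predict() is called without apply_softmax, so "score" is presumably
+    a raw logit rather than a probability — confirm before comparing it to a threshold.
+    The old code used pipeline("text-classification") with dict input, which was broken."""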
     try:
+        # CrossEncoder.predict returns numpy array of shape (n_pairs, 3)
+        # Columns: [contradiction, entailment, neutral]
+        scores = nli_model.predict([(text_a[:256], text_b[:256])])
+        label_mapping = ["contradiction", "entailment", "neutral"]
+        top_idx = int(scores[0].argmax())
+        top_score = float(scores[0][top_idx])
+        return [{"label": label_mapping[top_idx], "score": top_score}]
+    except Exception as e:
+        print(f"[ClauseGuard] NLI inference error: {e}")
+        return None
 def detect_contradictions(clause_results, raw_text=""):
         clause_texts_by_label[cr["label"]].append(cr.get("text", ""))
     # ── 1. Semantic NLI (if model available) ──
+    if _HAS_NLI_MODEL and nli_model is not None:
         conflict_pairs = [
             ("Uncapped Liability", "Cap on Liability",
              "Liability cannot be both uncapped and capped simultaneously."),

compare.py CHANGED Viewed

@@ -28,7 +28,7 @@ def _load_embedder():
     global _embedder
     if _HAS_EMBEDDINGS and _embedder is None:
         try:
-            _embedder = SentenceTransformer("all-MiniLM-L6-v2")
             print("[ClauseGuard] Sentence embeddings loaded for comparison")
         except Exception as e:
             print(f"[ClauseGuard] Embeddings not available: {e}")

     global _embedder
     if _HAS_EMBEDDINGS and _embedder is None:
         try:
+            _embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
             print("[ClauseGuard] Sentence embeddings loaded for comparison")
         except Exception as e:
             print(f"[ClauseGuard] Embeddings not available: {e}")

compliance.py CHANGED Viewed

@@ -23,6 +23,9 @@ _NEGATION_PATTERNS = [
     r"notwithstanding.*(?:shall\s+not|does\s+not|is\s+not)",
 ]
 # Regulatory requirement definitions
 REGULATIONS = {
     "GDPR": {
@@ -214,13 +217,13 @@ def _check_negation(text_lower, keyword, window=200):
     wider_context = text_lower[start:end]
     # Check sentence first (higher confidence)
-    for neg_pat in _NEGATION_PATTERNS:
-        if re.search(neg_pat, sentence, re.IGNORECASE):
             return True
     # Then check wider window (lower confidence, still relevant)
-    for neg_pat in _NEGATION_PATTERNS[:4]:  # Only strong negation patterns for wider window
-        if re.search(neg_pat, wider_context, re.IGNORECASE):
             return True
     return False

     r"notwithstanding.*(?:shall\s+not|does\s+not|is\s+not)",
 ]
+# FIX v4.2: Pre-compile negation patterns at module level
+_NEGATION_PATTERNS_COMPILED = [re.compile(p, re.IGNORECASE) for p in _NEGATION_PATTERNS]
 # Regulatory requirement definitions
 REGULATIONS = {
     "GDPR": {
     wider_context = text_lower[start:end]
     # Check sentence first (higher confidence)
+    for neg_pat in _NEGATION_PATTERNS_COMPILED:
+        if neg_pat.search(sentence):
             return True
     # Then check wider window (lower confidence, still relevant)
+    for neg_pat in _NEGATION_PATTERNS_COMPILED[:4]:  # Only strong negation patterns for wider window
+        if neg_pat.search(wider_context):
             return True
     return False

extension/background.js CHANGED Viewed

@@ -4,7 +4,8 @@
  * FIXED: Error handling and retry logic
  */
-const API_BASE = "https://gaurv007-clauseguard-api.hf.space";
 const FREE_SCANS_PER_MONTH = 10;
 const API_TIMEOUT_MS = 45000;
@@ -181,13 +182,19 @@ function localAnalyze(text) {
   });
   const flagged = results.filter(r => r.categories.length > 0);
-  const sev = { HIGH: 0, MEDIUM: 0, LOW: 0 };
-  flagged.forEach(r => r.categories.forEach(c => sev[c.severity]++));
-  const risk = Math.min(100, Math.round((sev.HIGH*20 + sev.MEDIUM*10 + sev.LOW*5) / Math.max(1, clauses.length) * 100));
   return {
     risk_score: risk,
-    grade: risk >= 60 ? "F" : risk >= 40 ? "D" : risk >= 20 ? "C" : risk >= 10 ? "B" : "A",
     total_clauses: clauses.length, flagged_count: flagged.length, results,
   };
 }

  * FIXED: Error handling and retry logic
  */
+// FIX v4.2: Corrected API_BASE URL to match the actual Gradio Space
+const API_BASE = "https://gaurv007-clauseguard.hf.space";
 const FREE_SCANS_PER_MONTH = 10;
 const API_TIMEOUT_MS = 45000;
   });
   const flagged = results.filter(r => r.categories.length > 0);
+  const sev = { CRITICAL: 0, HIGH: 0, MEDIUM: 0, LOW: 0 };
+  flagged.forEach(r => r.categories.forEach(c => {
+    if (sev.hasOwnProperty(c.severity)) sev[c.severity]++;
+    else sev.MEDIUM++;  // default for unknown severity
+  }));
+  // FIX v4.2: Use the same diminishing-returns formula as the backend (app.py)
+  // instead of normalizing by clause count (which gave different scores)
+  const weighted = sev.CRITICAL*40 + sev.HIGH*20 + sev.MEDIUM*10 + sev.LOW*3;
+  const risk = Math.min(100, Math.round(100 * (1 - (1 / (1 + weighted / 30)))));
   return {
     risk_score: risk,
+    grade: risk >= 70 ? "F" : risk >= 50 ? "D" : risk >= 30 ? "C" : risk >= 15 ? "B" : "A",
     total_clauses: clauses.length, flagged_count: flagged.length, results,
   };
 }

obligations.py CHANGED Viewed

@@ -85,11 +85,26 @@ _PRIORITY_MAP = {
     "delivery": 1,
 }
 def _is_false_positive(sentence):
     """Check if a sentence is a common false positive (definition/interpretation, not obligation)."""
-    for fp in _FALSE_POSITIVE_PATTERNS:
-        if re.search(fp, sentence, re.IGNORECASE):
             return True
     return False
@@ -111,9 +126,9 @@ def extract_obligations(text):
             continue
         found_types = set()
-        for otype, patterns in OBLIGATION_PATTERNS.items():
             for pat in patterns:
-                if re.search(pat, sentence, re.IGNORECASE):
                     found_types.add(otype)
                     break
@@ -128,8 +143,8 @@ def extract_obligations(text):
             party = obligation_direction
         else:
             # Fallback to pattern matching within the sentence
-            for pp in PARTY_PATTERNS:
-                m = re.search(pp, sentence)
                 if m:
                     candidate = m.group(0).strip()
                     # Fix 8: Reject party strings >40 chars (header bleed-through)
@@ -140,8 +155,8 @@ def extract_obligations(text):
         # Extract timeframe
         deadline = "Not specified"
         deadline_urgency = 0
-        for pat, ptype in TIME_PATTERNS:
-            m = re.search(pat, sentence, re.IGNORECASE)
             if m:
                 if ptype == "relative":
                     num = m.group(1)

     "delivery": 1,
 }
+# FIX v4.2: Pre-compile obligation patterns at module level (previously each re.search() went through re's pattern cache for every sentence)
+_OBLIGATION_PATTERNS_COMPILED = {
+    otype: [re.compile(p, re.IGNORECASE) for p in patterns]
+    for otype, patterns in OBLIGATION_PATTERNS.items()
+}
+# FIX v4.2: Pre-compile false positive patterns
+_FALSE_POSITIVE_PATTERNS_COMPILED = [re.compile(p, re.IGNORECASE) for p in _FALSE_POSITIVE_PATTERNS]
+# FIX v4.2: Pre-compile time patterns
+_TIME_PATTERNS_COMPILED = [(re.compile(p, re.IGNORECASE), ptype) for p, ptype in TIME_PATTERNS]
+# FIX v4.2: Pre-compile party patterns
+_PARTY_PATTERNS_COMPILED = [re.compile(p) for p in PARTY_PATTERNS]
 def _is_false_positive(sentence):
     """Check if a sentence is a common false positive (definition/interpretation, not obligation)."""
+    for fp in _FALSE_POSITIVE_PATTERNS_COMPILED:
+        if fp.search(sentence):
             return True
     return False
             continue
         found_types = set()
+        for otype, patterns in _OBLIGATION_PATTERNS_COMPILED.items():
             for pat in patterns:
+                if pat.search(sentence):
                     found_types.add(otype)
                     break
             party = obligation_direction
         else:
             # Fallback to pattern matching within the sentence
+            for pp in _PARTY_PATTERNS_COMPILED:
+                m = pp.search(sentence)
                 if m:
                     candidate = m.group(0).strip()
                     # Fix 8: Reject party strings >40 chars (header bleed-through)
         # Extract timeframe
         deadline = "Not specified"
         deadline_urgency = 0
+        for pat, ptype in _TIME_PATTERNS_COMPILED:
+            m = pat.search(sentence)
             if m:
                 if ptype == "relative":
                     num = m.group(1)