gaurv007 committed on
Commit
423d2a9
·
verified ·
1 Parent(s): 5786572

fix(v4.3): app.py — bug report fixes (10 issues)

Browse files
Files changed (1) hide show
  1. app.py +98 -6
app.py CHANGED
@@ -617,12 +617,51 @@ _LABEL_GUARDRAILS = {
617
  r'uncapped|unlimited.{0,10}liabilit|no.{0,10}(limit|cap).{0,10}liabilit',
618
  re.IGNORECASE
619
  ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
620
  }
621
 
622
  def _apply_guardrails(label, text, confidence):
 
 
 
 
 
 
623
  guard = _LABEL_GUARDRAILS.get(label)
624
  if guard and not guard.search(text):
625
  return "Other", confidence * 0.3
 
 
 
 
 
626
  return label, confidence
627
 
628
  def _text_hash(text):
@@ -951,6 +990,47 @@ def extract_entities(text):
951
  else:
952
  entities = _extract_entities_regex(text)
953
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
954
  # Always supplement with regex patterns for things NER often misses
955
  regex_ents = _extract_entities_regex(text)
956
  ml_spans = set()
@@ -1176,14 +1256,26 @@ def compute_risk_score(clause_results, total_clauses):
1176
  if total_clauses == 0:
1177
  return 0, "A", sev_counts
1178
 
1179
- # FIX v4.1: Absolute risk — critical findings should always score high
1180
- # regardless of document size. A 200-clause doc with 5 critical findings
1181
- # is just as dangerous as a 10-clause doc with 5 critical findings.
 
 
 
 
 
 
1182
  weighted = sum(sev_counts[s] * RISK_WEIGHTS[s] for s in sev_counts)
1183
 
1184
- # Diminishing returns formula: starts linear, flattens near 100
1185
- # max theoretical = 100, one CRITICAL finding = ~30, two = ~48, five = ~72
1186
- risk = min(100, round(100 * (1 - (1 / (1 + weighted / 30)))))
 
 
 
 
 
 
1187
 
1188
  if risk >= 70: grade = "F"
1189
  elif risk >= 50: grade = "D"
 
617
  r'uncapped|unlimited.{0,10}liabilit|no.{0,10}(limit|cap).{0,10}liabilit',
618
  re.IGNORECASE
619
  ),
620
+ # FIX v4.3: ROFR fires on "right, title, and interest" in IP clauses — require ROFR-specific phrases
621
+ "ROFR/ROFO/ROFN": re.compile(
622
+ r'right\s+of\s+first\s+(?:refusal|offer|negotiation)|ROFR|ROFO|ROFN',
623
+ re.IGNORECASE
624
+ ),
625
+ # FIX v4.3: Renewal Term fires on "twelve (12) months" in liability caps — require renewal-specific phrases
626
+ "Renewal Term": re.compile(
627
+ r'renew(?:al)?|successive\s+term|auto(?:matic(?:ally)?)?\s*[\-\s]?renew|non[\-\s]?renewal',
628
+ re.IGNORECASE
629
+ ),
630
+ }
631
+
632
+ # FIX v4.3: Exclusion patterns — even if guardrail passes, exclude if contra-indicators present
633
+ _LABEL_EXCLUSIONS = {
634
+ "ROFR/ROFO/ROFN": re.compile(
635
+ r'assigns?\s+to|irrevocab(?:ly|le)\s+assign|all\s+right,?\s+title,?\s+and\s+interest|work[\-\s]for[\-\s]hire',
636
+ re.IGNORECASE
637
+ ),
638
+ "Renewal Term": re.compile(
639
+ r'limitation\s+of\s+liabilit|shall\s+not\s+be\s+liable|indemnif|hold\s+harmless|defend\s+and',
640
+ re.IGNORECASE
641
+ ),
642
+ }
643
+
644
+ # FIX v4.3: Minimum confidence thresholds per label (overrides the per-class _CUAD_THRESHOLDS)
645
+ _LABEL_MIN_CONFIDENCE = {
646
+ "ROFR/ROFO/ROFN": 0.65,
647
+ "Renewal Term": 0.70,
648
  }
649
 
650
def _apply_guardrails(label, text, confidence):
    """Validate a predicted clause *label* against keyword guardrails.

    Three checks run in order; the first failure demotes the prediction
    to ``"Other"`` with a penalized confidence:

    1. Per-label minimum confidence (``_LABEL_MIN_CONFIDENCE``) — below
       threshold returns ``("Other", confidence * 0.2)``.
    2. Required keyword pattern (``_LABEL_GUARDRAILS``) — pattern absent
       from *text* returns ``("Other", confidence * 0.3)``.
    3. Exclusion pattern (``_LABEL_EXCLUSIONS``) — contra-indicator
       present in *text* returns ``("Other", confidence * 0.2)``.

    Returns the ``(label, confidence)`` pair unchanged when all checks
    pass or when the label has no guardrails configured.
    """
    # Check minimum confidence for specific labels.
    # FIX: compare with `is not None`, not truthiness — a threshold of
    # 0.0 in _LABEL_MIN_CONFIDENCE would otherwise silently disable
    # this check.
    min_conf = _LABEL_MIN_CONFIDENCE.get(label)
    if min_conf is not None and confidence < min_conf:
        return "Other", confidence * 0.2

    # Check required keywords (must be present in the clause text).
    guard = _LABEL_GUARDRAILS.get(label)
    if guard and not guard.search(text):
        return "Other", confidence * 0.3

    # Check exclusion patterns (must NOT be present in the clause text).
    exclusion = _LABEL_EXCLUSIONS.get(label)
    if exclusion and exclusion.search(text):
        return "Other", confidence * 0.2
    return label, confidence
666
 
667
  def _text_hash(text):
 
990
  else:
991
  entities = _extract_entities_regex(text)
992
 
993
+ # FIX v4.3: Post-process ML entities to clean up WordPiece artefacts
994
+ cleaned_entities = []
995
+ for e in entities:
996
+ text_val = e.get("text", "")
997
+ # Strip WordPiece subword tokens (## prefix)
998
+ if "##" in text_val:
999
+ text_val = re.sub(r'##\w*', '', text_val).strip()
1000
+ text_val = re.sub(r'\s+', ' ', text_val).strip()
1001
+ # Discard entities that are too short, start/end with hyphens, or are garbled
1002
+ if len(text_val) < 2:
1003
+ continue
1004
+ if text_val.startswith("-") or text_val.endswith("-"):
1005
+ continue
1006
+ # Discard low-confidence MISC entities (almost always tokenisation artefacts)
1007
+ if e.get("type") == "MISC" and e.get("score", 1.0) < 0.6:
1008
+ continue
1009
+ # Discard entities that are mostly punctuation/symbols
1010
+ alpha_ratio = sum(1 for c in text_val if c.isalnum()) / max(len(text_val), 1)
1011
+ if alpha_ratio < 0.4:
1012
+ continue
1013
+ e["text"] = text_val
1014
+ cleaned_entities.append(e)
1015
+ entities = cleaned_entities
1016
+
1017
+ # FIX v4.3: Split concatenated MONEY/QUANTITY entities
1018
+ # e.g., "usd $ 485, 000,usd $ 72, 000" → separate entities
1019
+ _CURRENCY_SPLIT = re.compile(r'(?<=[\d,])\s*(?=(?:USD|usd|EUR|GBP|\$|£|€))', re.IGNORECASE)
1020
+ split_entities = []
1021
+ for e in entities:
1022
+ if e.get("type") in ("MONEY", "QUANTITY") and _CURRENCY_SPLIT.search(e["text"]):
1023
+ parts = _CURRENCY_SPLIT.split(e["text"])
1024
+ for part in parts:
1025
+ part = part.strip().strip(",").strip()
1026
+ if len(part) >= 2:
1027
+ new_ent = dict(e)
1028
+ new_ent["text"] = re.sub(r'\s+', '', part) if "$" in part or "USD" in part.upper() else part
1029
+ split_entities.append(new_ent)
1030
+ else:
1031
+ split_entities.append(e)
1032
+ entities = split_entities
1033
+
1034
  # Always supplement with regex patterns for things NER often misses
1035
  regex_ents = _extract_entities_regex(text)
1036
  ml_spans = set()
 
1256
  if total_clauses == 0:
1257
  return 0, "A", sev_counts
1258
 
1259
+ # FIX v4.3: Revised risk formula scale denominator with clause count
1260
+ # to prevent small contracts from always scoring 80+.
1261
+ # The old formula used a fixed /30 denominator which meant even 2 CRITICAL
1262
+ # flags scored 73, making almost every contract grade F.
1263
+ #
1264
+ # New approach: dynamic denominator based on total clauses analysed.
1265
+ # This means risk is relative to document complexity.
1266
+ # - 1 CRITICAL in 5 clauses = high risk
1267
+ # - 1 CRITICAL in 50 clauses = moderate risk (proportionally less of the contract)
1268
  weighted = sum(sev_counts[s] * RISK_WEIGHTS[s] for s in sev_counts)
1269
 
1270
+ # Dynamic max: what if every clause were CRITICAL?
1271
+ max_possible = total_clauses * RISK_WEIGHTS["CRITICAL"]
1272
+ if max_possible == 0:
1273
+ max_possible = 1
1274
+
1275
+ # Blend: 60% absolute (diminishing returns) + 40% relative (to total clauses)
1276
+ absolute_risk = 100 * (1 - (1 / (1 + weighted / 50))) # /50 instead of /30 = softer curve
1277
+ relative_risk = min(100, (weighted / max_possible) * 100)
1278
+ risk = min(100, round(0.6 * absolute_risk + 0.4 * relative_risk))
1279
 
1280
  if risk >= 70: grade = "F"
1281
  elif risk >= 50: grade = "D"