gaurv007 committed on
Commit
584624e
·
verified ·
1 Parent(s): adad3b7

v3.1: Fix 1-6 from bug report — deterministic chunking, metadata fix, heading strip, raw_text missing-clause, guardrails

Browse files
Files changed (1) hide show
  1. app.py +94 -17
app.py CHANGED
@@ -378,11 +378,22 @@ def parse_document(file_path):
378
  return None, f"Unsupported file type: {ext}"
379
 
380
  # ═══════════════════════════════════════════════════════════════════════
381
- # 4. STRUCTURE-AWARE CLAUSE SPLITTING
382
  # ═══════════════════════════════════════════════════════════════════════
383
 
 
 
 
384
  def split_clauses(text):
385
- """Structure-aware clause splitting that respects section numbering."""
 
 
 
 
 
 
 
 
386
  text = re.sub(r'\n{3,}', '\n\n', text.strip())
387
 
388
  # First try to detect numbered sections (1., 2., 3.1, (a), etc.)
@@ -426,9 +437,13 @@ def split_clauses(text):
426
  preamble = text[:positions[0]].strip()
427
  if len(preamble) > 30:
428
  clauses.insert(0, preamble)
429
- return clauses if clauses else _fallback_split(text)
 
 
430
  else:
431
- return _fallback_split(text)
 
 
432
 
433
  def _fallback_split(text):
434
  """Fallback: split on paragraph breaks and sentence boundaries."""
@@ -462,8 +477,40 @@ def _fallback_split(text):
462
 
463
  # ═══════════════════════════════════════════════════════════════════════
464
  # 5. CLAUSE DETECTION — FIXED: sigmoid + per-class thresholds + caching
 
 
465
  # ═══════════════════════════════════════════════════════════════════════
466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
  def _text_hash(text):
468
  return hashlib.md5(text.encode()).hexdigest()
469
 
@@ -474,14 +521,17 @@ def classify_cuad(clause_text):
474
  if cuad_model is None or cuad_tokenizer is None:
475
  return _classify_regex(clause_text)
476
 
 
 
 
477
  # Check cache
478
- h = _text_hash(clause_text[:512])
479
  if h in _prediction_cache:
480
  return _prediction_cache[h]
481
 
482
  try:
483
  inputs = cuad_tokenizer(
484
- clause_text,
485
  return_tensors="pt",
486
  truncation=True,
487
  max_length=256,
@@ -498,10 +548,15 @@ def classify_cuad(clause_text):
498
  threshold = _CUAD_THRESHOLDS.get(i, 0.40)
499
  if float(prob) > threshold and i < len(CUAD_LABELS):
500
  label = CUAD_LABELS[i]
 
 
 
 
 
501
  risk = RISK_MAP.get(label, "LOW")
502
  results.append({
503
  "label": label,
504
- "confidence": round(float(prob), 3),
505
  "risk": risk,
506
  "description": DESC_MAP.get(label, label),
507
  "source": "ml",
@@ -773,19 +828,33 @@ def detect_contradictions(clause_results, raw_text=""):
773
  "source": "heuristic",
774
  })
775
 
776
- # ── 2. Missing critical clauses ──
777
- critical_clauses = {
778
- "Governing Law": "No governing law clause detected — jurisdiction ambiguity may cause disputes.",
779
- "Termination for Convenience": "No termination clause detected — exit terms are unclear.",
780
- "Limitation of liability": "No liability limitation detected — exposure may be unlimited.",
 
 
 
 
 
 
 
 
 
 
 
 
 
781
  }
782
- for cc, explanation in critical_clauses.items():
783
- if cc not in labels_found:
 
784
  contradictions.append({
785
  "type": "MISSING",
786
- "explanation": explanation,
787
  "severity": "MEDIUM",
788
- "clauses": [cc],
789
  "source": "structural",
790
  })
791
 
@@ -847,13 +916,21 @@ def analyze_contract(text):
847
  contradictions = detect_contradictions(clause_results, text)
848
  risk, grade, sev_counts = compute_risk_score(clause_results, len(clauses))
849
  obligations = extract_obligations(text)
 
850
  compliance = check_compliance(text)
 
 
 
 
 
851
  result = {
852
  "metadata": {
853
  "analysis_date": datetime.now().isoformat(),
854
  "total_clauses": len(clauses),
855
- "flagged_clauses": len(set(cr["text"] for cr in clause_results)),
 
856
  "model": get_model_status_text(),
 
857
  },
858
  "risk": {
859
  "score": risk,
 
378
  return None, f"Unsupported file type: {ext}"
379
 
380
  # ═══════════════════════════════════════════════════════════════════════
381
+ # 4. DETERMINISTIC CLAUSE SPLITTING (Fix 1 from bug report)
382
  # ═══════════════════════════════════════════════════════════════════════
383
 
384
# Document-level chunk cache: same text always produces same chunks
# NOTE(review): module-level and unbounded — entries live for the whole
# process lifetime; consider bounding it (e.g. LRU) if many distinct
# documents flow through one process. TODO confirm expected workload.
_chunk_cache = {}
386
+
387
  def split_clauses(text):
388
+ """Deterministic, structure-aware clause splitting.
389
+ Fix 1: Same input ALWAYS produces same output. Normalized text is hashed
390
+ and cached so repeated runs on identical documents are identical."""
391
+ # Normalize whitespace before hashing for determinism
392
+ normalized = re.sub(r'\s+', ' ', text.strip())
393
+ text_hash = hashlib.sha256(normalized.encode()).hexdigest()
394
+ if text_hash in _chunk_cache:
395
+ return _chunk_cache[text_hash]
396
+
397
  text = re.sub(r'\n{3,}', '\n\n', text.strip())
398
 
399
  # First try to detect numbered sections (1., 2., 3.1, (a), etc.)
 
437
  preamble = text[:positions[0]].strip()
438
  if len(preamble) > 30:
439
  clauses.insert(0, preamble)
440
+ result = clauses if clauses else _fallback_split(text)
441
+ _chunk_cache[text_hash] = result
442
+ return result
443
  else:
444
+ result = _fallback_split(text)
445
+ _chunk_cache[text_hash] = result
446
+ return result
447
 
448
  def _fallback_split(text):
449
  """Fallback: split on paragraph breaks and sentence boundaries."""
 
477
 
478
  # ═══════════════════════════════════════════════════════════════════════
479
  # 5. CLAUSE DETECTION — FIXED: sigmoid + per-class thresholds + caching
480
+ # Fix 3: Strip section headings before classification
481
+ # Fix 6: Label guardrails for high-confidence false positives
482
  # ═══════════════════════════════════════════════════════════════════════
483
 
484
+ # Fix 3: Section heading pattern — strip before classifying
485
+ _HEADING_RE = re.compile(r'^\d+(?:\.\d+)*\s+[A-Z][A-Z\s&,/]+$', re.MULTILINE)
486
+
487
+ def _strip_heading(text):
488
+ """Remove leading section headings that confuse the classifier."""
489
+ lines = text.split('\n')
490
+ if lines and _HEADING_RE.match(lines[0].strip()):
491
+ stripped = '\n'.join(lines[1:]).strip()
492
+ return stripped if len(stripped) > 20 else text
493
+ return text
494
+
495
+ # Fix 6: Label guardrails — keyword validation for high-confidence labels
496
+ _LABEL_GUARDRAILS = {
497
+ "Liquidated Damages": re.compile(
498
+ r'liquidated|pre-?determined.{0,10}damage|agreed.{0,10}sum|penalty clause|stipulated.{0,10}damage',
499
+ re.IGNORECASE
500
+ ),
501
+ "Uncapped Liability": re.compile(
502
+ r'uncapped|unlimited.{0,10}liabilit|no.{0,10}(limit|cap).{0,10}liabilit',
503
+ re.IGNORECASE
504
+ ),
505
+ }
506
+
507
+ def _apply_guardrails(label, text, confidence):
508
+ """Fix 6: If label has a guardrail and text lacks required keywords, demote."""
509
+ guard = _LABEL_GUARDRAILS.get(label)
510
+ if guard and not guard.search(text):
511
+ return "Other", confidence * 0.3 # demote to Other with reduced confidence
512
+ return label, confidence
513
+
514
  def _text_hash(text):
515
  return hashlib.md5(text.encode()).hexdigest()
516
 
 
521
  if cuad_model is None or cuad_tokenizer is None:
522
  return _classify_regex(clause_text)
523
 
524
+ # Fix 3: Strip section headings before classification
525
+ clean_text = _strip_heading(clause_text)
526
+
527
  # Check cache
528
+ h = _text_hash(clean_text[:512])
529
  if h in _prediction_cache:
530
  return _prediction_cache[h]
531
 
532
  try:
533
  inputs = cuad_tokenizer(
534
+ clean_text,
535
  return_tensors="pt",
536
  truncation=True,
537
  max_length=256,
 
548
  threshold = _CUAD_THRESHOLDS.get(i, 0.40)
549
  if float(prob) > threshold and i < len(CUAD_LABELS):
550
  label = CUAD_LABELS[i]
551
+ conf = float(prob)
552
+ # Fix 6: Apply guardrails — reject high-confidence false positives
553
+ label, conf = _apply_guardrails(label, clause_text, conf)
554
+ if label == "Other" and conf < 0.3:
555
+ continue # Skip demoted labels
556
  risk = RISK_MAP.get(label, "LOW")
557
  results.append({
558
  "label": label,
559
+ "confidence": round(conf, 3),
560
  "risk": risk,
561
  "description": DESC_MAP.get(label, label),
562
  "source": "ml",
 
828
  "source": "heuristic",
829
  })
830
 
831
+ # ── 2. Missing critical clauses (Fix 4: check raw_text, not labels) ──
832
+ _REQUIRED_CLAUSE_PATTERNS = {
833
+ "Governing Law": re.compile(
834
+ r'govern(?:ed|ing).{0,15}law|applicable.{0,10}law|laws?\s+of\s+the\s+state',
835
+ re.IGNORECASE
836
+ ),
837
+ "Limitation of liability": re.compile(
838
+ r'limitation.{0,10}liabilit|cap.{0,10}liabilit|liabilit.{0,10}shall\s+not\s+exceed|in\s+no\s+event.{0,20}liable',
839
+ re.IGNORECASE
840
+ ),
841
+ "Arbitration": re.compile(
842
+ r'arbitrat|AAA|JAMS|binding.{0,10}dispute',
843
+ re.IGNORECASE
844
+ ),
845
+ "Termination": re.compile(
846
+ r'terminat(?:e|ion|ed)|cancel(?:lation)?',
847
+ re.IGNORECASE
848
+ ),
849
  }
850
+ for clause_name, pattern in _REQUIRED_CLAUSE_PATTERNS.items():
851
+ # Check raw_text directly — it's stable and deterministic
852
+ if not pattern.search(raw_text):
853
  contradictions.append({
854
  "type": "MISSING",
855
+ "explanation": f"No '{clause_name}' clause detected in the document.",
856
  "severity": "MEDIUM",
857
+ "clauses": [clause_name],
858
  "source": "structural",
859
  })
860
 
 
916
  contradictions = detect_contradictions(clause_results, text)
917
  risk, grade, sev_counts = compute_risk_score(clause_results, len(clauses))
918
  obligations = extract_obligations(text)
919
+ # Fix 5: Compliance runs against full raw_text (already done in compliance.py)
920
  compliance = check_compliance(text)
921
+
922
+ # Fix 2: Compute flagged_clauses AFTER all processing is complete
923
+ flagged_clause_count = len(clause_results)
924
+ unique_flagged_texts = len(set(cr["text"] for cr in clause_results))
925
+
926
  result = {
927
  "metadata": {
928
  "analysis_date": datetime.now().isoformat(),
929
  "total_clauses": len(clauses),
930
+ "flagged_clauses": flagged_clause_count,
931
+ "unique_flagged": unique_flagged_texts,
932
  "model": get_model_status_text(),
933
+ "text_hash": hashlib.sha256(re.sub(r'\s+', ' ', text.strip()).encode()).hexdigest()[:16],
934
  },
935
  "risk": {
936
  "score": risk,