Spaces:
Sleeping
Sleeping
v3.1: Fix 1-6 from bug report — deterministic chunking, metadata fix, heading strip, raw_text missing-clause, guardrails
Browse files
app.py
CHANGED
|
@@ -378,11 +378,22 @@ def parse_document(file_path):
|
|
| 378 |
return None, f"Unsupported file type: {ext}"
|
| 379 |
|
| 380 |
# ═══════════════════════════════════════════════════════════════════════
|
| 381 |
-
# 4.
|
| 382 |
# ═══════════════════════════════════════════════════════════════════════
|
| 383 |
|
|
|
|
|
|
|
|
|
|
| 384 |
def split_clauses(text):
|
| 385 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 386 |
text = re.sub(r'\n{3,}', '\n\n', text.strip())
|
| 387 |
|
| 388 |
# First try to detect numbered sections (1., 2., 3.1, (a), etc.)
|
|
@@ -426,9 +437,13 @@ def split_clauses(text):
|
|
| 426 |
preamble = text[:positions[0]].strip()
|
| 427 |
if len(preamble) > 30:
|
| 428 |
clauses.insert(0, preamble)
|
| 429 |
-
|
|
|
|
|
|
|
| 430 |
else:
|
| 431 |
-
|
|
|
|
|
|
|
| 432 |
|
| 433 |
def _fallback_split(text):
|
| 434 |
"""Fallback: split on paragraph breaks and sentence boundaries."""
|
|
@@ -462,8 +477,40 @@ def _fallback_split(text):
|
|
| 462 |
|
| 463 |
# ═══════════════════════════════════════════════════════════════════════
|
| 464 |
# 5. CLAUSE DETECTION — FIXED: sigmoid + per-class thresholds + caching
|
|
|
|
|
|
|
| 465 |
# ═══════════════════════════════════════════════════════════════════════
|
| 466 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
def _text_hash(text):
|
| 468 |
return hashlib.md5(text.encode()).hexdigest()
|
| 469 |
|
|
@@ -474,14 +521,17 @@ def classify_cuad(clause_text):
|
|
| 474 |
if cuad_model is None or cuad_tokenizer is None:
|
| 475 |
return _classify_regex(clause_text)
|
| 476 |
|
|
|
|
|
|
|
|
|
|
| 477 |
# Check cache
|
| 478 |
-
h = _text_hash(
|
| 479 |
if h in _prediction_cache:
|
| 480 |
return _prediction_cache[h]
|
| 481 |
|
| 482 |
try:
|
| 483 |
inputs = cuad_tokenizer(
|
| 484 |
-
|
| 485 |
return_tensors="pt",
|
| 486 |
truncation=True,
|
| 487 |
max_length=256,
|
|
@@ -498,10 +548,15 @@ def classify_cuad(clause_text):
|
|
| 498 |
threshold = _CUAD_THRESHOLDS.get(i, 0.40)
|
| 499 |
if float(prob) > threshold and i < len(CUAD_LABELS):
|
| 500 |
label = CUAD_LABELS[i]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
risk = RISK_MAP.get(label, "LOW")
|
| 502 |
results.append({
|
| 503 |
"label": label,
|
| 504 |
-
"confidence": round(
|
| 505 |
"risk": risk,
|
| 506 |
"description": DESC_MAP.get(label, label),
|
| 507 |
"source": "ml",
|
|
@@ -773,19 +828,33 @@ def detect_contradictions(clause_results, raw_text=""):
|
|
| 773 |
"source": "heuristic",
|
| 774 |
})
|
| 775 |
|
| 776 |
-
# ── 2. Missing critical clauses ──
|
| 777 |
-
|
| 778 |
-
"Governing Law":
|
| 779 |
-
|
| 780 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
}
|
| 782 |
-
for
|
| 783 |
-
|
|
|
|
| 784 |
contradictions.append({
|
| 785 |
"type": "MISSING",
|
| 786 |
-
"explanation":
|
| 787 |
"severity": "MEDIUM",
|
| 788 |
-
"clauses": [
|
| 789 |
"source": "structural",
|
| 790 |
})
|
| 791 |
|
|
@@ -847,13 +916,21 @@ def analyze_contract(text):
|
|
| 847 |
contradictions = detect_contradictions(clause_results, text)
|
| 848 |
risk, grade, sev_counts = compute_risk_score(clause_results, len(clauses))
|
| 849 |
obligations = extract_obligations(text)
|
|
|
|
| 850 |
compliance = check_compliance(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 851 |
result = {
|
| 852 |
"metadata": {
|
| 853 |
"analysis_date": datetime.now().isoformat(),
|
| 854 |
"total_clauses": len(clauses),
|
| 855 |
-
"flagged_clauses":
|
|
|
|
| 856 |
"model": get_model_status_text(),
|
|
|
|
| 857 |
},
|
| 858 |
"risk": {
|
| 859 |
"score": risk,
|
|
|
|
| 378 |
return None, f"Unsupported file type: {ext}"
|
| 379 |
|
| 380 |
# ═══════════════════════════════════════════════════════════════════════
|
| 381 |
+
# 4. DETERMINISTIC CLAUSE SPLITTING (Fix 1 from bug report)
|
| 382 |
# ═══════════════════════════════════════════════════════════════════════
|
| 383 |
|
| 384 |
+
# Document-level chunk cache: same text always produces same chunks
|
| 385 |
+
_chunk_cache = {}
|
| 386 |
+
|
| 387 |
def split_clauses(text):
|
| 388 |
+
"""Deterministic, structure-aware clause splitting.
|
| 389 |
+
Fix 1: Same input ALWAYS produces same output. Normalized text is hashed
|
| 390 |
+
and cached so repeated runs on identical documents are identical."""
|
| 391 |
+
# Normalize whitespace before hashing for determinism
|
| 392 |
+
normalized = re.sub(r'\s+', ' ', text.strip())
|
| 393 |
+
text_hash = hashlib.sha256(normalized.encode()).hexdigest()
|
| 394 |
+
if text_hash in _chunk_cache:
|
| 395 |
+
return _chunk_cache[text_hash]
|
| 396 |
+
|
| 397 |
text = re.sub(r'\n{3,}', '\n\n', text.strip())
|
| 398 |
|
| 399 |
# First try to detect numbered sections (1., 2., 3.1, (a), etc.)
|
|
|
|
| 437 |
preamble = text[:positions[0]].strip()
|
| 438 |
if len(preamble) > 30:
|
| 439 |
clauses.insert(0, preamble)
|
| 440 |
+
result = clauses if clauses else _fallback_split(text)
|
| 441 |
+
_chunk_cache[text_hash] = result
|
| 442 |
+
return result
|
| 443 |
else:
|
| 444 |
+
result = _fallback_split(text)
|
| 445 |
+
_chunk_cache[text_hash] = result
|
| 446 |
+
return result
|
| 447 |
|
| 448 |
def _fallback_split(text):
|
| 449 |
"""Fallback: split on paragraph breaks and sentence boundaries."""
|
|
|
|
| 477 |
|
| 478 |
# ═══════════════════════════════════════════════════════════════════════
|
| 479 |
# 5. CLAUSE DETECTION — FIXED: sigmoid + per-class thresholds + caching
|
| 480 |
+
# Fix 3: Strip section headings before classification
|
| 481 |
+
# Fix 6: Label guardrails for high-confidence false positives
|
| 482 |
# ═══════════════════════════════════════════════════════════════════════
|
| 483 |
|
| 484 |
+
# Fix 3: Section heading pattern — strip before classifying
|
| 485 |
+
_HEADING_RE = re.compile(r'^\d+(?:\.\d+)*\s+[A-Z][A-Z\s&,/]+$', re.MULTILINE)
|
| 486 |
+
|
| 487 |
+
def _strip_heading(text):
|
| 488 |
+
"""Remove leading section headings that confuse the classifier."""
|
| 489 |
+
lines = text.split('\n')
|
| 490 |
+
if lines and _HEADING_RE.match(lines[0].strip()):
|
| 491 |
+
stripped = '\n'.join(lines[1:]).strip()
|
| 492 |
+
return stripped if len(stripped) > 20 else text
|
| 493 |
+
return text
|
| 494 |
+
|
| 495 |
+
# Fix 6: Label guardrails — keyword validation for high-confidence labels
|
| 496 |
+
_LABEL_GUARDRAILS = {
|
| 497 |
+
"Liquidated Damages": re.compile(
|
| 498 |
+
r'liquidated|pre-?determined.{0,10}damage|agreed.{0,10}sum|penalty clause|stipulated.{0,10}damage',
|
| 499 |
+
re.IGNORECASE
|
| 500 |
+
),
|
| 501 |
+
"Uncapped Liability": re.compile(
|
| 502 |
+
r'uncapped|unlimited.{0,10}liabilit|no.{0,10}(limit|cap).{0,10}liabilit',
|
| 503 |
+
re.IGNORECASE
|
| 504 |
+
),
|
| 505 |
+
}
|
| 506 |
+
|
| 507 |
+
def _apply_guardrails(label, text, confidence):
|
| 508 |
+
"""Fix 6: If label has a guardrail and text lacks required keywords, demote."""
|
| 509 |
+
guard = _LABEL_GUARDRAILS.get(label)
|
| 510 |
+
if guard and not guard.search(text):
|
| 511 |
+
return "Other", confidence * 0.3 # demote to Other with reduced confidence
|
| 512 |
+
return label, confidence
|
| 513 |
+
|
| 514 |
def _text_hash(text):
|
| 515 |
return hashlib.md5(text.encode()).hexdigest()
|
| 516 |
|
|
|
|
| 521 |
if cuad_model is None or cuad_tokenizer is None:
|
| 522 |
return _classify_regex(clause_text)
|
| 523 |
|
| 524 |
+
# Fix 3: Strip section headings before classification
|
| 525 |
+
clean_text = _strip_heading(clause_text)
|
| 526 |
+
|
| 527 |
# Check cache
|
| 528 |
+
h = _text_hash(clean_text[:512])
|
| 529 |
if h in _prediction_cache:
|
| 530 |
return _prediction_cache[h]
|
| 531 |
|
| 532 |
try:
|
| 533 |
inputs = cuad_tokenizer(
|
| 534 |
+
clean_text,
|
| 535 |
return_tensors="pt",
|
| 536 |
truncation=True,
|
| 537 |
max_length=256,
|
|
|
|
| 548 |
threshold = _CUAD_THRESHOLDS.get(i, 0.40)
|
| 549 |
if float(prob) > threshold and i < len(CUAD_LABELS):
|
| 550 |
label = CUAD_LABELS[i]
|
| 551 |
+
conf = float(prob)
|
| 552 |
+
# Fix 6: Apply guardrails — reject high-confidence false positives
|
| 553 |
+
label, conf = _apply_guardrails(label, clause_text, conf)
|
| 554 |
+
if label == "Other" and conf < 0.3:
|
| 555 |
+
continue # Skip demoted labels
|
| 556 |
risk = RISK_MAP.get(label, "LOW")
|
| 557 |
results.append({
|
| 558 |
"label": label,
|
| 559 |
+
"confidence": round(conf, 3),
|
| 560 |
"risk": risk,
|
| 561 |
"description": DESC_MAP.get(label, label),
|
| 562 |
"source": "ml",
|
|
|
|
| 828 |
"source": "heuristic",
|
| 829 |
})
|
| 830 |
|
| 831 |
+
# ── 2. Missing critical clauses (Fix 4: check raw_text, not labels) ──
|
| 832 |
+
_REQUIRED_CLAUSE_PATTERNS = {
|
| 833 |
+
"Governing Law": re.compile(
|
| 834 |
+
r'govern(?:ed|ing).{0,15}law|applicable.{0,10}law|laws?\s+of\s+the\s+state',
|
| 835 |
+
re.IGNORECASE
|
| 836 |
+
),
|
| 837 |
+
"Limitation of liability": re.compile(
|
| 838 |
+
r'limitation.{0,10}liabilit|cap.{0,10}liabilit|liabilit.{0,10}shall\s+not\s+exceed|in\s+no\s+event.{0,20}liable',
|
| 839 |
+
re.IGNORECASE
|
| 840 |
+
),
|
| 841 |
+
"Arbitration": re.compile(
|
| 842 |
+
r'arbitrat|AAA|JAMS|binding.{0,10}dispute',
|
| 843 |
+
re.IGNORECASE
|
| 844 |
+
),
|
| 845 |
+
"Termination": re.compile(
|
| 846 |
+
r'terminat(?:e|ion|ed)|cancel(?:lation)?',
|
| 847 |
+
re.IGNORECASE
|
| 848 |
+
),
|
| 849 |
}
|
| 850 |
+
for clause_name, pattern in _REQUIRED_CLAUSE_PATTERNS.items():
|
| 851 |
+
# Check raw_text directly — it's stable and deterministic
|
| 852 |
+
if not pattern.search(raw_text):
|
| 853 |
contradictions.append({
|
| 854 |
"type": "MISSING",
|
| 855 |
+
"explanation": f"No '{clause_name}' clause detected in the document.",
|
| 856 |
"severity": "MEDIUM",
|
| 857 |
+
"clauses": [clause_name],
|
| 858 |
"source": "structural",
|
| 859 |
})
|
| 860 |
|
|
|
|
| 916 |
contradictions = detect_contradictions(clause_results, text)
|
| 917 |
risk, grade, sev_counts = compute_risk_score(clause_results, len(clauses))
|
| 918 |
obligations = extract_obligations(text)
|
| 919 |
+
# Fix 5: Compliance runs against full raw_text (already done in compliance.py)
|
| 920 |
compliance = check_compliance(text)
|
| 921 |
+
|
| 922 |
+
# Fix 2: Compute flagged_clauses AFTER all processing is complete
|
| 923 |
+
flagged_clause_count = len(clause_results)
|
| 924 |
+
unique_flagged_texts = len(set(cr["text"] for cr in clause_results))
|
| 925 |
+
|
| 926 |
result = {
|
| 927 |
"metadata": {
|
| 928 |
"analysis_date": datetime.now().isoformat(),
|
| 929 |
"total_clauses": len(clauses),
|
| 930 |
+
"flagged_clauses": flagged_clause_count,
|
| 931 |
+
"unique_flagged": unique_flagged_texts,
|
| 932 |
"model": get_model_status_text(),
|
| 933 |
+
"text_hash": hashlib.sha256(re.sub(r'\s+', ' ', text.strip()).encode()).hexdigest()[:16],
|
| 934 |
},
|
| 935 |
"risk": {
|
| 936 |
"score": risk,
|