gaurv007 committed on
Commit
584624e
·
verified ·
1 Parent(s): adad3b7

v3.1: Fix 1-6 from bug report — deterministic chunking, metadata fix, heading strip, raw_text missing-clause, guardrails

Browse files
Files changed (1) hide show
  1. app.py +94 -17
app.py CHANGED
@@ -378,11 +378,22 @@ def parse_document(file_path):
378
  return None, f"Unsupported file type: {ext}"
379
 
380
  # ═══════════════════════════════════════════════════════════════════════
381
- # 4. STRUCTURE-AWARE CLAUSE SPLITTING
382
  # ═══════════════════════════════════════════════════════════════════════
383
 
 
 
 
384
  def split_clauses(text):
385
- """Structure-aware clause splitting that respects section numbering."""
 
 
 
 
 
 
 
 
386
  text = re.sub(r'\n{3,}', '\n\n', text.strip())
387
 
388
  # First try to detect numbered sections (1., 2., 3.1, (a), etc.)
@@ -426,9 +437,13 @@ def split_clauses(text):
426
  preamble = text[:positions[0]].strip()
427
  if len(preamble) > 30:
428
  clauses.insert(0, preamble)
429
- return clauses if clauses else _fallback_split(text)
 
 
430
  else:
431
- return _fallback_split(text)
 
 
432
 
433
  def _fallback_split(text):
434
  """Fallback: split on paragraph breaks and sentence boundaries."""
@@ -462,8 +477,40 @@ def _fallback_split(text):
462
 
463
  # ═══════════════════════════════════════════════════════════════════════
464
  # 5. CLAUSE DETECTION — FIXED: sigmoid + per-class thresholds + caching
 
 
465
  # ═══════════════════════════════════════════════════════════════════════
466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
  def _text_hash(text):
468
  return hashlib.md5(text.encode()).hexdigest()
469
 
@@ -474,14 +521,17 @@ def classify_cuad(clause_text):
474
  if cuad_model is None or cuad_tokenizer is None:
475
  return _classify_regex(clause_text)
476
 
 
 
 
477
  # Check cache
478
- h = _text_hash(clause_text[:512])
479
  if h in _prediction_cache:
480
  return _prediction_cache[h]
481
 
482
  try:
483
  inputs = cuad_tokenizer(
484
- clause_text,
485
  return_tensors="pt",
486
  truncation=True,
487
  max_length=256,
@@ -498,10 +548,15 @@ def classify_cuad(clause_text):
498
  threshold = _CUAD_THRESHOLDS.get(i, 0.40)
499
  if float(prob) > threshold and i < len(CUAD_LABELS):
500
  label = CUAD_LABELS[i]
 
 
 
 
 
501
  risk = RISK_MAP.get(label, "LOW")
502
  results.append({
503
  "label": label,
504
- "confidence": round(float(prob), 3),
505
  "risk": risk,
506
  "description": DESC_MAP.get(label, label),
507
  "source": "ml",
@@ -773,19 +828,33 @@ def detect_contradictions(clause_results, raw_text=""):
773
  "source": "heuristic",
774
  })
775
 
776
- # ── 2. Missing critical clauses ──
777
- critical_clauses = {
778
- "Governing Law": "No governing law clause detected — jurisdiction ambiguity may cause disputes.",
779
- "Termination for Convenience": "No termination clause detected — exit terms are unclear.",
780
- "Limitation of liability": "No liability limitation detected — exposure may be unlimited.",
 
 
 
 
 
 
 
 
 
 
 
 
 
781
  }
782
- for cc, explanation in critical_clauses.items():
783
- if cc not in labels_found:
 
784
  contradictions.append({
785
  "type": "MISSING",
786
- "explanation": explanation,
787
  "severity": "MEDIUM",
788
- "clauses": [cc],
789
  "source": "structural",
790
  })
791
 
@@ -847,13 +916,21 @@ def analyze_contract(text):
847
  contradictions = detect_contradictions(clause_results, text)
848
  risk, grade, sev_counts = compute_risk_score(clause_results, len(clauses))
849
  obligations = extract_obligations(text)
 
850
  compliance = check_compliance(text)
 
 
 
 
 
851
  result = {
852
  "metadata": {
853
  "analysis_date": datetime.now().isoformat(),
854
  "total_clauses": len(clauses),
855
- "flagged_clauses": len(set(cr["text"] for cr in clause_results)),
 
856
  "model": get_model_status_text(),
 
857
  },
858
  "risk": {
859
  "score": risk,
 
378
  return None, f"Unsupported file type: {ext}"
379
 
380
  # ═══════════════════════════════════════════════════════════════════════
381
+ # 4. DETERMINISTIC CLAUSE SPLITTING (Fix 1 from bug report)
382
  # ═══════════════════════════════════════════════════════════════════════
383
 
384
# Document-level chunk cache: same text always produces same chunks
# NOTE(review): module-level and unbounded — entries live for the whole
# process lifetime; consider bounding it (e.g. LRU) if many distinct
# documents flow through one process. TODO confirm expected workload.
_chunk_cache = {}
386
+
387
  def split_clauses(text):
388
+ """Deterministic, structure-aware clause splitting.
389
+ Fix 1: Same input ALWAYS produces same output. Normalized text is hashed
390
+ and cached so repeated runs on identical documents are identical."""
391
+ # Normalize whitespace before hashing for determinism
392
+ normalized = re.sub(r'\s+', ' ', text.strip())
393
+ text_hash = hashlib.sha256(normalized.encode()).hexdigest()
394
+ if text_hash in _chunk_cache:
395
+ return _chunk_cache[text_hash]
396
+
397
  text = re.sub(r'\n{3,}', '\n\n', text.strip())
398
 
399
  # First try to detect numbered sections (1., 2., 3.1, (a), etc.)
 
437
  preamble = text[:positions[0]].strip()
438
  if len(preamble) > 30:
439
  clauses.insert(0, preamble)
440
+ result = clauses if clauses else _fallback_split(text)
441
+ _chunk_cache[text_hash] = result
442
+ return result
443
  else:
444
+ result = _fallback_split(text)
445
+ _chunk_cache[text_hash] = result
446
+ return result
447
 
448
  def _fallback_split(text):
449
  """Fallback: split on paragraph breaks and sentence boundaries."""
 
477
 
478
  # ═══════════════════════════════════════════════════════════════════════
479
  # 5. CLAUSE DETECTION — FIXED: sigmoid + per-class thresholds + caching
480
+ # Fix 3: Strip section headings before classification
481
+ # Fix 6: Label guardrails for high-confidence false positives
482
  # ═══════════════════════════════════════════════════════════════════════
483
 
484
+ # Fix 3: Section heading pattern — strip before classifying
485
+ _HEADING_RE = re.compile(r'^\d+(?:\.\d+)*\s+[A-Z][A-Z\s&,/]+$', re.MULTILINE)
486
+
487
+ def _strip_heading(text):
488
+ """Remove leading section headings that confuse the classifier."""
489
+ lines = text.split('\n')
490
+ if lines and _HEADING_RE.match(lines[0].strip()):
491
+ stripped = '\n'.join(lines[1:]).strip()
492
+ return stripped if len(stripped) > 20 else text
493
+ return text
494
+
495
+ # Fix 6: Label guardrails — keyword validation for high-confidence labels
496
+ _LABEL_GUARDRAILS = {
497
+ "Liquidated Damages": re.compile(
498
+ r'liquidated|pre-?determined.{0,10}damage|agreed.{0,10}sum|penalty clause|stipulated.{0,10}damage',
499
+ re.IGNORECASE
500
+ ),
501
+ "Uncapped Liability": re.compile(
502
+ r'uncapped|unlimited.{0,10}liabilit|no.{0,10}(limit|cap).{0,10}liabilit',
503
+ re.IGNORECASE
504
+ ),
505
+ }
506
+
507
+ def _apply_guardrails(label, text, confidence):
508
+ """Fix 6: If label has a guardrail and text lacks required keywords, demote."""
509
+ guard = _LABEL_GUARDRAILS.get(label)
510
+ if guard and not guard.search(text):
511
+ return "Other", confidence * 0.3 # demote to Other with reduced confidence
512
+ return label, confidence
513
+
514
  def _text_hash(text):
515
  return hashlib.md5(text.encode()).hexdigest()
516
 
 
521
  if cuad_model is None or cuad_tokenizer is None:
522
  return _classify_regex(clause_text)
523
 
524
+ # Fix 3: Strip section headings before classification
525
+ clean_text = _strip_heading(clause_text)
526
+
527
  # Check cache
528
+ h = _text_hash(clean_text[:512])
529
  if h in _prediction_cache:
530
  return _prediction_cache[h]
531
 
532
  try:
533
  inputs = cuad_tokenizer(
534
+ clean_text,
535
  return_tensors="pt",
536
  truncation=True,
537
  max_length=256,
 
548
  threshold = _CUAD_THRESHOLDS.get(i, 0.40)
549
  if float(prob) > threshold and i < len(CUAD_LABELS):
550
  label = CUAD_LABELS[i]
551
+ conf = float(prob)
552
+ # Fix 6: Apply guardrails — reject high-confidence false positives
553
+ label, conf = _apply_guardrails(label, clause_text, conf)
554
+ if label == "Other" and conf < 0.3:
555
+ continue # Skip demoted labels
556
  risk = RISK_MAP.get(label, "LOW")
557
  results.append({
558
  "label": label,
559
+ "confidence": round(conf, 3),
560
  "risk": risk,
561
  "description": DESC_MAP.get(label, label),
562
  "source": "ml",
 
828
  "source": "heuristic",
829
  })
830
 
831
+ # ── 2. Missing critical clauses (Fix 4: check raw_text, not labels) ──
832
+ _REQUIRED_CLAUSE_PATTERNS = {
833
+ "Governing Law": re.compile(
834
+ r'govern(?:ed|ing).{0,15}law|applicable.{0,10}law|laws?\s+of\s+the\s+state',
835
+ re.IGNORECASE
836
+ ),
837
+ "Limitation of liability": re.compile(
838
+ r'limitation.{0,10}liabilit|cap.{0,10}liabilit|liabilit.{0,10}shall\s+not\s+exceed|in\s+no\s+event.{0,20}liable',
839
+ re.IGNORECASE
840
+ ),
841
+ "Arbitration": re.compile(
842
+ r'arbitrat|AAA|JAMS|binding.{0,10}dispute',
843
+ re.IGNORECASE
844
+ ),
845
+ "Termination": re.compile(
846
+ r'terminat(?:e|ion|ed)|cancel(?:lation)?',
847
+ re.IGNORECASE
848
+ ),
849
  }
850
+ for clause_name, pattern in _REQUIRED_CLAUSE_PATTERNS.items():
851
+ # Check raw_text directly — it's stable and deterministic
852
+ if not pattern.search(raw_text):
853
  contradictions.append({
854
  "type": "MISSING",
855
+ "explanation": f"No '{clause_name}' clause detected in the document.",
856
  "severity": "MEDIUM",
857
+ "clauses": [clause_name],
858
  "source": "structural",
859
  })
860
 
 
916
  contradictions = detect_contradictions(clause_results, text)
917
  risk, grade, sev_counts = compute_risk_score(clause_results, len(clauses))
918
  obligations = extract_obligations(text)
919
+ # Fix 5: Compliance runs against full raw_text (already done in compliance.py)
920
  compliance = check_compliance(text)
921
+
922
+ # Fix 2: Compute flagged_clauses AFTER all processing is complete
923
+ flagged_clause_count = len(clause_results)
924
+ unique_flagged_texts = len(set(cr["text"] for cr in clause_results))
925
+
926
  result = {
927
  "metadata": {
928
  "analysis_date": datetime.now().isoformat(),
929
  "total_clauses": len(clauses),
930
+ "flagged_clauses": flagged_clause_count,
931
+ "unique_flagged": unique_flagged_texts,
932
  "model": get_model_status_text(),
933
+ "text_hash": hashlib.sha256(re.sub(r'\s+', ' ', text.strip()).encode()).hexdigest()[:16],
934
  },
935
  "risk": {
936
  "score": risk,