gaurv007 committed on
Commit
549ed6e
·
verified ·
1 Parent(s): 970316e

v4.0: Add app.py — OCR + RAG Chatbot + Clause Redlining

Browse files
Files changed (1) hide show
  1. app.py +117 -31
app.py CHANGED
@@ -1,7 +1,12 @@
1
  """
2
- ClauseGuard — World's Best Legal Contract Analysis Tool (v3.0)
3
  ═══════════════════════════════════════════════════════════════
4
- Fixes in v3.0:
 
 
 
 
 
5
  • Fixed CUAD label mapping (added missing index 6: "Notice Period to Terminate Renewal")
6
  • Switched from softmax → sigmoid for proper multi-label classification
7
  • Per-class optimized thresholds instead of flat 0.15
@@ -21,6 +26,9 @@ Models:
21
  (LoRA adapter on nlpaueb/legal-bert-base-uncased, 41 CUAD classes)
22
  • Legal NER: matterstack/legal-bert-ner (token classification)
23
  • NLI: cross-encoder/nli-deberta-v3-base (contradiction detection)
 
 
 
24
  """
25
 
26
  import os
@@ -71,6 +79,9 @@ except Exception:
71
  from compare import compare_contracts, render_comparison_html
72
  from obligations import extract_obligations, render_obligations_html
73
  from compliance import check_compliance, render_compliance_html
 
 
 
74
 
75
  # ═══════════════════════════════════════════════════════════════════════
76
  # 1. CONFIGURATION — FIXED label mapping (41 labels, index 6 restored)
@@ -335,20 +346,15 @@ _load_nli_model()
335
  # ═══════════════════════════════════════════════════════════════════════
336
 
337
  def parse_pdf(file_path):
338
- if not _HAS_PDF:
339
- return None, "PDF parsing not available (pdfplumber not installed)"
340
- try:
341
- text = ""
342
- with pdfplumber.open(file_path) as pdf:
343
- for page in pdf.pages:
344
- page_text = page.extract_text()
345
- if page_text:
346
- text += page_text + "\n\n"
347
- if not text.strip():
348
- return None, "PDF appears to be scanned/image-based. OCR is not yet supported. Please use a digital PDF or paste text directly."
349
- return text.strip(), None
350
- except Exception as e:
351
- return None, f"PDF parse error: {e}"
352
 
353
  def parse_docx(file_path):
354
  if not _HAS_DOCX:
@@ -1196,11 +1202,11 @@ def process_upload(file):
1196
  def run_analysis(text):
1197
  if not text or len(text.strip()) < 50:
1198
  err_html = '<p style="color:#dc2626;padding:16px;">Document too short (minimum 50 characters)</p>'
1199
- return [err_html] * 7 + [None, None, ""]
1200
  result, error = analyze_contract(text)
1201
  if error:
1202
  err_html = f'<p style="color:#dc2626;padding:16px;">{error}</p>'
1203
- return [err_html] * 7 + [None, None, error]
1204
 
1205
  # FIXED: per-session temp files
1206
  session_id = uuid.uuid4().hex[:8]
@@ -1213,6 +1219,10 @@ def run_analysis(text):
1213
  with open(csv_path, "w") as f:
1214
  f.write(csv_content)
1215
 
 
 
 
 
1216
  return [
1217
  render_summary(result),
1218
  render_clause_cards(result),
@@ -1221,13 +1231,15 @@ def run_analysis(text):
1221
  render_document_viewer(result),
1222
  render_obligations_html(result.get("obligations", [])),
1223
  render_compliance_html(result.get("compliance", {})),
 
1224
  json_path,
1225
  csv_path,
1226
  "Analysis complete",
 
1227
  ]
1228
 
1229
  def do_clear():
1230
- return [""] * 7 + [None, None, ""]
1231
 
1232
  # ── Example contracts ──
1233
  SPOTIFY_TOS = """By using the Spotify Service, you agree to be bound by these Terms of Use.
@@ -1311,17 +1323,22 @@ with gr.Blocks(
1311
  """
1312
  ) as demo:
1313
 
 
 
 
 
 
1314
  gr.HTML("""
1315
  <div style="display:flex;align-items:center;justify-content:space-between;padding:12px 0;border-bottom:2px solid #e5e7eb;margin-bottom:16px;">
1316
  <div>
1317
  <h1 style="font-size:24px;font-weight:700;margin:0;color:#1f2937;">🛡️ ClauseGuard</h1>
1318
- <p style="font-size:13px;color:#6b7280;margin:4px 0 0 0;">AI-Powered Legal Contract Analysis · 41 Clause Categories · Risk Scoring · ML NER · NLI Contradictions · Compliance · Obligations</p>
1319
  </div>
1320
- <div style="font-size:12px;color:#9ca3af;">v3.0 · Precision Legal AI</div>
1321
  </div>
1322
  """)
1323
 
1324
- # ── Main Tabs: Analysis vs Comparison ──
1325
  with gr.Tabs():
1326
 
1327
  # ═══════ TAB 1: Single Contract Analysis ═══════
@@ -1338,7 +1355,7 @@ with gr.Blocks(
1338
  with gr.Column(scale=3):
1339
  text_input = gr.Textbox(
1340
  label="📄 Contract Text",
1341
- placeholder="Paste contract text here, or upload a file above...",
1342
  lines=14,
1343
  max_lines=40,
1344
  show_copy_button=True,
@@ -1381,6 +1398,8 @@ with gr.Blocks(
1381
  obligations_html = gr.HTML(label="Obligation Tracker")
1382
  with gr.Tab("⚖️ Compliance"):
1383
  compliance_html = gr.HTML(label="Compliance Checker")
 
 
1384
 
1385
  # ═══════ TAB 2: Contract Comparison ═══════
1386
  with gr.Tab("🔀 Compare Contracts"):
@@ -1429,6 +1448,53 @@ with gr.Blocks(
1429
  with gr.Column(scale=2):
1430
  comp_json = gr.JSON(label="Raw Comparison Data")
1431
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1432
  # ── Events ──
1433
  def _load_file(file):
1434
  text, err = parse_document(file) if file else ("", "No file")
@@ -1436,23 +1502,41 @@ with gr.Blocks(
1436
  return "", err
1437
  return text, "Loaded successfully" if not err else err
1438
 
 
 
 
 
 
 
 
 
 
 
 
 
1439
  load_btn.click(_load_file, inputs=[file_input], outputs=[text_input, load_status])
1440
  comp_load_a.click(_load_file, inputs=[comp_file_a], outputs=[comp_text_a, comp_status_a])
1441
  comp_load_b.click(_load_file, inputs=[comp_file_b], outputs=[comp_text_b, comp_status_b])
1442
 
1443
  scan_btn.click(
1444
- run_analysis,
1445
  inputs=[text_input],
1446
- outputs=[summary_html, clauses_html, entities_html, nli_html,
1447
- doc_html, obligations_html, compliance_html,
1448
- json_file, csv_file, status_msg]
 
 
 
1449
  )
1450
 
1451
  clear_btn.click(
1452
- do_clear,
1453
- outputs=[summary_html, clauses_html, entities_html, nli_html,
1454
- doc_html, obligations_html, compliance_html,
1455
- json_file, csv_file, status_msg]
 
 
 
1456
  )
1457
 
1458
  comp_btn.click(
@@ -1468,6 +1552,8 @@ with gr.Blocks(
1468
  · Model: <a href="https://huggingface.co/Mokshith31/legalbert-contract-clause-classification" style="color:#6b7280;">Legal-BERT + CUAD (41 classes)</a>
1469
  · NER: <a href="https://huggingface.co/matterstack/legal-bert-ner" style="color:#6b7280;">Legal-BERT NER</a>
1470
  · NLI: <a href="https://huggingface.co/cross-encoder/nli-deberta-v3-base" style="color:#6b7280;">DeBERTa-v3 NLI</a>
 
 
1471
  · Dataset: <a href="https://huggingface.co/datasets/theatticusproject/cuad-qa" style="color:#6b7280;">CUAD</a>
1472
  · <a href="https://huggingface.co/spaces/gaurv007/ClauseGuard" style="color:#6b7280;">ClauseGuard Space</a>
1473
  </p>
 
1
  """
2
+ ClauseGuard — World's Best Legal Contract Analysis Tool (v4.0)
3
  ═══════════════════════════════════════════════════════════════
4
+ New in v4.0:
5
+ • OCR support for scanned PDFs (docTR engine with smart native/scanned routing)
6
+ • Contract Q&A Chatbot (RAG: embedding retrieval + HF Inference API streaming)
7
+ • Clause Redlining (3-tier: template lookup + RAG + LLM refinement)
8
+
9
+ Carried from v3.0:
10
  • Fixed CUAD label mapping (added missing index 6: "Notice Period to Terminate Renewal")
11
  • Switched from softmax → sigmoid for proper multi-label classification
12
  • Per-class optimized thresholds instead of flat 0.15
 
26
  (LoRA adapter on nlpaueb/legal-bert-base-uncased, 41 CUAD classes)
27
  • Legal NER: matterstack/legal-bert-ner (token classification)
28
  • NLI: cross-encoder/nli-deberta-v3-base (contradiction detection)
29
+ • Embeddings: sentence-transformers/all-MiniLM-L6-v2 (RAG retrieval)
30
+ • OCR: docTR fast_base + crnn_vgg16_bn (scanned PDF extraction)
31
+ • LLM: Qwen/Qwen2.5-7B-Instruct via HF Inference API (chatbot + redlining)
32
  """
33
 
34
  import os
 
79
  from compare import compare_contracts, render_comparison_html
80
  from obligations import extract_obligations, render_obligations_html
81
  from compliance import check_compliance, render_compliance_html
82
+ from ocr_engine import parse_pdf_smart, get_ocr_status
83
+ from chatbot import index_contract, chat_respond, get_chatbot_status
84
+ from redlining import generate_redlines, render_redlines_html
85
 
86
  # ═══════════════════════════════════════════════════════════════════════
87
  # 1. CONFIGURATION — FIXED label mapping (41 labels, index 6 restored)
 
346
  # ═══════════════════════════════════════════════════════════════════════
347
 
348
  def parse_pdf(file_path):
349
+ """Smart PDF parser: native text extraction with OCR fallback for scanned PDFs."""
350
+ text, error, method = parse_pdf_smart(file_path)
351
+ if text:
352
+ if method == "ocr":
353
+ print(f"[ClauseGuard] PDF extracted via OCR ({len(text)} chars)")
354
+ return text, None
355
+ if error:
356
+ return None, error
357
+ return None, "Could not extract text from PDF. Try uploading a clearer scan or digital PDF."
 
 
 
 
 
358
 
359
  def parse_docx(file_path):
360
  if not _HAS_DOCX:
 
1202
  def run_analysis(text):
1203
  if not text or len(text.strip()) < 50:
1204
  err_html = '<p style="color:#dc2626;padding:16px;">Document too short (minimum 50 characters)</p>'
1205
+ return [err_html] * 8 + [None, None, "", None]
1206
  result, error = analyze_contract(text)
1207
  if error:
1208
  err_html = f'<p style="color:#dc2626;padding:16px;">{error}</p>'
1209
+ return [err_html] * 8 + [None, None, error, None]
1210
 
1211
  # FIXED: per-session temp files
1212
  session_id = uuid.uuid4().hex[:8]
 
1219
  with open(csv_path, "w") as f:
1220
  f.write(csv_content)
1221
 
1222
+ # Generate redline suggestions (Tier 1 template + Tier 3 LLM for critical/high)
1223
+ redlines = generate_redlines(result, use_llm=True)
1224
+ redlines_html = render_redlines_html(redlines)
1225
+
1226
  return [
1227
  render_summary(result),
1228
  render_clause_cards(result),
 
1231
  render_document_viewer(result),
1232
  render_obligations_html(result.get("obligations", [])),
1233
  render_compliance_html(result.get("compliance", {})),
1234
+ redlines_html,
1235
  json_path,
1236
  csv_path,
1237
  "Analysis complete",
1238
+ result, # Store analysis result for chatbot
1239
  ]
1240
 
1241
  def do_clear():
1242
+ return [""] * 8 + [None, None, "", None]
1243
 
1244
  # ── Example contracts ──
1245
  SPOTIFY_TOS = """By using the Spotify Service, you agree to be bound by these Terms of Use.
 
1323
  """
1324
  ) as demo:
1325
 
1326
+ # ── Shared State (for chatbot RAG) ──────────────────────────────
1327
+ analysis_state = gr.State(None) # Full analysis result dict
1328
+ chunks_state = gr.State([]) # Contract text chunks for RAG
1329
+ embeddings_state = gr.State(None) # Chunk embeddings (numpy array)
1330
+
1331
  gr.HTML("""
1332
  <div style="display:flex;align-items:center;justify-content:space-between;padding:12px 0;border-bottom:2px solid #e5e7eb;margin-bottom:16px;">
1333
  <div>
1334
  <h1 style="font-size:24px;font-weight:700;margin:0;color:#1f2937;">🛡️ ClauseGuard</h1>
1335
+ <p style="font-size:13px;color:#6b7280;margin:4px 0 0 0;">AI-Powered Legal Contract Analysis · 41 Clause Categories · Risk Scoring · ML NER · NLI Contradictions · Compliance · Obligations · <strong>Q&A Chatbot</strong> · <strong>Clause Redlining</strong> · <strong>OCR</strong></p>
1336
  </div>
1337
+ <div style="font-size:12px;color:#9ca3af;">v4.0 · Precision Legal AI</div>
1338
  </div>
1339
  """)
1340
 
1341
+ # ── Main Tabs: Analysis vs Comparison vs Chatbot ──
1342
  with gr.Tabs():
1343
 
1344
  # ═══════ TAB 1: Single Contract Analysis ═══════
 
1355
  with gr.Column(scale=3):
1356
  text_input = gr.Textbox(
1357
  label="📄 Contract Text",
1358
+ placeholder="Paste contract text here, or upload a file above...\n\n💡 Scanned PDFs are automatically processed with OCR.",
1359
  lines=14,
1360
  max_lines=40,
1361
  show_copy_button=True,
 
1398
  obligations_html = gr.HTML(label="Obligation Tracker")
1399
  with gr.Tab("⚖️ Compliance"):
1400
  compliance_html = gr.HTML(label="Compliance Checker")
1401
+ with gr.Tab("✏️ Redlining"):
1402
+ redlining_html = gr.HTML(label="Clause Redlining Suggestions")
1403
 
1404
  # ═══════ TAB 2: Contract Comparison ═══════
1405
  with gr.Tab("🔀 Compare Contracts"):
 
1448
  with gr.Column(scale=2):
1449
  comp_json = gr.JSON(label="Raw Comparison Data")
1450
 
1451
+ # ═══════ TAB 3: Contract Q&A Chatbot ═══════
1452
+ with gr.Tab("💬 Contract Q&A"):
1453
+ gr.HTML("""
1454
+ <div style="padding:12px 16px;background:linear-gradient(135deg,#eff6ff,#faf5ff);border-radius:10px;margin-bottom:12px;border:1px solid #e5e7eb;">
1455
+ <div style="display:flex;align-items:center;gap:8px;margin-bottom:6px;">
1456
+ <span style="font-size:20px;">💬</span>
1457
+ <h3 style="margin:0;font-size:16px;color:#1f2937;">Contract Q&A Chatbot</h3>
1458
+ </div>
1459
+ <p style="font-size:12px;color:#6b7280;margin:0;line-height:1.5;">
1460
+ Ask questions about your analyzed contract. The chatbot uses <strong>RAG</strong> (Retrieval-Augmented Generation)
1461
+ to find relevant clauses and generate accurate answers grounded in your contract text.
1462
+ <br>
1463
+ <strong>Step 1:</strong> Analyze a contract in the "📄 Single Contract Analysis" tab.
1464
+ <strong>Step 2:</strong> Come here and ask questions!
1465
+ </p>
1466
+ </div>
1467
+ """)
1468
+
1469
+ chatbot_index_status = gr.Textbox(
1470
+ label="📡 Chatbot Index Status",
1471
+ interactive=False,
1472
+ lines=1,
1473
+ value="⏳ No contract indexed yet — analyze a contract first",
1474
+ )
1475
+
1476
def _chatbot_fn(message, history, chunks, embeddings, analysis):
    """Adapt ``chat_respond`` to the callable signature ChatInterface expects.

    The three trailing arguments are supplied through ``additional_inputs``
    (chunks, embeddings and analysis state); every value produced by
    ``chat_respond`` is passed straight through to the caller.
    """
    reply_stream = chat_respond(message, history, chunks, embeddings, analysis)
    yield from reply_stream
1479
+
1480
+ gr.ChatInterface(
1481
+ fn=_chatbot_fn,
1482
+ type="messages",
1483
+ additional_inputs=[chunks_state, embeddings_state, analysis_state],
1484
+ examples=[
1485
+ "What are the main risks in this contract?",
1486
+ "Who are the parties involved?",
1487
+ "What happens if the contract is terminated?",
1488
+ "Are there any liability limitations?",
1489
+ "What are my obligations under this contract?",
1490
+ "Is there an arbitration clause?",
1491
+ "What is the governing law?",
1492
+ "Summarize the key terms in plain language.",
1493
+ ],
1494
+ title="",
1495
+ description="",
1496
+ )
1497
+
1498
  # ── Events ──
1499
  def _load_file(file):
1500
  text, err = parse_document(file) if file else ("", "No file")
 
1502
  return "", err
1503
  return text, "Loaded successfully" if not err else err
1504
 
1505
def _analysis_and_index(text):
    """Run the analysis and, if it succeeded, index the text for the chatbot.

    Args:
        text: Raw contract text taken from the input textbox.

    Returns:
        The 12 values from ``run_analysis`` (8 HTML panels, json path,
        csv path, status, result) followed by the three chatbot values:
        chunks, embeddings and the index status message.
    """
    analysis_outputs = run_analysis(text)

    # run_analysis stores the result last and leaves it None on every failure
    # path (text too short, analysis error). Indexing that text would either
    # fail or leave the chatbot pointing at a contract with no analysis, so
    # reset the chatbot state to its initial values instead.
    if analysis_outputs[-1] is None:
        return analysis_outputs + [
            [],
            None,
            "⏳ No contract indexed yet — analyze a contract first",
        ]

    # Index for chatbot (uses the raw text)
    chunks, embeddings, index_status = index_contract(text)
    return analysis_outputs + [chunks, embeddings, index_status]
1516
+
1517
  load_btn.click(_load_file, inputs=[file_input], outputs=[text_input, load_status])
1518
  comp_load_a.click(_load_file, inputs=[comp_file_a], outputs=[comp_text_a, comp_status_a])
1519
  comp_load_b.click(_load_file, inputs=[comp_file_b], outputs=[comp_text_b, comp_status_b])
1520
 
1521
  scan_btn.click(
1522
+ _analysis_and_index,
1523
  inputs=[text_input],
1524
+ outputs=[
1525
+ summary_html, clauses_html, entities_html, nli_html,
1526
+ doc_html, obligations_html, compliance_html, redlining_html,
1527
+ json_file, csv_file, status_msg, analysis_state,
1528
+ chunks_state, embeddings_state, chatbot_index_status,
1529
+ ]
1530
  )
1531
 
1532
  clear_btn.click(
1533
+ lambda: [""] * 8 + [None, None, "", None, [], None, "⏳ No contract indexed"],
1534
+ outputs=[
1535
+ summary_html, clauses_html, entities_html, nli_html,
1536
+ doc_html, obligations_html, compliance_html, redlining_html,
1537
+ json_file, csv_file, status_msg, analysis_state,
1538
+ chunks_state, embeddings_state, chatbot_index_status,
1539
+ ]
1540
  )
1541
 
1542
  comp_btn.click(
 
1552
  · Model: <a href="https://huggingface.co/Mokshith31/legalbert-contract-clause-classification" style="color:#6b7280;">Legal-BERT + CUAD (41 classes)</a>
1553
  · NER: <a href="https://huggingface.co/matterstack/legal-bert-ner" style="color:#6b7280;">Legal-BERT NER</a>
1554
  · NLI: <a href="https://huggingface.co/cross-encoder/nli-deberta-v3-base" style="color:#6b7280;">DeBERTa-v3 NLI</a>
1555
+ · LLM: <a href="https://huggingface.co/Qwen/Qwen2.5-7B-Instruct" style="color:#6b7280;">Qwen2.5-7B</a>
1556
+ · OCR: <a href="https://github.com/mindee/doctr" style="color:#6b7280;">docTR</a>
1557
  · Dataset: <a href="https://huggingface.co/datasets/theatticusproject/cuad-qa" style="color:#6b7280;">CUAD</a>
1558
  · <a href="https://huggingface.co/spaces/gaurv007/ClauseGuard" style="color:#6b7280;">ClauseGuard Space</a>
1559
  </p>