Spaces:
Sleeping
Sleeping
🚨 FIX: Restore missing Gradio UI — app was broken (no UI rendered)
#5
by gaurv007 - opened
app.py
CHANGED
|
@@ -1,1469 +1 @@
|
|
| 1 |
-
|
| 2 |
-
ClauseGuard — World's Best Legal Contract Analysis Tool (v4.3)
|
| 3 |
-
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
-
PERF v4.3:
|
| 5 |
-
β’ PERF: Upgraded embedder to BAAI/bge-small-en-v1.5 (+21% retrieval accuracy)
|
| 6 |
-
β’ PERF: Batched clause classification (single forward pass, batch_size=8)
|
| 7 |
-
β’ PERF: ONNX INT8 quantized model support (2-4x faster on CPU)
|
| 8 |
-
β’ PERF: torch.set_num_threads(2) to prevent CPU thrashing
|
| 9 |
-
β’ NEW: ml/export_onnx_v2.py β full mergeβONNXβquantize pipeline
|
| 10 |
-
|
| 11 |
-
Fixes in v4.2:
|
| 12 |
-
β’ FIX: NLI now uses CrossEncoder.predict() β contradictions actually work
|
| 13 |
-
β’ FIX: BoundedCache uses threading.RLock β no more race conditions
|
| 14 |
-
β’ FIX: Pre-compiled ALL regex patterns at module level (perf)
|
| 15 |
-
β’ FIX: Added missing regex labels to RISK_MAP/DESC_MAP
|
| 16 |
-
β’ FIX: Extension risk formula matches backend
|
| 17 |
-
β’ FIX: Extension API_BASE URL corrected
|
| 18 |
-
β’ FIX: API CORS localhost requires explicit opt-in
|
| 19 |
-
|
| 20 |
-
Fixes in v4.1:
|
| 21 |
-
β’ FIX: Bounded LRU caches (chunk_cache, prediction_cache) β no more memory leaks
|
| 22 |
-
β’ FIX: NLI input format β pass (text_a, text_b) tuple, not [SEP]-concatenated string
|
| 23 |
-
β’ FIX: Classifier max_length raised to 512 (was 256 β truncating legal clauses)
|
| 24 |
-
β’ FIX: Risk score formula β absolute risk, not normalized by total_clauses
|
| 25 |
-
β’ FIX: Train/inference alignment β use softmax+argmax for single-label model
|
| 26 |
-
β’ FIX: Added missing regex fallback patterns for more CUAD categories
|
| 27 |
-
β’ FIX: Entity extraction batching β single pipeline call instead of sequential
|
| 28 |
-
β’ PERF: Shared model singleton via models.py module
|
| 29 |
-
β’ PERF: LRU-bounded caches everywhere
|
| 30 |
-
|
| 31 |
-
Carried from v4.0:
|
| 32 |
-
β’ OCR support for scanned PDFs (docTR engine with smart native/scanned routing)
|
| 33 |
-
β’ Contract Q&A Chatbot (RAG: embedding retrieval + HF Inference API streaming)
|
| 34 |
-
β’ Clause Redlining (3-tier: template lookup + RAG + LLM refinement)
|
| 35 |
-
β’ Fixed CUAD label mapping (added missing index 6)
|
| 36 |
-
β’ Structure-aware clause splitting
|
| 37 |
-
β’ Real NLI contradiction detection via cross-encoder model
|
| 38 |
-
β’ ML-based Legal NER with regex fallback
|
| 39 |
-
β’ Semantic compliance checking with negation handling
|
| 40 |
-
β’ Improved obligation extraction with false-positive filtering
|
| 41 |
-
β’ LLM-powered clause explanations
|
| 42 |
-
β’ Per-session temp files (no collision)
|
| 43 |
-
β’ Model health reporting
|
| 44 |
-
|
| 45 |
-
Models:
|
| 46 |
-
β’ Clause classifier: Mokshith31/legalbert-contract-clause-classification
|
| 47 |
-
(LoRA adapter on nlpaueb/legal-bert-base-uncased, 41 CUAD classes)
|
| 48 |
-
β’ Legal NER: matterstack/legal-bert-ner (token classification)
|
| 49 |
-
β’ NLI: cross-encoder/nli-deberta-v3-base (contradiction detection)
|
| 50 |
-
β’ Embeddings: sentence-transformers/all-MiniLM-L6-v2 (RAG retrieval)
|
| 51 |
-
β’ OCR: docTR fast_base + crnn_vgg16_bn (scanned PDF extraction)
|
| 52 |
-
β’ LLM: Qwen/Qwen2.5-7B-Instruct via HF Inference API (chatbot + redlining)
|
| 53 |
-
"""
|
| 54 |
-
|
| 55 |
-
import os
|
| 56 |
-
import re
|
| 57 |
-
import json
|
| 58 |
-
import csv
|
| 59 |
-
import io
|
| 60 |
-
import uuid
|
| 61 |
-
import tempfile
|
| 62 |
-
import hashlib
|
| 63 |
-
import threading
|
| 64 |
-
from collections import defaultdict, OrderedDict
|
| 65 |
-
from datetime import datetime
|
| 66 |
-
from functools import lru_cache
|
| 67 |
-
|
| 68 |
-
import gradio as gr
|
| 69 |
-
import numpy as np
|
| 70 |
-
|
| 71 |
-
# ββ Document parsers (soft-fail) ββββββββββββββββββββββββββββββββββββ
|
| 72 |
-
try:
|
| 73 |
-
import pdfplumber
|
| 74 |
-
_HAS_PDF = True
|
| 75 |
-
except Exception:
|
| 76 |
-
_HAS_PDF = False
|
| 77 |
-
|
| 78 |
-
try:
|
| 79 |
-
from docx import Document as DocxDocument
|
| 80 |
-
_HAS_DOCX = True
|
| 81 |
-
except Exception:
|
| 82 |
-
_HAS_DOCX = False
|
| 83 |
-
|
| 84 |
-
# ββ PyTorch / Transformers (soft-fail) ββββββββββββββββββββββββββββββββ
|
| 85 |
-
_HAS_TORCH = False
|
| 86 |
-
_HAS_NER_MODEL = False
|
| 87 |
-
_HAS_NLI_MODEL = False
|
| 88 |
-
|
| 89 |
-
try:
|
| 90 |
-
import torch
|
| 91 |
-
from transformers import (
|
| 92 |
-
AutoTokenizer, AutoModelForSequenceClassification,
|
| 93 |
-
AutoModelForTokenClassification, pipeline
|
| 94 |
-
)
|
| 95 |
-
from peft import PeftModel
|
| 96 |
-
_HAS_TORCH = True
|
| 97 |
-
# PERF v4.3: Limit PyTorch threads to avoid CPU thrashing under concurrent requests.
|
| 98 |
-
# HF Spaces CPU-basic has 2 vCPUs. Reserve 1 thread for Gradio server.
|
| 99 |
-
torch.set_num_threads(2)
|
| 100 |
-
torch.set_num_interop_threads(1)
|
| 101 |
-
except Exception:
|
| 102 |
-
pass
|
| 103 |
-
|
| 104 |
-
# ββ ONNX Runtime (soft-fail, for quantized model) βββββββββββββββββββββ
|
| 105 |
-
_HAS_ORT = False
|
| 106 |
-
try:
|
| 107 |
-
from optimum.onnxruntime import ORTModelForSequenceClassification as _ORTModel
|
| 108 |
-
_HAS_ORT = True
|
| 109 |
-
except ImportError:
|
| 110 |
-
pass
|
| 111 |
-
|
| 112 |
-
# ββ CrossEncoder for NLI (soft-fail) ββββββββββββββββββββββββββββββββββ
|
| 113 |
-
_HAS_CROSS_ENCODER = False
|
| 114 |
-
try:
|
| 115 |
-
from sentence_transformers import CrossEncoder as _CrossEncoder
|
| 116 |
-
_HAS_CROSS_ENCODER = True
|
| 117 |
-
except ImportError:
|
| 118 |
-
pass
|
| 119 |
-
|
| 120 |
-
# ββ Import submodules ββββββββββββββββββββββββββββοΏ½οΏ½οΏ½ββββββββββββββββββ
|
| 121 |
-
from compare import compare_contracts, render_comparison_html
|
| 122 |
-
from obligations import extract_obligations, render_obligations_html
|
| 123 |
-
from compliance import check_compliance, render_compliance_html
|
| 124 |
-
from ocr_engine import parse_pdf_smart, get_ocr_status
|
| 125 |
-
from chatbot import index_contract, chat_respond, get_chatbot_status
|
| 126 |
-
from redlining import generate_redlines, render_redlines_html
|
| 127 |
-
|
| 128 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 129 |
-
# 1. CONFIGURATION β FIXED label mapping (41 labels, index 6 restored)
|
| 130 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 131 |
-
|
| 132 |
-
CUAD_LABELS = [
|
| 133 |
-
"Document Name", # 0
|
| 134 |
-
"Parties", # 1
|
| 135 |
-
"Agreement Date", # 2
|
| 136 |
-
"Effective Date", # 3
|
| 137 |
-
"Expiration Date", # 4
|
| 138 |
-
"Renewal Term", # 5
|
| 139 |
-
"Notice Period to Terminate Renewal", # 6 β WAS MISSING
|
| 140 |
-
"Governing Law", # 7
|
| 141 |
-
"Most Favored Nation", # 8
|
| 142 |
-
"Non-Compete", # 9
|
| 143 |
-
"Exclusivity", # 10
|
| 144 |
-
"No-Solicit of Customers", # 11
|
| 145 |
-
"No-Solicit of Employees", # 12
|
| 146 |
-
"Non-Disparagement", # 13
|
| 147 |
-
"Termination for Convenience", # 14
|
| 148 |
-
"ROFR/ROFO/ROFN", # 15
|
| 149 |
-
"Change of Control", # 16
|
| 150 |
-
"Anti-Assignment", # 17
|
| 151 |
-
"Revenue/Profit Sharing", # 18
|
| 152 |
-
"Price Restriction", # 19
|
| 153 |
-
"Minimum Commitment", # 20
|
| 154 |
-
"Volume Restriction", # 21
|
| 155 |
-
"IP Ownership Assignment", # 22
|
| 156 |
-
"Joint IP Ownership", # 23
|
| 157 |
-
"License Grant", # 24
|
| 158 |
-
"Non-Transferable License", # 25
|
| 159 |
-
"Affiliate License-Licensor", # 26
|
| 160 |
-
"Affiliate License-Licensee", # 27
|
| 161 |
-
"Unlimited/All-You-Can-Eat License", # 28
|
| 162 |
-
"Irrevocable or Perpetual License", # 29
|
| 163 |
-
"Source Code Escrow", # 30
|
| 164 |
-
"Post-Termination Services", # 31
|
| 165 |
-
"Audit Rights", # 32
|
| 166 |
-
"Uncapped Liability", # 33
|
| 167 |
-
"Cap on Liability", # 34
|
| 168 |
-
"Liquidated Damages", # 35
|
| 169 |
-
"Warranty Duration", # 36
|
| 170 |
-
"Insurance", # 37
|
| 171 |
-
"Covenant Not to Sue", # 38
|
| 172 |
-
"Third Party Beneficiary", # 39
|
| 173 |
-
"Other", # 40
|
| 174 |
-
]
|
| 175 |
-
|
| 176 |
-
_UNFAIR_LABELS = [
|
| 177 |
-
"Limitation of liability", "Unilateral termination", "Unilateral change",
|
| 178 |
-
"Content removal", "Contract by using", "Choice of law",
|
| 179 |
-
"Jurisdiction", "Arbitration"
|
| 180 |
-
]
|
| 181 |
-
|
| 182 |
-
# FIX v4.2: Include regex-only labels that aren't in CUAD or Unfair lists
|
| 183 |
-
_EXTRA_REGEX_LABELS = [
|
| 184 |
-
"Indemnification", "Confidentiality", "Force Majeure", "Penalties"
|
| 185 |
-
]
|
| 186 |
-
|
| 187 |
-
_ALL_LABELS = CUAD_LABELS + _UNFAIR_LABELS + _EXTRA_REGEX_LABELS
|
| 188 |
-
|
| 189 |
-
RISK_MAP = {
|
| 190 |
-
# Critical
|
| 191 |
-
"Uncapped Liability": "CRITICAL",
|
| 192 |
-
"Arbitration": "CRITICAL",
|
| 193 |
-
"IP Ownership Assignment": "CRITICAL",
|
| 194 |
-
"Termination for Convenience": "CRITICAL",
|
| 195 |
-
"Limitation of liability": "CRITICAL",
|
| 196 |
-
"Unilateral termination": "CRITICAL",
|
| 197 |
-
"Liquidated Damages": "CRITICAL",
|
| 198 |
-
# High
|
| 199 |
-
"Non-Compete": "HIGH",
|
| 200 |
-
"Exclusivity": "HIGH",
|
| 201 |
-
"Change of Control": "HIGH",
|
| 202 |
-
"No-Solicit of Customers": "HIGH",
|
| 203 |
-
"No-Solicit of Employees": "HIGH",
|
| 204 |
-
"Unilateral change": "HIGH",
|
| 205 |
-
"Content removal": "HIGH",
|
| 206 |
-
"Anti-Assignment": "HIGH",
|
| 207 |
-
"Notice Period to Terminate Renewal": "HIGH",
|
| 208 |
-
# Medium
|
| 209 |
-
"Governing Law": "MEDIUM",
|
| 210 |
-
"Jurisdiction": "MEDIUM",
|
| 211 |
-
"Choice of law": "MEDIUM",
|
| 212 |
-
"Price Restriction": "MEDIUM",
|
| 213 |
-
"Minimum Commitment": "MEDIUM",
|
| 214 |
-
"Volume Restriction": "MEDIUM",
|
| 215 |
-
"Non-Disparagement": "MEDIUM",
|
| 216 |
-
"Most Favored Nation": "MEDIUM",
|
| 217 |
-
"Revenue/Profit Sharing": "MEDIUM",
|
| 218 |
-
"Warranty Duration": "MEDIUM",
|
| 219 |
-
# Low
|
| 220 |
-
"Document Name": "LOW",
|
| 221 |
-
"Parties": "LOW",
|
| 222 |
-
"Agreement Date": "LOW",
|
| 223 |
-
"Effective Date": "LOW",
|
| 224 |
-
"Expiration Date": "LOW",
|
| 225 |
-
"Renewal Term": "LOW",
|
| 226 |
-
"Joint IP Ownership": "LOW",
|
| 227 |
-
"License Grant": "LOW",
|
| 228 |
-
"Non-Transferable License": "LOW",
|
| 229 |
-
"Affiliate License-Licensor": "LOW",
|
| 230 |
-
"Affiliate License-Licensee": "LOW",
|
| 231 |
-
"Unlimited/All-You-Can-Eat License": "LOW",
|
| 232 |
-
"Irrevocable or Perpetual License": "LOW",
|
| 233 |
-
"Source Code Escrow": "LOW",
|
| 234 |
-
"Post-Termination Services": "LOW",
|
| 235 |
-
"Audit Rights": "LOW",
|
| 236 |
-
"Cap on Liability": "LOW",
|
| 237 |
-
"Insurance": "LOW",
|
| 238 |
-
"Covenant Not to Sue": "LOW",
|
| 239 |
-
"Third Party Beneficiary": "LOW",
|
| 240 |
-
"Other": "LOW",
|
| 241 |
-
"ROFR/ROFO/ROFN": "LOW",
|
| 242 |
-
"Contract by using": "LOW",
|
| 243 |
-
# FIX v4.2: Added regex-only labels that were missing from RISK_MAP
|
| 244 |
-
"Indemnification": "HIGH",
|
| 245 |
-
"Confidentiality": "MEDIUM",
|
| 246 |
-
"Force Majeure": "LOW",
|
| 247 |
-
"Penalties": "HIGH",
|
| 248 |
-
}
|
| 249 |
-
|
| 250 |
-
DESC_MAP = {label: label.replace("_", " ") for label in _ALL_LABELS}
|
| 251 |
-
DESC_MAP.update({
|
| 252 |
-
"Limitation of liability": "Company limits or excludes liability for losses, data breaches, or service failures.",
|
| 253 |
-
"Unilateral termination": "Company can terminate your account at any time without reason.",
|
| 254 |
-
"Unilateral change": "Company can change terms at any time without your consent.",
|
| 255 |
-
"Content removal": "Company can delete your content without notice or justification.",
|
| 256 |
-
"Contract by using": "You are bound to the contract simply by using the service.",
|
| 257 |
-
"Choice of law": "Governing law may differ from your country, reducing your legal protections.",
|
| 258 |
-
"Jurisdiction": "Disputes must be resolved in a jurisdiction that may disadvantage you.",
|
| 259 |
-
"Arbitration": "Forces disputes to arbitration instead of court. You waive your right to sue.",
|
| 260 |
-
"Uncapped Liability": "No financial limit on damages the party may be liable for.",
|
| 261 |
-
"Cap on Liability": "Maximum financial liability is explicitly capped.",
|
| 262 |
-
"Non-Compete": "Restrictions on competing with the counter-party.",
|
| 263 |
-
"Exclusivity": "Obligation to deal exclusively with one party.",
|
| 264 |
-
"IP Ownership Assignment": "Intellectual property rights are transferred entirely.",
|
| 265 |
-
"Termination for Convenience": "Either party may terminate without cause or notice.",
|
| 266 |
-
"Governing Law": "Specifies which jurisdiction's laws apply.",
|
| 267 |
-
"Non-Disparagement": "Agreement not to speak negatively about the other party.",
|
| 268 |
-
"ROFR/ROFO/ROFN": "Right of First Refusal / Offer / Negotiation clause.",
|
| 269 |
-
"Change of Control": "Provisions triggered by ownership or control changes.",
|
| 270 |
-
"Anti-Assignment": "Restrictions on transferring contract rights to third parties.",
|
| 271 |
-
"Liquidated Damages": "Pre-determined damages amount for breach of contract.",
|
| 272 |
-
"Source Code Escrow": "Third-party holds source code for release under defined conditions.",
|
| 273 |
-
"Post-Termination Services": "Services to be provided after the contract ends.",
|
| 274 |
-
"Audit Rights": "Right to inspect records or verify compliance.",
|
| 275 |
-
"Warranty Duration": "Length of time warranties remain in effect.",
|
| 276 |
-
"Covenant Not to Sue": "Agreement not to bring legal action against a party.",
|
| 277 |
-
"Third Party Beneficiary": "Non-party who benefits from the contract terms.",
|
| 278 |
-
"Insurance": "Insurance coverage requirements.",
|
| 279 |
-
"Revenue/Profit Sharing": "Revenue or profit sharing arrangements between parties.",
|
| 280 |
-
"Price Restriction": "Restrictions on pricing or discounting.",
|
| 281 |
-
"Minimum Commitment": "Minimum purchase or usage commitment.",
|
| 282 |
-
"Volume Restriction": "Limits on volume of goods or services.",
|
| 283 |
-
"License Grant": "Permission to use intellectual property.",
|
| 284 |
-
"Non-Transferable License": "License that cannot be transferred to third parties.",
|
| 285 |
-
"Irrevocable or Perpetual License": "License that cannot be revoked or lasts indefinitely.",
|
| 286 |
-
"Unlimited/All-You-Can-Eat License": "License with no usage limits.",
|
| 287 |
-
"Notice Period to Terminate Renewal": "Required notice period before automatic renewal.",
|
| 288 |
-
# FIX v4.2: Added descriptions for regex-only labels
|
| 289 |
-
"Indemnification": "Obligation to compensate the other party for losses or damages.",
|
| 290 |
-
"Confidentiality": "Restrictions on sharing proprietary or sensitive information.",
|
| 291 |
-
"Force Majeure": "Excuses performance due to extraordinary events beyond control.",
|
| 292 |
-
"Penalties": "Financial penalties for breach or late performance.",
|
| 293 |
-
})
|
| 294 |
-
|
| 295 |
-
RISK_WEIGHTS = {"CRITICAL": 40, "HIGH": 20, "MEDIUM": 10, "LOW": 3}
|
| 296 |
-
|
| 297 |
-
RISK_STYLES = {
|
| 298 |
-
"CRITICAL": ("#dc2626", "#fef2f2", "β οΈ"),
|
| 299 |
-
"HIGH": ("#ea580c", "#fff7ed", "β‘"),
|
| 300 |
-
"MEDIUM": ("#ca8a04", "#fefce8", "π"),
|
| 301 |
-
"LOW": ("#16a34a", "#f0fdf4", "β"),
|
| 302 |
-
}
|
| 303 |
-
|
| 304 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 305 |
-
# FIX v4.1: Per-class thresholds aligned with single-label softmax
|
| 306 |
-
# The model was trained with cross-entropy (single-label), so inference
|
| 307 |
-
# now uses softmax+argmax, not sigmoid. Thresholds apply to softmax probs.
|
| 308 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 309 |
-
_CUAD_THRESHOLDS = {}
|
| 310 |
-
_WEAK_CLASSES = {0, 1, 2, 7, 9, 21, 22, 27, 37, 38}
|
| 311 |
-
for _i in range(41):
|
| 312 |
-
if _i in _WEAK_CLASSES:
|
| 313 |
-
_CUAD_THRESHOLDS[_i] = 0.85 # Only flag if very confident (these classes are unreliable)
|
| 314 |
-
else:
|
| 315 |
-
_CUAD_THRESHOLDS[_i] = 0.40 # Reasonable threshold for softmax outputs
|
| 316 |
-
|
| 317 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 318 |
-
# FIX v4.1: Bounded LRU Cache utility (replaces unbounded dicts)
|
| 319 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 320 |
-
|
| 321 |
-
class BoundedCache:
    """Thread-safe bounded LRU cache.

    Backed by an OrderedDict guarded by an RLock: the compound operations
    involved (lookup + move_to_end + popitem) are NOT atomic even under the
    GIL, so every public method takes the lock (FIX v4.2). Once *maxsize*
    entries are held, inserting a new key evicts the least-recently-used one.
    """

    def __init__(self, maxsize=1000):
        self._maxsize = maxsize
        self._cache = OrderedDict()
        self._lock = threading.RLock()

    def get(self, key, default=None):
        """Return the cached value for *key* (marking it most-recent), else *default*."""
        with self._lock:
            try:
                value = self._cache[key]
            except KeyError:
                return default
            self._cache.move_to_end(key)
            return value

    def put(self, key, value):
        """Insert or refresh *key* → *value*, evicting the oldest entry when full."""
        with self._lock:
            if key not in self._cache and len(self._cache) >= self._maxsize:
                # New key and at capacity: drop the least-recently-used entry.
                self._cache.popitem(last=False)
            self._cache[key] = value
            self._cache.move_to_end(key)

    def __contains__(self, key):
        with self._lock:
            return key in self._cache

    def __len__(self):
        with self._lock:
            return len(self._cache)
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 358 |
-
# 2. MODEL LOADING
|
| 359 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 360 |
-
|
| 361 |
-
# Shared model handles, populated at import time by the _load_*_model()
# functions below. A value of None means "not loaded" and triggers the
# regex/heuristic fallbacks in the classification code.
cuad_tokenizer = None  # tokenizer paired with the CUAD clause classifier
cuad_model = None      # ONNX (ORTModel) or PyTorch+PEFT classifier
ner_pipeline = None    # HF token-classification pipeline for legal NER
nli_model = None  # FIX v4.2: CrossEncoder instead of pipeline
# Human-readable load state per model, surfaced by get_model_status_text().
_model_status = {"cuad": "not_loaded", "ner": "not_loaded", "nli": "not_loaded"}
|
| 366 |
-
|
| 367 |
-
def _load_cuad_model():
    """Load the CUAD clause classifier into module globals.

    Load order: ONNX INT8 (fastest on CPU) → PyTorch base + PEFT adapter →
    nothing (callers then use the regex fallback). Updates ``cuad_model``,
    ``cuad_tokenizer`` and ``_model_status["cuad"]``; never raises.
    """
    global cuad_tokenizer, cuad_model, _model_status
    # PERF v4.3: Try ONNX quantized model first (2-4x faster on CPU)
    onnx_model_path = os.environ.get("ONNX_MODEL_PATH", "")
    onnx_hub_id = os.environ.get("ONNX_HUB_MODEL_ID", "gaurv007/clauseguard-onnx-int8")

    if _HAS_ORT:
        # Local path (if configured) takes precedence over the Hub model id.
        for source in [onnx_model_path, onnx_hub_id]:
            if not source:
                continue
            try:
                print(f"[ClauseGuard] Trying ONNX model: {source}")
                cuad_model = _ORTModel.from_pretrained(source, file_name="model_quantized.onnx")
                cuad_tokenizer = AutoTokenizer.from_pretrained(source)
                _model_status["cuad"] = "loaded (ONNX INT8)"
                print(f"[ClauseGuard] ONNX INT8 model loaded from {source}")
                return
            except Exception as e:
                # Soft-fail: try the next source, then the PyTorch path below.
                print(f"[ClauseGuard] ONNX load failed from {source}: {e}")

    # Fallback to PyTorch PEFT model
    if not _HAS_TORCH:
        print("[ClauseGuard] PyTorch not available β using regex fallback")
        _model_status["cuad"] = "unavailable"
        return
    try:
        # LoRA adapter on top of legal-bert; num_labels must match the 41 CUAD classes.
        base = "nlpaueb/legal-bert-base-uncased"
        adapter = "Mokshith31/legalbert-contract-clause-classification"
        print(f"[ClauseGuard] Loading CUAD classifier (PyTorch): {adapter}")
        cuad_tokenizer = AutoTokenizer.from_pretrained(base)
        base_model = AutoModelForSequenceClassification.from_pretrained(
            base, num_labels=41, ignore_mismatched_sizes=True
        )
        cuad_model = PeftModel.from_pretrained(base_model, adapter)
        cuad_model.eval()
        _model_status["cuad"] = "loaded (PyTorch)"
        print("[ClauseGuard] CUAD model loaded successfully (PyTorch)")
    except Exception as e:
        print(f"[ClauseGuard] CUAD model load failed: {e}")
        # Reset both handles so classify_cuad() takes the regex path cleanly.
        cuad_tokenizer = None
        cuad_model = None
        _model_status["cuad"] = f"failed: {e}"
|
| 409 |
-
|
| 410 |
-
def _load_ner_model():
    """Load the legal NER pipeline into ``ner_pipeline``.

    On any failure the app silently falls back to regex-based entity
    extraction; this function only updates globals and never raises.
    """
    global ner_pipeline, _model_status, _HAS_NER_MODEL
    if not _HAS_TORCH:
        _model_status["ner"] = "unavailable"
        return
    try:
        print("[ClauseGuard] Loading Legal NER model: matterstack/legal-bert-ner")
        ner_pipeline = pipeline(
            "ner",
            model="matterstack/legal-bert-ner",
            # Merge sub-token predictions into whole-entity spans.
            aggregation_strategy="simple",
            device=-1,  # CPU
        )
        _HAS_NER_MODEL = True
        _model_status["ner"] = "loaded"
        print("[ClauseGuard] Legal NER model loaded successfully")
    except Exception as e:
        print(f"[ClauseGuard] Legal NER model load failed (using regex fallback): {e}")
        _model_status["ner"] = f"failed: {e}"
|
| 429 |
-
|
| 430 |
-
def _load_nli_model():
    """Load the NLI CrossEncoder into ``nli_model``.

    FIX v4.2 uses sentence-transformers' CrossEncoder (predict() on
    sentence pairs) rather than a transformers pipeline. On failure the
    contradiction detector falls back to heuristics; never raises.
    """
    global nli_model, _model_status, _HAS_NLI_MODEL
    if not _HAS_CROSS_ENCODER:
        _model_status["nli"] = "unavailable (sentence-transformers not installed)"
        return
    try:
        print("[ClauseGuard] Loading NLI model: cross-encoder/nli-deberta-v3-base (CrossEncoder)")
        nli_model = _CrossEncoder("cross-encoder/nli-deberta-v3-base")
        _HAS_NLI_MODEL = True
        _model_status["nli"] = "loaded"
        print("[ClauseGuard] NLI CrossEncoder loaded successfully")
    except Exception as e:
        print(f"[ClauseGuard] NLI model load failed (using heuristic fallback): {e}")
        _model_status["nli"] = f"failed: {e}"
|
| 444 |
-
|
| 445 |
-
def get_model_status_text():
    """Return a single-line, human-readable summary of model load status.

    Reads the module-level ``_model_status`` dict and renders one
    "icon label: status" segment per model, joined by " · ".
    """
    # FIX: the success icon literal previously contained a stray embedded
    # newline (mojibake of "✅"), which split the status line in the UI;
    # the separator was likewise mojibake ("Β·" for "·").
    display_names = {
        "cuad": "Clause Classifier",
        "ner": "Legal NER",
        "nli": "NLI Contradiction",
    }
    parts = []
    for name, status in _model_status.items():
        # Note: statuses like "loaded (ONNX INT8)" are not exactly "loaded"
        # and render with the neutral icon, matching previous behavior.
        icon = "✅" if status == "loaded" else "⚠️" if "failed" in status else "❌"
        parts.append(f"{icon} {display_names[name]}: {status}")
    return " · ".join(parts)
|
| 453 |
-
|
| 454 |
-
# Load models eagerly at import time (HF Space startup) so the first
# user request does not pay the model-download/initialization cost.
_load_cuad_model()
_load_ner_model()
_load_nli_model()
|
| 458 |
-
|
| 459 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 460 |
-
# 3. DOCUMENT PARSING
|
| 461 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 462 |
-
|
| 463 |
-
def parse_pdf(file_path):
    """Smart PDF parser: native text extraction with OCR fallback for scanned PDFs.

    Delegates to ocr_engine.parse_pdf_smart, which returns
    (text, error, method); returns a (text, error) pair where exactly
    one element is None.
    """
    text, error, method = parse_pdf_smart(file_path)
    if text:
        # Log only when the slower OCR path was actually taken.
        if method == "ocr":
            print(f"[ClauseGuard] PDF extracted via OCR ({len(text)} chars)")
        return text, None
    if error:
        return None, error
    # No text and no specific error: generic user-facing message.
    return None, "Could not extract text from PDF. Try uploading a clearer scan or digital PDF."
|
| 473 |
-
|
| 474 |
-
def parse_docx(file_path):
    """Extract text from a .docx file.

    Non-empty paragraphs are joined with blank lines. Returns
    (text, None) on success, (None, error_message) otherwise.
    """
    if not _HAS_DOCX:
        return None, "DOCX parsing not available (python-docx not installed)"
    try:
        document = DocxDocument(file_path)
        chunks = [para.text for para in document.paragraphs if para.text.strip()]
        return "\n\n".join(chunks), None
    except Exception as e:
        return None, f"DOCX parse error: {e}"
|
| 483 |
-
|
| 484 |
-
def parse_document(file_path):
    """Dispatch *file_path* to the appropriate parser by extension.

    Supports .pdf, .docx/.doc and plain-text (.txt/.md/.rst) files.
    Returns a (text, error) pair where exactly one element is None.
    """
    if file_path is None:
        return None, "No file uploaded"
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        return parse_pdf(file_path)
    if ext in (".docx", ".doc"):
        return parse_docx(file_path)
    if ext in (".txt", ".md", ".rst"):
        try:
            # errors="ignore": tolerate stray bytes rather than failing the upload.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
                return fh.read(), None
        except Exception as e:
            return None, f"Text read error: {e}"
    return None, f"Unsupported file type: {ext}"
|
| 500 |
-
|
| 501 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 502 |
-
# 4. DETERMINISTIC CLAUSE SPLITTING
|
| 503 |
-
# FIX v4.1: Bounded cache (max 500 documents) instead of unbounded dict
|
| 504 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 505 |
-
|
| 506 |
-
_chunk_cache = BoundedCache(maxsize=500)
|
| 507 |
-
|
| 508 |
-
# FIX v4.2: Pre-compile section pattern at module level (was recompiling per call)
|
| 509 |
-
_SECTION_PATTERN = re.compile(
|
| 510 |
-
r'(?:^|\n\n)'
|
| 511 |
-
r'(?='
|
| 512 |
-
r'\d+(?:\.\d+)*[.)]\s' # 1. 2. 3.1. 3.1)
|
| 513 |
-
r'|[A-Z]{2,}[A-Z\s]*\n' # ALL CAPS HEADERS
|
| 514 |
-
r'|\([a-z]\)\s' # (a) (b) (c)
|
| 515 |
-
r'|(?:Section|Article|Clause)\s+\d+' # Section 1, Article 2
|
| 516 |
-
r')',
|
| 517 |
-
re.MULTILINE
|
| 518 |
-
)
|
| 519 |
-
|
| 520 |
-
def split_clauses(text):
    """Deterministically split *text* into clause-sized chunks.

    The whitespace-normalized text is SHA-256 hashed and used as a cache
    key, so identical documents always yield the identical split. When at
    least three structural section markers are found, sections become the
    clause boundaries; otherwise _fallback_split handles the document.
    """
    normalized = re.sub(r'\s+', ' ', text.strip())
    key = hashlib.sha256(normalized.encode()).hexdigest()
    hit = _chunk_cache.get(key)
    if hit is not None:
        return hit

    text = re.sub(r'\n{3,}', '\n\n', text.strip())
    starts = [m.start() for m in _SECTION_PATTERN.finditer(text)]

    if len(starts) < 3:
        # Too few structural markers to trust section-based splitting.
        result = _fallback_split(text)
        _chunk_cache.put(key, result)
        return result

    clauses = []
    bounds = starts + [len(text)]
    for begin, end in zip(starts, bounds[1:]):
        chunk = text[begin:end].strip()
        if len(chunk) <= 30:
            continue  # skip trivially short fragments
        if len(chunk) <= 1500:
            clauses.append(chunk)
            continue
        # Oversized section: re-pack its paragraphs into ~1200-char pieces.
        pending = ""
        for piece in chunk.split('\n\n'):
            if len(pending) + len(piece) < 1200:
                pending = pending + "\n\n" + piece if pending else piece
            else:
                if len(pending.strip()) > 30:
                    clauses.append(pending.strip())
                pending = piece
        if len(pending.strip()) > 30:
            clauses.append(pending.strip())

    # Keep any substantial preamble before the first detected section.
    if starts and starts[0] > 50:
        preamble = text[:starts[0]].strip()
        if len(preamble) > 30:
            clauses.insert(0, preamble)

    result = clauses if clauses else _fallback_split(text)
    _chunk_cache.put(key, result)
    return result
|
| 566 |
-
|
| 567 |
-
def _fallback_split(text):
|
| 568 |
-
"""Fallback: split on paragraph breaks and sentence boundaries."""
|
| 569 |
-
paragraphs = text.split('\n\n')
|
| 570 |
-
if len(paragraphs) >= 3:
|
| 571 |
-
clauses = []
|
| 572 |
-
for p in paragraphs:
|
| 573 |
-
p = p.strip()
|
| 574 |
-
if len(p) > 30:
|
| 575 |
-
if len(p) > 1500:
|
| 576 |
-
sents = re.split(r'(?<=[.!?])\s+(?=[A-Z])', p)
|
| 577 |
-
current = ""
|
| 578 |
-
for s in sents:
|
| 579 |
-
if len(current) + len(s) < 1000:
|
| 580 |
-
current += (" " + s if current else s)
|
| 581 |
-
else:
|
| 582 |
-
if len(current.strip()) > 30:
|
| 583 |
-
clauses.append(current.strip())
|
| 584 |
-
current = s
|
| 585 |
-
if len(current.strip()) > 30:
|
| 586 |
-
clauses.append(current.strip())
|
| 587 |
-
else:
|
| 588 |
-
clauses.append(p)
|
| 589 |
-
return clauses
|
| 590 |
-
|
| 591 |
-
parts = re.split(r'(?<=[.!?])\s+(?=[A-Z0-9(])', text)
|
| 592 |
-
return [p.strip() for p in parts if len(p.strip()) > 30]
|
| 593 |
-
|
| 594 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 595 |
-
# 5. CLAUSE DETECTION
|
| 596 |
-
# FIX v4.1: Use softmax (matching training) instead of sigmoid
|
| 597 |
-
# FIX v4.1: max_length raised to 512 (was 256)
|
| 598 |
-
# FIX v4.1: Bounded prediction cache
|
| 599 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 600 |
-
|
| 601 |
-
_HEADING_RE = re.compile(r'^\d+(?:\.\d+)*\s+[A-Z][A-Z\s&,/]+$', re.MULTILINE)
|
| 602 |
-
|
| 603 |
-
def _strip_heading(text):
|
| 604 |
-
"""Remove leading section headings that confuse the classifier."""
|
| 605 |
-
lines = text.split('\n')
|
| 606 |
-
if lines and _HEADING_RE.match(lines[0].strip()):
|
| 607 |
-
stripped = '\n'.join(lines[1:]).strip()
|
| 608 |
-
return stripped if len(stripped) > 20 else text
|
| 609 |
-
return text
|
| 610 |
-
|
| 611 |
-
_LABEL_GUARDRAILS = {
|
| 612 |
-
"Liquidated Damages": re.compile(
|
| 613 |
-
r'liquidated|pre-?determined.{0,10}damage|agreed.{0,10}sum|penalty clause|stipulated.{0,10}damage',
|
| 614 |
-
re.IGNORECASE
|
| 615 |
-
),
|
| 616 |
-
"Uncapped Liability": re.compile(
|
| 617 |
-
r'uncapped|unlimited.{0,10}liabilit|no.{0,10}(limit|cap).{0,10}liabilit',
|
| 618 |
-
re.IGNORECASE
|
| 619 |
-
),
|
| 620 |
-
}
|
| 621 |
-
|
| 622 |
-
def _apply_guardrails(label, text, confidence):
|
| 623 |
-
guard = _LABEL_GUARDRAILS.get(label)
|
| 624 |
-
if guard and not guard.search(text):
|
| 625 |
-
return "Other", confidence * 0.3
|
| 626 |
-
return label, confidence
|
| 627 |
-
|
| 628 |
-
def _text_hash(text):
|
| 629 |
-
return hashlib.md5(text.encode()).hexdigest()
|
| 630 |
-
|
| 631 |
-
# FIX v4.1: Bounded prediction cache
|
| 632 |
-
_prediction_cache = BoundedCache(maxsize=2000)
|
| 633 |
-
|
| 634 |
-
def classify_cuad(clause_text):
    """Classify a single clause against the 41 CUAD labels.

    Returns a list of result dicts (label, confidence, risk, description,
    source), sorted by confidence. Falls back to _classify_regex when no
    model is loaded, when the model finds nothing above threshold, or on
    any inference error. Results are memoized in _prediction_cache keyed
    by an MD5 of the heading-stripped text prefix.
    """
    if cuad_model is None or cuad_tokenizer is None:
        return _classify_regex(clause_text)

    # Headings like "3.1 LIMITATION OF LIABILITY" confuse the classifier.
    clean_text = _strip_heading(clause_text)

    # Cache key uses only the first 512 chars — matching the tokenizer limit.
    h = _text_hash(clean_text[:512])
    cached = _prediction_cache.get(h)
    if cached is not None:
        return cached

    try:
        # FIX v4.1: max_length=512 (was 256 — truncating long legal clauses)
        inputs = cuad_tokenizer(
            clean_text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True
        )
        with torch.no_grad():
            logits = cuad_model(**inputs).logits

        # FIX v4.1: Use softmax (matching single-label cross-entropy training)
        # The model was trained with F.cross_entropy, so softmax is correct.
        probs = torch.softmax(logits, dim=-1)[0]

        # Get the top prediction
        top_prob, top_idx = torch.max(probs, dim=0)
        top_idx = int(top_idx)
        top_conf = float(top_prob)

        results = []

        # Primary prediction: accept only above the per-class threshold,
        # then run keyword guardrails (note: against the ORIGINAL text).
        threshold = _CUAD_THRESHOLDS.get(top_idx, 0.40)
        if top_conf > threshold and top_idx < len(CUAD_LABELS):
            label = CUAD_LABELS[top_idx]
            conf = top_conf
            label, conf = _apply_guardrails(label, clause_text, conf)
            # Drop guardrail-demoted predictions ("Other" with crushed confidence).
            if not (label == "Other" and conf < 0.3):
                risk = RISK_MAP.get(label, "LOW")
                results.append({
                    "label": label,
                    "confidence": round(conf, 3),
                    "risk": risk,
                    "description": DESC_MAP.get(label, label),
                    "source": "ml",
                })

        # Also check 2nd-best prediction if confident enough — a clause can
        # legitimately carry two categories (e.g. Governing Law + Jurisdiction).
        if len(probs) > 1:
            sorted_probs, sorted_indices = torch.sort(probs, descending=True)
            if len(sorted_probs) > 1:
                second_idx = int(sorted_indices[1])
                second_conf = float(sorted_probs[1])
                second_threshold = _CUAD_THRESHOLDS.get(second_idx, 0.40)
                if second_conf > second_threshold and second_idx < len(CUAD_LABELS):
                    label2 = CUAD_LABELS[second_idx]
                    conf2 = second_conf
                    label2, conf2 = _apply_guardrails(label2, clause_text, conf2)
                    if not (label2 == "Other" and conf2 < 0.3):
                        # Only add if different from primary
                        if not results or results[0]["label"] != label2:
                            risk2 = RISK_MAP.get(label2, "LOW")
                            results.append({
                                "label": label2,
                                "confidence": round(conf2, 3),
                                "risk": risk2,
                                "description": DESC_MAP.get(label2, label2),
                                "source": "ml",
                            })

        results.sort(key=lambda x: x["confidence"], reverse=True)

        # If no ML results, also try regex to catch what model misses
        if not results:
            results = _classify_regex(clause_text)

        # Cache even regex-derived results — the input text is the same.
        _prediction_cache.put(h, results)
        return results
    except Exception as e:
        print(f"[ClauseGuard] CUAD inference error: {e}")
        return _classify_regex(clause_text)
|
| 718 |
-
|
| 719 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 720 |
-
# 5b. BATCHED CLAUSE CLASSIFICATION
|
| 721 |
-
# PERF v4.3: Single forward pass for all clauses instead of one-by-one
|
| 722 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 723 |
-
|
| 724 |
-
def _predict_from_probs(clause_probs, original_text):
    """Turn one clause's softmax distribution into labelled predictions.

    Applies the per-label confidence threshold and guardrails to the top-1
    and top-2 classes, mirroring the single-clause classifier's behavior.
    Returns a list of result dicts sorted by confidence (may be empty when
    the model abstains on both candidates).
    """
    results = []

    # Primary prediction: highest-probability class, gated by its threshold.
    top_prob, top_idx = torch.max(clause_probs, dim=0)
    top_idx_int = int(top_idx)
    top_conf = float(top_prob)
    threshold = _CUAD_THRESHOLDS.get(top_idx_int, 0.40)
    if top_conf > threshold and top_idx_int < len(CUAD_LABELS):
        label, conf = _apply_guardrails(CUAD_LABELS[top_idx_int], original_text, top_conf)
        if not (label == "Other" and conf < 0.3):
            results.append({
                "label": label,
                "confidence": round(conf, 3),
                "risk": RISK_MAP.get(label, "LOW"),
                "description": DESC_MAP.get(label, label),
                "source": "ml",
            })

    # 2nd-best prediction: only kept when confident AND distinct from primary.
    sorted_probs, sorted_indices = torch.sort(clause_probs, descending=True)
    if len(sorted_probs) > 1:
        second_idx = int(sorted_indices[1])
        second_conf = float(sorted_probs[1])
        second_threshold = _CUAD_THRESHOLDS.get(second_idx, 0.40)
        if second_conf > second_threshold and second_idx < len(CUAD_LABELS):
            label2, conf2 = _apply_guardrails(CUAD_LABELS[second_idx], original_text, second_conf)
            if not (label2 == "Other" and conf2 < 0.3):
                if not results or results[0]["label"] != label2:
                    results.append({
                        "label": label2,
                        "confidence": round(conf2, 3),
                        "risk": RISK_MAP.get(label2, "LOW"),
                        "description": DESC_MAP.get(label2, label2),
                        "source": "ml",
                    })

    results.sort(key=lambda x: x["confidence"], reverse=True)
    return results


def classify_cuad_batch(clauses, batch_size=8):
    """Classify a batch of clauses with one forward pass per mini-batch.

    PERF v4.3: replaces the sequential classify_cuad() loop; on CPU,
    batch_size=8 balances memory vs throughput.

    Args:
        clauses: list of raw clause strings, in document order.
        batch_size: number of clauses per tokenizer/model forward pass.

    Returns:
        A list the same length and order as ``clauses``; each element is a
        list of prediction dicts (possibly from the regex fallback).
    """
    if cuad_model is None or cuad_tokenizer is None:
        # No ML model loaded — regex fallback for every clause.
        return [_classify_regex(c) for c in clauses]

    # FIX: results are keyed by clause index. The previous implementation
    # kept a list of (idx, placeholder) tuples and located each one with a
    # linear scan per clause — O(n^2) overall for large documents.
    results_by_index = {}
    uncached_indices = []
    uncached_texts = []
    for i, clause in enumerate(clauses):
        clean = _strip_heading(clause)
        cached = _prediction_cache.get(_text_hash(clean[:512]))
        if cached is not None:
            results_by_index[i] = cached
        else:
            uncached_indices.append(i)
            uncached_texts.append(clean)

    # Run the model only over cache misses, in mini-batches.
    for batch_start in range(0, len(uncached_texts), batch_size):
        batch_texts = uncached_texts[batch_start:batch_start + batch_size]
        batch_original = [clauses[uncached_indices[batch_start + j]] for j in range(len(batch_texts))]
        try:
            inputs = cuad_tokenizer(
                batch_texts,
                return_tensors="pt",
                truncation=True,
                max_length=512,
                padding=True,
            )
            with torch.no_grad():
                logits = cuad_model(**inputs).logits
            probs = torch.softmax(logits, dim=-1)

            for j in range(len(batch_texts)):
                results = _predict_from_probs(probs[j], batch_original[j])
                if not results:
                    # Model abstained — also try regex to catch what it misses.
                    results = _classify_regex(batch_original[j])
                _prediction_cache.put(_text_hash(batch_texts[j][:512]), results)
                results_by_index[uncached_indices[batch_start + j]] = results
        except Exception as e:
            print(f"[ClauseGuard] Batch CUAD inference error: {e}")
            # Fallback to regex for this batch only; later batches still run.
            for j in range(len(batch_texts)):
                results_by_index[uncached_indices[batch_start + j]] = _classify_regex(batch_original[j])

    return [results_by_index[i] for i in range(len(clauses))]
|
| 842 |
-
|
| 843 |
-
# FIX v4.1: Extended regex patterns to cover more CUAD categories
|
| 844 |
-
# Pattern-based fallback taxonomy: CUAD-style label -> list of regex strings.
# Used when the ML classifier is unavailable or abstains on a clause.
_REGEX_PATTERNS = {
    "Limitation of liability": [r"not liable", r"shall not be (liable|responsible)", r"in no event.*liable", r"limitation of liability", r"without warranty", r"disclaim"],
    "Unilateral termination": [r"terminat.*at any time", r"suspend.*account.*without", r"we may (terminat|suspend|discontinu)", r"right to (terminat|suspend)"],
    "Unilateral change": [r"sole discretion", r"reserves? the right to (modify|change|update|amend)", r"at any time.*without (prior )?notice", r"we may (modify|change|update)"],
    "Content removal": [r"remove.*content.*without", r"right to remove", r"we may.*remove"],
    "Contract by using": [r"by (using|accessing).*you agree", r"continued use.*constitutes? acceptance"],
    "Choice of law": [r"governed by.*laws? of", r"shall be governed", r"laws of the state of"],
    "Jurisdiction": [r"exclusive jurisdiction", r"courts? of.*(california|delaware|new york|ireland|england)", r"submit to.*jurisdiction"],
    "Arbitration": [r"arbitrat", r"binding arbitration", r"waive.*right.*court", r"class action waiver"],
    "Governing Law": [r"governed by", r"laws of", r"jurisdiction of"],
    "Termination for Convenience": [r"terminat.*for convenience", r"terminat.*without cause", r"terminat.*at any time"],
    "Non-Compete": [r"non-compete", r"shall not compete", r"competition restriction"],
    "Exclusivity": [r"exclusive(?:ly)?(?:\s+(?:deal|relationship|partner|right))", r"exclusivity"],
    "IP Ownership Assignment": [r"assign.*intellectual property", r"ownership of.*ip", r"all rights.*assign", r"work.?for.?hire"],
    "Uncapped Liability": [r"unlimited liability", r"uncapped", r"no.*limit.*liability"],
    "Cap on Liability": [r"cap on liability", r"maximum liability", r"liability.*shall not exceed", r"aggregate liability.*not exceed"],
    "Indemnification": [r"indemnif", r"hold harmless", r"defend.*against.*claim"],
    "Confidentiality": [r"confidential(?:ity)?", r"non-disclosure", r"\bnda\b"],
    "Force Majeure": [r"force majeure", r"act of god", r"beyond.*(?:reasonable\s+)?control"],
    "Penalties": [r"penalt(?:y|ies)", r"late fee", r"default charge", r"interest on overdue"],
    # FIX v4.1: Added missing regex patterns for more CUAD categories
    "Audit Rights": [r"audit rights?", r"right to audit", r"inspect.*records?", r"examination of.*records?", r"access to.*books"],
    "Warranty Duration": [r"warrant(?:y|ies).*(?:period|duration|term|months?|years?)", r"warranty.*shall.*(?:remain|last|continue)", r"limited warranty"],
    "Insurance": [r"(?:shall|must).*maintain.*insurance", r"insurance.*coverage", r"policy of insurance", r"certificate of insurance"],
    "Source Code Escrow": [r"source code escrow", r"escrow.*source code", r"escrow agent"],
    "Post-Termination Services": [r"post.?termination.*(?:service|obligation|support)", r"(?:after|following|upon).*termination.*(?:shall|must|will).*(?:provide|continue)"],
    "Renewal Term": [r"renew(?:al)?.*term", r"auto(?:matic(?:ally)?)?.*renew", r"successive.*(?:term|period)"],
    "Notice Period to Terminate Renewal": [r"notice.*(?:to\s+)?terminat.*renew", r"(?:days?|months?).*(?:prior|advance).*(?:notice|written).*(?:terminat|renew)", r"notice of non.?renewal"],
    "Change of Control": [r"change of control", r"change in.*(?:ownership|control)", r"merger.*acquisition", r"sale of.*(?:all|substantially).*assets"],
    "Anti-Assignment": [r"(?:shall|may)\s+not\s+assign", r"anti.?assignment", r"no.*assignment.*without.*consent"],
    "Revenue/Profit Sharing": [r"revenue.*shar", r"profit.*shar", r"royalt(?:y|ies)"],
    "Liquidated Damages": [r"liquidated.*damages?", r"pre.?determined.*damage", r"stipulated.*damage"],
    "Covenant Not to Sue": [r"covenant not to sue", r"(?:shall|agree).*not.*(?:bring|file|commence).*(?:action|claim|suit)"],
    "Joint IP Ownership": [r"joint(?:ly)?.*own(?:ed|ership)?.*(?:ip|intellectual property)", r"co.?own(?:ed|ership)?"],
    "License Grant": [r"(?:grant|license).*(?:non.?exclusive|exclusive|perpetual|irrevocable).*(?:license|right)", r"hereby grants?.*license"],
    "Non-Transferable License": [r"non.?transferable.*license", r"license.*(?:shall|may)\s+not.*(?:transfer|assign|sublicense)"],
    "ROFR/ROFO/ROFN": [r"right of first.*(?:refusal|offer|negotiation)", r"ROFR", r"ROFO", r"ROFN"],
    "No-Solicit of Customers": [r"(?:shall|must|agree).*not.*solicit.*customer", r"no.?solicit.*customer", r"non.?solicitation.*customer"],
    "No-Solicit of Employees": [r"(?:shall|must|agree).*not.*solicit.*employee", r"no.?solicit.*employee", r"non.?solicitation.*employee", r"no.?hire"],
    "Non-Disparagement": [r"non.?disparagement", r"(?:shall|must|agree).*not.*(?:disparag|defam|make.*negative)", r"not.*make.*derogatory"],
    "Most Favored Nation": [r"most favou?red.*nation", r"MFN", r"most favou?red.*(?:customer|pricing|terms)"],
    "Third Party Beneficiary": [r"third.?party.*beneficiar", r"no.*third.?party.*beneficiar"],
    "Minimum Commitment": [r"minimum.*(?:commitment|purchase|order|volume|spend)", r"(?:shall|must).*(?:purchase|order).*(?:at least|minimum|no less than)"],
    "Volume Restriction": [r"volume.*(?:restriction|limitation|cap|ceiling)", r"(?:shall|may).*not.*exceed.*(?:volume|quantity)"],
    "Price Restriction": [r"price.*(?:restriction|limitation|ceiling|cap|floor)", r"(?:shall|may).*not.*(?:increase|raise|exceed).*price"],
}

# FIX v4.2: Pre-compile regex patterns at module level (was recompiling per call)
_REGEX_PATTERNS_COMPILED = {
    label: [re.compile(p, re.IGNORECASE) for p in pats]
    for label, pats in _REGEX_PATTERNS.items()
}
|
| 895 |
-
|
| 896 |
-
def _classify_regex(text):
    """Regex fallback — returns pattern match, NOT fake confidence.

    Emits at most one result per label (the first pattern that hits),
    with ``confidence`` left as None to signal a non-ML source.
    """
    lowered = text.lower()
    matches = []
    for label, compiled_patterns in _REGEX_PATTERNS_COMPILED.items():
        # Labels are dict keys, hence unique — one hit per label suffices.
        if any(p.search(lowered) for p in compiled_patterns):
            matches.append({
                "label": label,
                "confidence": None,
                "risk": RISK_MAP.get(label, "MEDIUM"),
                "description": DESC_MAP.get(label, label),
                "source": "pattern",
            })
    return matches
|
| 916 |
-
|
| 917 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 918 |
-
# 6. LEGAL NER β ML model with regex fallback
|
| 919 |
-
# FIX v4.1: Batch all chunks in single pipeline call
|
| 920 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 921 |
-
|
| 922 |
-
def extract_entities(text):
    """Extract entities using ML model (matterstack/legal-bert-ner) with regex fallback.

    Runs the NER pipeline over overlapping windows (batched in one call),
    supplements with regex hits on spans the model did not cover, then keeps
    a non-overlapping, position-sorted subset.
    """
    entities = []

    if _HAS_NER_MODEL and ner_pipeline is not None:
        try:
            # Overlapping 512-char windows every 450 chars, batched in a
            # SINGLE pipeline call (FIX v4.1) over at most the first 10k chars.
            limit = min(len(text), 10000)
            starts = list(range(0, limit, 450))
            windows = [text[s:s + 512] for s in starts]
            batched_preds = ner_pipeline(windows, batch_size=8)

            for start, preds in zip(starts, batched_preds):
                for ent in preds:
                    if ent.get("score", 0) > 0.5:
                        entities.append({
                            "text": ent["word"],
                            "type": _map_ner_label(ent.get("entity_group", ent.get("entity", "MISC"))),
                            # Re-anchor window-relative offsets to the full text.
                            "start": ent["start"] + start,
                            "end": ent["end"] + start,
                            "score": round(ent["score"], 3),
                            "source": "ml",
                        })
        except Exception as e:
            print(f"[ClauseGuard] ML NER error, falling back to regex: {e}")
            entities = _extract_entities_regex(text)
    else:
        entities = _extract_entities_regex(text)

    # Always supplement with regex patterns for things NER often misses,
    # but only where no ML entity already covers the span.
    covered = set()
    for e in entities:
        covered.update(range(e["start"], e["end"]))
    for candidate in _extract_entities_regex(text):
        if not any(pos in covered for pos in range(candidate["start"], candidate["end"])):
            entities.append(candidate)

    # Keep a non-overlapping subset, preferring earlier + longer spans.
    entities.sort(key=lambda x: (x["start"], -(x["end"] - x["start"])))
    kept = []
    last_end = -1
    for e in entities:
        if e["start"] >= last_end:
            kept.append(e)
            last_end = e["end"]
    return kept
|
| 973 |
-
|
| 974 |
-
def _map_ner_label(label):
    """Translate raw NER tag names into ClauseGuard entity types.

    Unknown tags pass through unchanged (upper-cased).
    """
    canonical = {
        "PER": "PERSON", "PERSON": "PERSON",
        "ORG": "PARTY", "ORGANIZATION": "PARTY",
        "LOC": "JURISDICTION", "LOCATION": "JURISDICTION",
        "GPE": "JURISDICTION", "DATE": "DATE",
        "MONEY": "MONEY", "MISC": "MISC", "LAW": "LEGAL_REF",
    }
    upper = label.upper()
    return canonical.get(upper, upper)
|
| 984 |
-
|
| 985 |
-
def _extract_entities_regex(text):
    """Regex-based NER fallback.

    Date/money/duration/percentage patterns match case-insensitively;
    party, jurisdiction and defined-term patterns rely on capitalization.
    """
    patterns = [
        (r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b', "DATE"),
        (r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', "DATE"),
        (r'\b\d{1,2}-(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\d{2,4}\b', "DATE"),
        (r'\b(?:Effective|Commencement|Expiration|Termination)\s+Date\b', "DATE_REF"),
        (r'\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?(?:\s*(?:million|billion|thousand|M|B|K))?', "MONEY"),
        (r'\b\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s*(?:USD|EUR|GBP|dollars|euros|pounds)', "MONEY"),
        (r'\b(?:USD|EUR|GBP)\s*\d{1,3}(?:,\d{3})*(?:\.\d{2})?', "MONEY"),
        (r'\b\d+(?:\.\d+)?%', "PERCENTAGE"),
        (r'\b\d+\s*(?:year|month|week|day|business day)s?\b', "DURATION"),
        (r'\b[A-Z][A-Za-z0-9\s&,]+?(?:Inc\.?|LLC|Ltd\.?|Limited|Corp\.?|Corporation|PLC|GmbH|AG|S\.A\.?|B\.V\.?|L\.P\.?|LLP)\b', "PARTY"),
        (r'\b(?:Party A|Party B|Disclosing Party|Receiving Party|Licensor|Licensee|Buyer|Seller|Tenant|Landlord|Employer|Employee|Customer|Vendor|Client)\b', "PARTY_ROLE"),
        (r'\b(?:State|Commonwealth)\s+of\s+[A-Z][a-zA-Z\s]+', "JURISDICTION"),
        (r'\b(?:California|Delaware|New York|Texas|Florida|England|Ireland|Germany|France|Singapore|Hong Kong|Ontario|British Columbia)\b', "JURISDICTION"),
        (r'"([A-Z][A-Za-z\s]{1,40})"', "DEFINED_TERM"),
        (r'\((?:the\s+)?"([A-Z][A-Za-z\s]{1,40})"\)', "DEFINED_TERM"),
    ]
    found = []
    for pattern, etype in patterns:
        flags = re.IGNORECASE if etype in ("DATE", "MONEY", "DURATION", "PERCENTAGE") else 0
        for m in re.finditer(pattern, text, flags):
            found.append({
                # Prefer the capture group (defined terms) over the full match.
                "text": m.group(1) if m.lastindex else m.group(),
                "type": etype,
                "start": m.start(),
                "end": m.end(),
                "source": "pattern",
            })
    return found
|
| 1016 |
-
|
| 1017 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1018 |
-
# 7. NLI / CONTRADICTION DETECTION
|
| 1019 |
-
# FIX v4.1: Pass (text_a, text_b) as dict with proper keys for
|
| 1020 |
-
# cross-encoder pipeline, not [SEP]-concatenated string
|
| 1021 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1022 |
-
|
| 1023 |
-
def _run_nli(text_a, text_b):
    """Run NLI over a single text pair via the CrossEncoder.

    FIX v4.2: uses sentence_transformers.CrossEncoder.predict(), which takes
    a list of (text_a, text_b) tuples and returns per-pair scores over
    [contradiction, entailment, neutral]. Returns a one-element list of
    {"label", "score"} for the winning class, or None on failure.
    """
    try:
        # Inputs are truncated to 256 chars each to bound inference cost.
        pair_scores = nli_model.predict([(text_a[:256], text_b[:256])])
        class_names = ["contradiction", "entailment", "neutral"]
        best = int(pair_scores[0].argmax())
        return [{"label": class_names[best], "score": float(pair_scores[0][best])}]
    except Exception as e:
        print(f"[ClauseGuard] NLI inference error: {e}")
        return None
|
| 1039 |
-
|
| 1040 |
-
|
| 1041 |
-
# FIX: compiled once at import time, consistent with the module's v4.2
# convention of pre-compiling all regex patterns (was rebuilt on every call).
_REQUIRED_CLAUSE_PATTERNS = {
    "Governing Law": re.compile(
        r'govern(?:ed|ing).{0,15}law|applicable.{0,10}law|laws?\s+of\s+the\s+state',
        re.IGNORECASE
    ),
    "Limitation of liability": re.compile(
        r'limitation.{0,10}liabilit|cap.{0,10}liabilit|liabilit.{0,10}shall\s+not\s+exceed|in\s+no\s+event.{0,20}liable',
        re.IGNORECASE
    ),
    "Arbitration": re.compile(
        r'arbitrat|AAA|JAMS|binding.{0,10}dispute',
        re.IGNORECASE
    ),
    "Termination": re.compile(
        r'terminat(?:e|ion|ed)|cancel(?:lation)?',
        re.IGNORECASE
    ),
}


def _nli_contradiction_score(text_a, text_b, threshold=0.6):
    """Return the rounded contradiction confidence for a text pair, or None.

    None means the NLI call failed or no contradiction exceeded ``threshold``.
    """
    nli_result = _run_nli(text_a, text_b)
    if nli_result is None:
        return None
    for r in (nli_result if isinstance(nli_result, list) else [nli_result]):
        if r.get("label", "").lower() == "contradiction" and r.get("score", 0) > threshold:
            return round(r["score"], 3)
    return None


def detect_contradictions(clause_results, raw_text=""):
    """
    Detect contradictions using:
    1. NLI cross-encoder model (semantic contradiction detection)
    2. Structural conflict detection (mutually exclusive labels)
    3. Missing critical clause detection

    Returns a deduplicated list of finding dicts with "type", "explanation",
    "severity", "clauses" and "source" keys (plus "confidence" for NLI hits).
    """
    contradictions = []
    labels_found = set()
    clause_texts_by_label = defaultdict(list)

    for cr in clause_results:
        labels_found.add(cr["label"])
        clause_texts_by_label[cr["label"]].append(cr.get("text", ""))

    # ── 1. Semantic NLI (if model available) ──
    if _HAS_NLI_MODEL and nli_model is not None:
        conflict_pairs = [
            ("Uncapped Liability", "Cap on Liability",
             "Liability cannot be both uncapped and capped simultaneously."),
            ("IP Ownership Assignment", "Joint IP Ownership",
             "IP cannot be both fully assigned and jointly owned."),
            ("Exclusivity", "Non-Transferable License",
             "Exclusivity and non-transferable license may conflict."),
        ]
        for label_a, label_b, explanation in conflict_pairs:
            if label_a in labels_found and label_b in labels_found:
                # Limit to the first two clause texts per label to bound cost.
                for ta in clause_texts_by_label[label_a][:2]:
                    for tb in clause_texts_by_label[label_b][:2]:
                        confidence = _nli_contradiction_score(ta, tb)
                        if confidence is not None:
                            contradictions.append({
                                "type": "CONTRADICTION",
                                "explanation": explanation,
                                "severity": "HIGH",
                                "clauses": [label_a, label_b],
                                "confidence": confidence,
                                "source": "nli_model",
                            })

        # Also check for internal contradictions within governing law / termination
        for label in ["Governing Law", "Termination for Convenience"]:
            texts = clause_texts_by_label.get(label, [])
            if len(texts) >= 2:
                for i in range(len(texts)):
                    for j in range(i + 1, min(len(texts), i + 3)):
                        confidence = _nli_contradiction_score(texts[i], texts[j])
                        if confidence is not None:
                            contradictions.append({
                                "type": "CONTRADICTION",
                                "explanation": f"Conflicting {label} provisions detected — clauses contradict each other.",
                                "severity": "HIGH",
                                "clauses": [label],
                                "confidence": confidence,
                                "source": "nli_model",
                            })
    else:
        # ── Heuristic fallback (improved) ──
        _heuristic_pairs = [
            (["Uncapped Liability"], ["Cap on Liability"],
             "Liability cannot be both uncapped and capped simultaneously."),
            (["IP Ownership Assignment"], ["Joint IP Ownership"],
             "IP cannot be both fully assigned and jointly owned."),
        ]
        for group_a, group_b, explanation in _heuristic_pairs:
            found_a = any(l in labels_found for l in group_a)
            found_b = any(l in labels_found for l in group_b)
            if found_a and found_b:
                contradictions.append({
                    "type": "CONTRADICTION",
                    "explanation": explanation,
                    "severity": "HIGH",
                    "clauses": group_a + group_b,
                    "source": "heuristic",
                })

    # ── 2. Missing critical clauses ──
    for clause_name, pattern in _REQUIRED_CLAUSE_PATTERNS.items():
        if not pattern.search(raw_text):
            contradictions.append({
                "type": "MISSING",
                "explanation": f"No '{clause_name}' clause detected in the document.",
                "severity": "MEDIUM",
                "clauses": [clause_name],
                "source": "structural",
            })

    # Deduplicate by (type, explanation), preserving first occurrence.
    seen = set()
    unique = []
    for c in contradictions:
        key = (c["type"], c["explanation"])
        if key not in seen:
            seen.add(key)
            unique.append(c)

    return unique
|
| 1165 |
-
|
| 1166 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1167 |
-
# 8. RISK SCORING
|
| 1168 |
-
# FIX v4.1: Absolute risk based on findings, not normalized by doc length
|
| 1169 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1170 |
-
|
| 1171 |
-
def compute_risk_score(clause_results, total_clauses):
    """Compute an absolute document risk score.

    FIX v4.1: risk is absolute, not normalized by document length — a
    200-clause doc with 5 critical findings is just as dangerous as a
    10-clause doc with 5 critical findings.

    Args:
        clause_results: list of dicts each carrying a "risk" severity.
        total_clauses: total clauses in the document (0 => empty doc).

    Returns:
        (score 0-100, letter grade "A".."F", severity-count breakdown dict)
    """
    sev_counts = {"CRITICAL": 0, "HIGH": 0, "MEDIUM": 0, "LOW": 0}
    for cr in clause_results:
        sev = cr.get("risk", "LOW")
        # FIX: an unexpected severity value used to raise KeyError here;
        # treat unknown labels conservatively as LOW instead of crashing.
        if sev not in sev_counts:
            sev = "LOW"
        sev_counts[sev] += 1
    if total_clauses == 0:
        return 0, "A", sev_counts

    weighted = sum(sev_counts[s] * RISK_WEIGHTS[s] for s in sev_counts)

    # Diminishing returns formula: starts linear, flattens near 100
    # max theoretical = 100, one CRITICAL finding = ~30, two = ~48, five = ~72
    risk = min(100, round(100 * (1 - (1 / (1 + weighted / 30)))))

    if risk >= 70:
        grade = "F"
    elif risk >= 50:
        grade = "D"
    elif risk >= 30:
        grade = "C"
    elif risk >= 15:
        grade = "B"
    else:
        grade = "A"
    return risk, grade, sev_counts
|
| 1194 |
-
|
| 1195 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1196 |
-
# 9. MAIN ANALYSIS PIPELINE
|
| 1197 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1198 |
-
|
| 1199 |
-
def analyze_contract(text):
    """Run the full analysis pipeline over raw contract text.

    Returns (result_dict, None) on success, or (None, error_message)
    when the input is too short or yields no clauses.
    """
    if not text or len(text.strip()) < 50:
        return None, "Document too short (minimum 50 characters)"
    clauses = split_clauses(text)
    if not clauses:
        return None, "No clauses detected in document"

    # PERF v4.3: Batch classification — single forward pass instead of per-clause
    batch_predictions = classify_cuad_batch(clauses, batch_size=8)

    # Flatten to one record per (clause, prediction) pair.
    clause_results = []
    for clause, predictions in zip(clauses, batch_predictions):
        for pred in (predictions or []):
            clause_results.append({
                "text": clause,
                "label": pred["label"],
                "confidence": pred["confidence"],
                "risk": pred["risk"],
                "description": pred["description"],
                "source": pred.get("source", "unknown"),
            })

    entities = extract_entities(text)
    contradictions = detect_contradictions(clause_results, text)
    risk, grade, sev_counts = compute_risk_score(clause_results, len(clauses))
    obligations = extract_obligations(text)
    compliance = check_compliance(text)

    flagged_clause_count = len(clause_results)
    unique_flagged_texts = len({cr["text"] for cr in clause_results})

    result = {
        "metadata": {
            "analysis_date": datetime.now().isoformat(),
            "total_clauses": len(clauses),
            "flagged_clauses": flagged_clause_count,
            "unique_flagged": unique_flagged_texts,
            "model": get_model_status_text(),
            # Whitespace-normalized fingerprint of the input text.
            "text_hash": hashlib.sha256(re.sub(r'\s+', ' ', text.strip()).encode()).hexdigest()[:16],
        },
        "risk": {
            "score": risk,
            "grade": grade,
            "breakdown": sev_counts,
        },
        "clauses": clause_results,
        "entities": entities,
        "contradictions": contradictions,
        "obligations": obligations,
        "compliance": compliance,
        "raw_text": text,
    }
    return result, None
|
| 1252 |
-
|
| 1253 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1254 |
-
# 10. EXPORT FUNCTIONS
|
| 1255 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1256 |
-
|
| 1257 |
-
def export_json(result):
    """Serialize an analysis result as pretty-printed JSON (None passes through)."""
    if result is None:
        return None
    # default=str keeps non-JSON-native values (e.g. datetimes) serializable.
    return json.dumps(result, indent=2, default=str)
|
| 1261 |
-
|
| 1262 |
-
def export_csv(result):
    """Render flagged clauses as CSV text; returns None for a missing result.

    Confidence is formatted to three decimals, or "pattern match" for
    regex-sourced findings that carry no model confidence.
    """
    if result is None:
        return None
    buffer = io.StringIO()
    writer = csv.writer(buffer)
    writer.writerow(["Clause Text", "Label", "Risk", "Confidence", "Description", "Source"])
    for cr in result.get("clauses", []):
        conf = cr.get("confidence")
        writer.writerow([
            cr.get("text", "")[:500],  # clip very long clause texts
            cr.get("label", ""),
            cr.get("risk", ""),
            "pattern match" if conf is None else f"{conf:.3f}",
            cr.get("description", ""),
            cr.get("source", ""),
        ])
    return buffer.getvalue()
|
| 1280 |
-
|
| 1281 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1282 |
-
# 11. UI RENDERING
|
| 1283 |
-
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1284 |
-
|
| 1285 |
-
def render_summary(result):
    """Render the risk summary panel as an HTML card; empty string when no result."""
    if result is None:
        return ""
    risk_info = result["risk"]
    score = risk_info["score"]
    grade = risk_info["grade"]
    breakdown = risk_info["breakdown"]
    # Grade -> accent color; unknown grades fall back to neutral gray.
    palette = {
        "A": "#16a34a", "B": "#65a30d", "C": "#ca8a04",
        "D": "#ea580c", "F": "#dc2626",
    }
    grade_color = palette.get(grade, "#6b7280")
    crit = breakdown["CRITICAL"]
    high = breakdown["HIGH"]
    med = breakdown["MEDIUM"]
    low = breakdown["LOW"]
    return f"""
    <div style="font-family:system-ui,sans-serif;padding:16px;border:1px solid #e5e7eb;border-radius:12px;background:#fff;">
      <div style="text-align:center;margin-bottom:16px;">
        <div style="font-size:48px;font-weight:700;color:{grade_color};">{score}</div>
        <div style="font-size:14px;color:#6b7280;">/100 Risk Score</div>
        <div style="display:inline-block;margin-top:8px;padding:4px 16px;border-radius:20px;background:{grade_color};color:white;font-weight:600;font-size:14px;">
          Grade {grade}
        </div>
      </div>
      <div style="display:grid;grid-template-columns:1fr 1fr;gap:8px;margin-bottom:12px;">
        <div style="padding:8px;border-radius:6px;background:#fef2f2;text-align:center;">
          <div style="font-size:20px;font-weight:700;color:#dc2626;">{crit}</div>
          <div style="font-size:11px;color:#991b1b;">Critical</div>
        </div>
        <div style="padding:8px;border-radius:6px;background:#fff7ed;text-align:center;">
          <div style="font-size:20px;font-weight:700;color:#ea580c;">{high}</div>
          <div style="font-size:11px;color:#9a3412;">High</div>
        </div>
        <div style="padding:8px;border-radius:6px;background:#fefce8;text-align:center;">
          <div style="font-size:20px;font-weight:700;color:#ca8a04;">{med}</div>
          <div style="font-size:11px;color:#854d0e;">Medium</div>
        </div>
        <div style="padding:8px;border-radius:6px;background:#f0fdf4;text-align:center;">
          <div style="font-size:20px;font-weight:700;color:#16a34a;">{low}</div>
          <div style="font-size:11px;color:#166534;">Low</div>
        </div>
      </div>
      <div style="font-size:12px;color:#6b7280;text-align:center;">
        {result['metadata']['total_clauses']} clauses analyzed · {result['metadata']['flagged_clauses']} flagged
        <br><span style="font-size:10px;">{result['metadata']['model']}</span>
      </div>
    </div>
    """
|
| 1331 |
-
|
| 1332 |
-
def render_clause_cards(result):
    """Render flagged clauses as HTML cards, one card per distinct clause text.

    Each card is styled by the most severe risk among its labels and shows
    one tag per label (ML confidence or "pattern" for regex hits) plus the
    per-label descriptions.

    Args:
        result: Analysis dict containing a "clauses" list, or None.

    Returns:
        HTML string ("" when result is None, a placeholder div when no
        clauses were detected).
    """
    if result is None:
        return ""
    clauses = result.get("clauses", [])
    if not clauses:
        return '<div style="padding:24px;text-align:center;color:#6b7280;">No clauses detected.</div>'
    # Group multiple label hits on the same clause text into a single card.
    grouped = defaultdict(list)
    for cr in clauses:
        grouped[cr["text"]].append(cr)
    html = '<div style="font-family:system-ui,sans-serif;">'
    for text, items in grouped.items():
        # Card border/icon follow the most severe risk among the labels.
        max_risk = max(items, key=lambda x: {"CRITICAL": 4, "HIGH": 3, "MEDIUM": 2, "LOW": 1}[x["risk"]])["risk"]
        border, bg, icon = RISK_STYLES[max_risk]
        tags = ""
        for item in items:
            tag_bg = RISK_STYLES[item["risk"]][1]
            tag_color = RISK_STYLES[item["risk"]][0]
            conf = item.get("confidence")
            source = item.get("source", "")
            # ML detections carry a confidence; regex hits render as "pattern".
            if conf is not None:
                conf_text = f"{conf:.0%}"
            else:
                conf_text = "pattern"
            source_icon = "π€" if source == "ml" else "π"
            tags += f'<span style="background:{tag_bg};color:{tag_color};border:1px solid {tag_color}33;padding:2px 8px;border-radius:12px;font-size:11px;font-weight:500;margin-right:4px;">{source_icon} {item["label"]} ({conf_text})</span>'
        descs = "".join(
            f'<p style="font-size:12px;color:#6b7280;margin:4px 0 0 0;">{item["description"]}</p>'
            for item in items
        )
        preview = text[:300] + ("..." if len(text) > 300 else "")
        # FIX: real HTML escaping — the previous replace("<", "<") was a no-op,
        # letting clause text break (or inject into) the rendered markup.
        # "&" must be escaped first so already-produced entities stay intact.
        preview = preview.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
        html += f"""
        <div style="border:1px solid #e5e7eb;border-left:4px solid {border};border-radius:8px;padding:14px;margin-bottom:10px;background:#fafafa;">
            <div style="display:flex;align-items:center;gap:6px;margin-bottom:6px;">
                <span style="font-size:16px;">{icon}</span>
                <span style="font-size:12px;font-weight:600;color:{border};text-transform:uppercase;">{max_risk}</span>
            </div>
            <p style="font-size:13px;color:#374151;line-height:1.6;margin:0 0 8px 0;">{preview}</p>
            <div style="margin-bottom:6px;">{tags}</div>
            {descs}
        </div>
        """
    html += "</div>"
    return html
|
| 1376 |
-
|
| 1377 |
-
def render_entities(result):
    """Render extracted entities as colour-coded chips, grouped by entity type.

    Duplicate texts within a type are removed (first occurrence kept) and at
    most 20 chips are shown per type.

    Args:
        result: Analysis dict containing an "entities" list, or None.

    Returns:
        HTML string ("" when result is None, a placeholder div when empty).
    """
    if result is None:
        return ""
    entities = result.get("entities", [])
    if not entities:
        return '<div style="padding:16px;color:#6b7280;">No entities detected.</div>'
    # Collect entity texts per type, preserving first-seen order of types.
    by_type = {}
    for ent in entities:
        by_type.setdefault(ent["type"], []).append(ent["text"])
    # One colour per entity type; unknown types fall back to neutral grey.
    palette = {
        "DATE": "#3b82f6", "DATE_REF": "#60a5fa",
        "MONEY": "#22c55e", "PERCENTAGE": "#10b981",
        "DURATION": "#6366f1",
        "PARTY": "#8b5cf6", "PARTY_ROLE": "#a78bfa",
        "PERSON": "#ec4899",
        "JURISDICTION": "#f59e0b",
        "DEFINED_TERM": "#ec4899",
        "LEGAL_REF": "#6b7280",
        "MISC": "#9ca3af",
    }
    sections = ['<div style="font-family:system-ui,sans-serif;">']
    for etype, texts in by_type.items():
        color = palette.get(etype, "#6b7280")
        # Deduplicate while keeping order, cap at 20 chips per type.
        # NOTE(review): chip text is interpolated unescaped, matching the
        # original behavior — confirm entity text is trusted before display.
        deduped = list(dict.fromkeys(texts))[:20]
        items_html = "".join(
            f'<span style="display:inline-block;background:{color}15;color:{color};border:1px solid {color}40;padding:3px 10px;border-radius:6px;font-size:12px;margin:3px;">{t}</span>'
            for t in deduped
        )
        sections.append(f"""
        <div style="margin-bottom:12px;">
            <div style="font-size:12px;font-weight:600;color:#374151;margin-bottom:6px;text-transform:uppercase;">{etype}</div>
            <div>{items_html}</div>
        </div>
        """)
    sections.append("</div>")
    return "".join(sections)
|
| 1412 |
-
|
| 1413 |
-
def render_contradictions(result):
    """Render detected contradictions / missing clauses as severity-coloured cards.

    Each card shows a type icon, the finding type, an optional source badge
    (NLI model with confidence, or heuristic), and the explanation text.

    Args:
        result: Analysis dict containing a "contradictions" list, or None.

    Returns:
        HTML string ("" when result is None, a green all-clear div when empty).
    """
    if result is None:
        return ""
    contradictions = result.get("contradictions", [])
    if not contradictions:
        return '<div style="padding:16px;color:#16a34a;">β No contradictions or missing clauses detected.</div>'
    cards = ['<div style="font-family:system-ui,sans-serif;">']
    for c in contradictions:
        sev_color = RISK_STYLES[c["severity"]][0]
        icon = "β οΈ" if c["type"] == "CONTRADICTION" else "π"
        # Badge indicating how the finding was produced (ML NLI vs heuristic).
        badge_by_source = {
            "nli_model": f'<span style="font-size:10px;background:#eff6ff;color:#3b82f6;padding:1px 6px;border-radius:4px;margin-left:8px;">π€ NLI {c.get("confidence", 0):.0%}</span>',
            "heuristic": '<span style="font-size:10px;background:#fef3c7;color:#92400e;padding:1px 6px;border-radius:4px;margin-left:8px;">π Heuristic</span>',
        }
        source_badge = badge_by_source.get(c.get("source", ""), "")
        cards.append(f"""
        <div style="border:1px solid #e5e7eb;border-left:4px solid {sev_color};border-radius:8px;padding:12px;margin-bottom:8px;background:#fafafa;">
            <div style="display:flex;align-items:center;gap:6px;margin-bottom:4px;">
                <span>{icon}</span>
                <span style="font-size:12px;font-weight:600;color:{sev_color};">{c["type"]}</span>
                {source_badge}
            </div>
            <p style="font-size:13px;color:#374151;margin:0;">{c["explanation"]}</p>
        </div>
        """)
    cards.append("</div>")
    return "".join(cards)
|
| 1442 |
-
|
| 1443 |
-
def render_document_viewer(result):
    """Render the raw contract text with inline colour-highlighted entity spans.

    Entities are processed in ascending start-offset order; a span that
    overlaps the previously rendered one is skipped so the text is never
    duplicated or cut mid-span.

    Args:
        result: Analysis dict with "raw_text" and an "entities" list of
            dicts carrying "type", "start", "end" character offsets, or None.

    Returns:
        HTML string ("" when result is None).
    """
    if result is None:
        return ""

    def _esc(s):
        # FIX: real HTML escaping — the previous replace("<", "<") chains were
        # no-ops, so document text could break or inject into the markup.
        # "&" is escaped first so produced entities survive intact.
        return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

    text = result.get("raw_text", "")
    entities = sorted(result.get("entities", []), key=lambda x: x["start"])
    html_parts = []
    last_end = 0
    entity_colors = {
        "DATE": "#3b82f6", "DATE_REF": "#60a5fa", "MONEY": "#22c55e",
        "PERCENTAGE": "#10b981", "DURATION": "#6366f1", "PARTY": "#8b5cf6",
        "PARTY_ROLE": "#a78bfa", "PERSON": "#ec4899", "JURISDICTION": "#f59e0b",
        "DEFINED_TERM": "#ec4899", "LEGAL_REF": "#6b7280", "MISC": "#9ca3af",
    }
    for e in entities:
        # Skip spans overlapping an already-rendered entity; advancing
        # last_end only for rendered spans prevents dropping plain text.
        if e["start"] >= last_end:
            html_parts.append(_esc(text[last_end:e["start"]]))
            color = entity_colors.get(e["type"], "#6b7280")
            entity_text = _esc(text[e["start"]:e["end"]])
            html_parts.append(
                f'<span style="background:{color}20;color:{color};border-bottom:2px solid {color};padding:0 2px;border-radius:2px;" '
                f'title="{e["type"]}">{entity_text}</span>'
            )
            last_end = e["end"]
    # Trailing plain text after the final entity.
    if last_end < len(text):
        html_parts.append(_esc(text[last_end:]))
    return f'<div style="font-family:ui-monospace,monospace;font-size:13px;line-height:1.8;white-space:pre-wrap;padding:16px;">{"".join(html_parts)}</div>'
|
|
|
|
| 1 |
+
${file:/app/app.py}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|