🚨 FIX: Restore missing Gradio UI — app was broken (no UI rendered)

#5
by gaurv007 - opened
Files changed (1) hide show
  1. app.py +1 -1469
app.py CHANGED
@@ -1,1469 +1 @@
1
- """
2
- ClauseGuard — World's Best Legal Contract Analysis Tool (v4.3)
3
- ═══════════════════════════════════════════════════════════════
4
- PERF v4.3:
5
- β€’ PERF: Upgraded embedder to BAAI/bge-small-en-v1.5 (+21% retrieval accuracy)
6
- β€’ PERF: Batched clause classification (single forward pass, batch_size=8)
7
- β€’ PERF: ONNX INT8 quantized model support (2-4x faster on CPU)
8
- β€’ PERF: torch.set_num_threads(2) to prevent CPU thrashing
9
- • NEW: ml/export_onnx_v2.py — full merge→ONNX→quantize pipeline
10
-
11
- Fixes in v4.2:
12
- β€’ FIX: NLI now uses CrossEncoder.predict() β€” contradictions actually work
13
- β€’ FIX: BoundedCache uses threading.RLock β€” no more race conditions
14
- β€’ FIX: Pre-compiled ALL regex patterns at module level (perf)
15
- β€’ FIX: Added missing regex labels to RISK_MAP/DESC_MAP
16
- β€’ FIX: Extension risk formula matches backend
17
- β€’ FIX: Extension API_BASE URL corrected
18
- β€’ FIX: API CORS localhost requires explicit opt-in
19
-
20
- Fixes in v4.1:
21
- β€’ FIX: Bounded LRU caches (chunk_cache, prediction_cache) β€” no more memory leaks
22
- β€’ FIX: NLI input format β€” pass (text_a, text_b) tuple, not [SEP]-concatenated string
23
- β€’ FIX: Classifier max_length raised to 512 (was 256 β€” truncating legal clauses)
24
- β€’ FIX: Risk score formula β€” absolute risk, not normalized by total_clauses
25
- β€’ FIX: Train/inference alignment β€” use softmax+argmax for single-label model
26
- β€’ FIX: Added missing regex fallback patterns for more CUAD categories
27
- β€’ FIX: Entity extraction batching β€” single pipeline call instead of sequential
28
- β€’ PERF: Shared model singleton via models.py module
29
- β€’ PERF: LRU-bounded caches everywhere
30
-
31
- Carried from v4.0:
32
- β€’ OCR support for scanned PDFs (docTR engine with smart native/scanned routing)
33
- β€’ Contract Q&A Chatbot (RAG: embedding retrieval + HF Inference API streaming)
34
- β€’ Clause Redlining (3-tier: template lookup + RAG + LLM refinement)
35
- β€’ Fixed CUAD label mapping (added missing index 6)
36
- β€’ Structure-aware clause splitting
37
- β€’ Real NLI contradiction detection via cross-encoder model
38
- β€’ ML-based Legal NER with regex fallback
39
- β€’ Semantic compliance checking with negation handling
40
- β€’ Improved obligation extraction with false-positive filtering
41
- β€’ LLM-powered clause explanations
42
- β€’ Per-session temp files (no collision)
43
- β€’ Model health reporting
44
-
45
- Models:
46
- β€’ Clause classifier: Mokshith31/legalbert-contract-clause-classification
47
- (LoRA adapter on nlpaueb/legal-bert-base-uncased, 41 CUAD classes)
48
- β€’ Legal NER: matterstack/legal-bert-ner (token classification)
49
- β€’ NLI: cross-encoder/nli-deberta-v3-base (contradiction detection)
50
- β€’ Embeddings: sentence-transformers/all-MiniLM-L6-v2 (RAG retrieval)
51
- β€’ OCR: docTR fast_base + crnn_vgg16_bn (scanned PDF extraction)
52
- β€’ LLM: Qwen/Qwen2.5-7B-Instruct via HF Inference API (chatbot + redlining)
53
- """
54
-
55
- import os
56
- import re
57
- import json
58
- import csv
59
- import io
60
- import uuid
61
- import tempfile
62
- import hashlib
63
- import threading
64
- from collections import defaultdict, OrderedDict
65
- from datetime import datetime
66
- from functools import lru_cache
67
-
68
- import gradio as gr
69
- import numpy as np
70
-
71
- # ── Document parsers (soft-fail) ────────────────────────────────────
72
- try:
73
- import pdfplumber
74
- _HAS_PDF = True
75
- except Exception:
76
- _HAS_PDF = False
77
-
78
- try:
79
- from docx import Document as DocxDocument
80
- _HAS_DOCX = True
81
- except Exception:
82
- _HAS_DOCX = False
83
-
84
- # ── PyTorch / Transformers (soft-fail) ────────────────────────────────
85
- _HAS_TORCH = False
86
- _HAS_NER_MODEL = False
87
- _HAS_NLI_MODEL = False
88
-
89
- try:
90
- import torch
91
- from transformers import (
92
- AutoTokenizer, AutoModelForSequenceClassification,
93
- AutoModelForTokenClassification, pipeline
94
- )
95
- from peft import PeftModel
96
- _HAS_TORCH = True
97
- # PERF v4.3: Limit PyTorch threads to avoid CPU thrashing under concurrent requests.
98
- # HF Spaces CPU-basic has 2 vCPUs. Reserve 1 thread for Gradio server.
99
- torch.set_num_threads(2)
100
- torch.set_num_interop_threads(1)
101
- except Exception:
102
- pass
103
-
104
- # ── ONNX Runtime (soft-fail, for quantized model) ─────────────────────
105
- _HAS_ORT = False
106
- try:
107
- from optimum.onnxruntime import ORTModelForSequenceClassification as _ORTModel
108
- _HAS_ORT = True
109
- except ImportError:
110
- pass
111
-
112
- # ── CrossEncoder for NLI (soft-fail) ──────────────────────────────────
113
- _HAS_CROSS_ENCODER = False
114
- try:
115
- from sentence_transformers import CrossEncoder as _CrossEncoder
116
- _HAS_CROSS_ENCODER = True
117
- except ImportError:
118
- pass
119
-
120
- # ── Import submodules ────────────────────────────────────────────────
121
- from compare import compare_contracts, render_comparison_html
122
- from obligations import extract_obligations, render_obligations_html
123
- from compliance import check_compliance, render_compliance_html
124
- from ocr_engine import parse_pdf_smart, get_ocr_status
125
- from chatbot import index_contract, chat_respond, get_chatbot_status
126
- from redlining import generate_redlines, render_redlines_html
127
-
128
- # ═══════════════════════════════════════════════════════════════════════
129
- # 1. CONFIGURATION β€” FIXED label mapping (41 labels, index 6 restored)
130
- # ═══════════════════════════════════════════════════════════════════════
131
-
132
- CUAD_LABELS = [
133
- "Document Name", # 0
134
- "Parties", # 1
135
- "Agreement Date", # 2
136
- "Effective Date", # 3
137
- "Expiration Date", # 4
138
- "Renewal Term", # 5
139
- "Notice Period to Terminate Renewal", # 6 ← WAS MISSING
140
- "Governing Law", # 7
141
- "Most Favored Nation", # 8
142
- "Non-Compete", # 9
143
- "Exclusivity", # 10
144
- "No-Solicit of Customers", # 11
145
- "No-Solicit of Employees", # 12
146
- "Non-Disparagement", # 13
147
- "Termination for Convenience", # 14
148
- "ROFR/ROFO/ROFN", # 15
149
- "Change of Control", # 16
150
- "Anti-Assignment", # 17
151
- "Revenue/Profit Sharing", # 18
152
- "Price Restriction", # 19
153
- "Minimum Commitment", # 20
154
- "Volume Restriction", # 21
155
- "IP Ownership Assignment", # 22
156
- "Joint IP Ownership", # 23
157
- "License Grant", # 24
158
- "Non-Transferable License", # 25
159
- "Affiliate License-Licensor", # 26
160
- "Affiliate License-Licensee", # 27
161
- "Unlimited/All-You-Can-Eat License", # 28
162
- "Irrevocable or Perpetual License", # 29
163
- "Source Code Escrow", # 30
164
- "Post-Termination Services", # 31
165
- "Audit Rights", # 32
166
- "Uncapped Liability", # 33
167
- "Cap on Liability", # 34
168
- "Liquidated Damages", # 35
169
- "Warranty Duration", # 36
170
- "Insurance", # 37
171
- "Covenant Not to Sue", # 38
172
- "Third Party Beneficiary", # 39
173
- "Other", # 40
174
- ]
175
-
176
- _UNFAIR_LABELS = [
177
- "Limitation of liability", "Unilateral termination", "Unilateral change",
178
- "Content removal", "Contract by using", "Choice of law",
179
- "Jurisdiction", "Arbitration"
180
- ]
181
-
182
- # FIX v4.2: Include regex-only labels that aren't in CUAD or Unfair lists
183
- _EXTRA_REGEX_LABELS = [
184
- "Indemnification", "Confidentiality", "Force Majeure", "Penalties"
185
- ]
186
-
187
- _ALL_LABELS = CUAD_LABELS + _UNFAIR_LABELS + _EXTRA_REGEX_LABELS
188
-
189
- RISK_MAP = {
190
- # Critical
191
- "Uncapped Liability": "CRITICAL",
192
- "Arbitration": "CRITICAL",
193
- "IP Ownership Assignment": "CRITICAL",
194
- "Termination for Convenience": "CRITICAL",
195
- "Limitation of liability": "CRITICAL",
196
- "Unilateral termination": "CRITICAL",
197
- "Liquidated Damages": "CRITICAL",
198
- # High
199
- "Non-Compete": "HIGH",
200
- "Exclusivity": "HIGH",
201
- "Change of Control": "HIGH",
202
- "No-Solicit of Customers": "HIGH",
203
- "No-Solicit of Employees": "HIGH",
204
- "Unilateral change": "HIGH",
205
- "Content removal": "HIGH",
206
- "Anti-Assignment": "HIGH",
207
- "Notice Period to Terminate Renewal": "HIGH",
208
- # Medium
209
- "Governing Law": "MEDIUM",
210
- "Jurisdiction": "MEDIUM",
211
- "Choice of law": "MEDIUM",
212
- "Price Restriction": "MEDIUM",
213
- "Minimum Commitment": "MEDIUM",
214
- "Volume Restriction": "MEDIUM",
215
- "Non-Disparagement": "MEDIUM",
216
- "Most Favored Nation": "MEDIUM",
217
- "Revenue/Profit Sharing": "MEDIUM",
218
- "Warranty Duration": "MEDIUM",
219
- # Low
220
- "Document Name": "LOW",
221
- "Parties": "LOW",
222
- "Agreement Date": "LOW",
223
- "Effective Date": "LOW",
224
- "Expiration Date": "LOW",
225
- "Renewal Term": "LOW",
226
- "Joint IP Ownership": "LOW",
227
- "License Grant": "LOW",
228
- "Non-Transferable License": "LOW",
229
- "Affiliate License-Licensor": "LOW",
230
- "Affiliate License-Licensee": "LOW",
231
- "Unlimited/All-You-Can-Eat License": "LOW",
232
- "Irrevocable or Perpetual License": "LOW",
233
- "Source Code Escrow": "LOW",
234
- "Post-Termination Services": "LOW",
235
- "Audit Rights": "LOW",
236
- "Cap on Liability": "LOW",
237
- "Insurance": "LOW",
238
- "Covenant Not to Sue": "LOW",
239
- "Third Party Beneficiary": "LOW",
240
- "Other": "LOW",
241
- "ROFR/ROFO/ROFN": "LOW",
242
- "Contract by using": "LOW",
243
- # FIX v4.2: Added regex-only labels that were missing from RISK_MAP
244
- "Indemnification": "HIGH",
245
- "Confidentiality": "MEDIUM",
246
- "Force Majeure": "LOW",
247
- "Penalties": "HIGH",
248
- }
249
-
250
- DESC_MAP = {label: label.replace("_", " ") for label in _ALL_LABELS}
251
- DESC_MAP.update({
252
- "Limitation of liability": "Company limits or excludes liability for losses, data breaches, or service failures.",
253
- "Unilateral termination": "Company can terminate your account at any time without reason.",
254
- "Unilateral change": "Company can change terms at any time without your consent.",
255
- "Content removal": "Company can delete your content without notice or justification.",
256
- "Contract by using": "You are bound to the contract simply by using the service.",
257
- "Choice of law": "Governing law may differ from your country, reducing your legal protections.",
258
- "Jurisdiction": "Disputes must be resolved in a jurisdiction that may disadvantage you.",
259
- "Arbitration": "Forces disputes to arbitration instead of court. You waive your right to sue.",
260
- "Uncapped Liability": "No financial limit on damages the party may be liable for.",
261
- "Cap on Liability": "Maximum financial liability is explicitly capped.",
262
- "Non-Compete": "Restrictions on competing with the counter-party.",
263
- "Exclusivity": "Obligation to deal exclusively with one party.",
264
- "IP Ownership Assignment": "Intellectual property rights are transferred entirely.",
265
- "Termination for Convenience": "Either party may terminate without cause or notice.",
266
- "Governing Law": "Specifies which jurisdiction's laws apply.",
267
- "Non-Disparagement": "Agreement not to speak negatively about the other party.",
268
- "ROFR/ROFO/ROFN": "Right of First Refusal / Offer / Negotiation clause.",
269
- "Change of Control": "Provisions triggered by ownership or control changes.",
270
- "Anti-Assignment": "Restrictions on transferring contract rights to third parties.",
271
- "Liquidated Damages": "Pre-determined damages amount for breach of contract.",
272
- "Source Code Escrow": "Third-party holds source code for release under defined conditions.",
273
- "Post-Termination Services": "Services to be provided after the contract ends.",
274
- "Audit Rights": "Right to inspect records or verify compliance.",
275
- "Warranty Duration": "Length of time warranties remain in effect.",
276
- "Covenant Not to Sue": "Agreement not to bring legal action against a party.",
277
- "Third Party Beneficiary": "Non-party who benefits from the contract terms.",
278
- "Insurance": "Insurance coverage requirements.",
279
- "Revenue/Profit Sharing": "Revenue or profit sharing arrangements between parties.",
280
- "Price Restriction": "Restrictions on pricing or discounting.",
281
- "Minimum Commitment": "Minimum purchase or usage commitment.",
282
- "Volume Restriction": "Limits on volume of goods or services.",
283
- "License Grant": "Permission to use intellectual property.",
284
- "Non-Transferable License": "License that cannot be transferred to third parties.",
285
- "Irrevocable or Perpetual License": "License that cannot be revoked or lasts indefinitely.",
286
- "Unlimited/All-You-Can-Eat License": "License with no usage limits.",
287
- "Notice Period to Terminate Renewal": "Required notice period before automatic renewal.",
288
- # FIX v4.2: Added descriptions for regex-only labels
289
- "Indemnification": "Obligation to compensate the other party for losses or damages.",
290
- "Confidentiality": "Restrictions on sharing proprietary or sensitive information.",
291
- "Force Majeure": "Excuses performance due to extraordinary events beyond control.",
292
- "Penalties": "Financial penalties for breach or late performance.",
293
- })
294
-
295
- RISK_WEIGHTS = {"CRITICAL": 40, "HIGH": 20, "MEDIUM": 10, "LOW": 3}
296
-
297
- RISK_STYLES = {
298
- "CRITICAL": ("#dc2626", "#fef2f2", "⚠️"),
299
- "HIGH": ("#ea580c", "#fff7ed", "⚑"),
300
- "MEDIUM": ("#ca8a04", "#fefce8", "πŸ“‹"),
301
- "LOW": ("#16a34a", "#f0fdf4", "βœ“"),
302
- }
303
-
304
- # ═══════════════════════════════════════════════════════════════════════
305
- # FIX v4.1: Per-class thresholds aligned with single-label softmax
306
- # The model was trained with cross-entropy (single-label), so inference
307
- # now uses softmax+argmax, not sigmoid. Thresholds apply to softmax probs.
308
- # ═══════════════════════════════════════════════════════════════════════
309
- _CUAD_THRESHOLDS = {}
310
- _WEAK_CLASSES = {0, 1, 2, 7, 9, 21, 22, 27, 37, 38}
311
- for _i in range(41):
312
- if _i in _WEAK_CLASSES:
313
- _CUAD_THRESHOLDS[_i] = 0.85 # Only flag if very confident (these classes are unreliable)
314
- else:
315
- _CUAD_THRESHOLDS[_i] = 0.40 # Reasonable threshold for softmax outputs
316
-
317
- # ═══════════════════════════════════════════════════════════════════════
318
- # FIX v4.1: Bounded LRU Cache utility (replaces unbounded dicts)
319
- # ═══════════════════════════════════════════════════════════════════════
320
-
321
- class BoundedCache:
322
- """Thread-safe bounded LRU cache using OrderedDict + RLock.
323
- FIX v4.2: Added threading.RLock to prevent race conditions under
324
- Gradio's concurrent request handling. OrderedDict compound operations
325
- (contains + setitem + move_to_end + popitem) are NOT atomic even with GIL."""
326
- def __init__(self, maxsize=1000):
327
- self._cache = OrderedDict()
328
- self._maxsize = maxsize
329
- self._lock = threading.RLock()
330
-
331
- def get(self, key, default=None):
332
- with self._lock:
333
- if key in self._cache:
334
- self._cache.move_to_end(key)
335
- return self._cache[key]
336
- return default
337
-
338
- def put(self, key, value):
339
- with self._lock:
340
- if key in self._cache:
341
- self._cache.move_to_end(key)
342
- self._cache[key] = value
343
- else:
344
- if len(self._cache) >= self._maxsize:
345
- self._cache.popitem(last=False)
346
- self._cache[key] = value
347
-
348
- def __contains__(self, key):
349
- with self._lock:
350
- return key in self._cache
351
-
352
- def __len__(self):
353
- with self._lock:
354
- return len(self._cache)
355
-
356
-
357
- # ═══════════════════════════════════════════════════════════════════════
358
- # 2. MODEL LOADING
359
- # ═══════════════════════════════════════════════════════════════════════
360
-
361
- cuad_tokenizer = None
362
- cuad_model = None
363
- ner_pipeline = None
364
- nli_model = None # FIX v4.2: CrossEncoder instead of pipeline
365
- _model_status = {"cuad": "not_loaded", "ner": "not_loaded", "nli": "not_loaded"}
366
-
367
- def _load_cuad_model():
368
- global cuad_tokenizer, cuad_model, _model_status
369
- # PERF v4.3: Try ONNX quantized model first (2-4x faster on CPU)
370
- onnx_model_path = os.environ.get("ONNX_MODEL_PATH", "")
371
- onnx_hub_id = os.environ.get("ONNX_HUB_MODEL_ID", "gaurv007/clauseguard-onnx-int8")
372
-
373
- if _HAS_ORT:
374
- for source in [onnx_model_path, onnx_hub_id]:
375
- if not source:
376
- continue
377
- try:
378
- print(f"[ClauseGuard] Trying ONNX model: {source}")
379
- cuad_model = _ORTModel.from_pretrained(source, file_name="model_quantized.onnx")
380
- cuad_tokenizer = AutoTokenizer.from_pretrained(source)
381
- _model_status["cuad"] = "loaded (ONNX INT8)"
382
- print(f"[ClauseGuard] ONNX INT8 model loaded from {source}")
383
- return
384
- except Exception as e:
385
- print(f"[ClauseGuard] ONNX load failed from {source}: {e}")
386
-
387
- # Fallback to PyTorch PEFT model
388
- if not _HAS_TORCH:
389
- print("[ClauseGuard] PyTorch not available β€” using regex fallback")
390
- _model_status["cuad"] = "unavailable"
391
- return
392
- try:
393
- base = "nlpaueb/legal-bert-base-uncased"
394
- adapter = "Mokshith31/legalbert-contract-clause-classification"
395
- print(f"[ClauseGuard] Loading CUAD classifier (PyTorch): {adapter}")
396
- cuad_tokenizer = AutoTokenizer.from_pretrained(base)
397
- base_model = AutoModelForSequenceClassification.from_pretrained(
398
- base, num_labels=41, ignore_mismatched_sizes=True
399
- )
400
- cuad_model = PeftModel.from_pretrained(base_model, adapter)
401
- cuad_model.eval()
402
- _model_status["cuad"] = "loaded (PyTorch)"
403
- print("[ClauseGuard] CUAD model loaded successfully (PyTorch)")
404
- except Exception as e:
405
- print(f"[ClauseGuard] CUAD model load failed: {e}")
406
- cuad_tokenizer = None
407
- cuad_model = None
408
- _model_status["cuad"] = f"failed: {e}"
409
-
410
- def _load_ner_model():
411
- global ner_pipeline, _model_status, _HAS_NER_MODEL
412
- if not _HAS_TORCH:
413
- _model_status["ner"] = "unavailable"
414
- return
415
- try:
416
- print("[ClauseGuard] Loading Legal NER model: matterstack/legal-bert-ner")
417
- ner_pipeline = pipeline(
418
- "ner",
419
- model="matterstack/legal-bert-ner",
420
- aggregation_strategy="simple",
421
- device=-1, # CPU
422
- )
423
- _HAS_NER_MODEL = True
424
- _model_status["ner"] = "loaded"
425
- print("[ClauseGuard] Legal NER model loaded successfully")
426
- except Exception as e:
427
- print(f"[ClauseGuard] Legal NER model load failed (using regex fallback): {e}")
428
- _model_status["ner"] = f"failed: {e}"
429
-
430
- def _load_nli_model():
431
- global nli_model, _model_status, _HAS_NLI_MODEL
432
- if not _HAS_CROSS_ENCODER:
433
- _model_status["nli"] = "unavailable (sentence-transformers not installed)"
434
- return
435
- try:
436
- print("[ClauseGuard] Loading NLI model: cross-encoder/nli-deberta-v3-base (CrossEncoder)")
437
- nli_model = _CrossEncoder("cross-encoder/nli-deberta-v3-base")
438
- _HAS_NLI_MODEL = True
439
- _model_status["nli"] = "loaded"
440
- print("[ClauseGuard] NLI CrossEncoder loaded successfully")
441
- except Exception as e:
442
- print(f"[ClauseGuard] NLI model load failed (using heuristic fallback): {e}")
443
- _model_status["nli"] = f"failed: {e}"
444
-
445
- def get_model_status_text():
446
- """Return human-readable model status."""
447
- parts = []
448
- for name, status in _model_status.items():
449
- icon = "βœ…" if status == "loaded" else "⚠️" if "failed" in status else "❌"
450
- label = {"cuad": "Clause Classifier", "ner": "Legal NER", "nli": "NLI Contradiction"}[name]
451
- parts.append(f"{icon} {label}: {status}")
452
- return " Β· ".join(parts)
453
-
454
- # Load models at startup
455
- _load_cuad_model()
456
- _load_ner_model()
457
- _load_nli_model()
458
-
459
- # ═══════════════════════════════════════════════════════════════════════
460
- # 3. DOCUMENT PARSING
461
- # ═══════════════════════════════════════════════════════════════════════
462
-
463
- def parse_pdf(file_path):
464
- """Smart PDF parser: native text extraction with OCR fallback for scanned PDFs."""
465
- text, error, method = parse_pdf_smart(file_path)
466
- if text:
467
- if method == "ocr":
468
- print(f"[ClauseGuard] PDF extracted via OCR ({len(text)} chars)")
469
- return text, None
470
- if error:
471
- return None, error
472
- return None, "Could not extract text from PDF. Try uploading a clearer scan or digital PDF."
473
-
474
- def parse_docx(file_path):
475
- if not _HAS_DOCX:
476
- return None, "DOCX parsing not available (python-docx not installed)"
477
- try:
478
- doc = DocxDocument(file_path)
479
- paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
480
- return "\n\n".join(paragraphs), None
481
- except Exception as e:
482
- return None, f"DOCX parse error: {e}"
483
-
484
- def parse_document(file_path):
485
- if file_path is None:
486
- return None, "No file uploaded"
487
- ext = os.path.splitext(file_path)[1].lower()
488
- if ext == ".pdf":
489
- return parse_pdf(file_path)
490
- elif ext in (".docx", ".doc"):
491
- return parse_docx(file_path)
492
- elif ext in (".txt", ".md", ".rst"):
493
- try:
494
- with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
495
- return f.read(), None
496
- except Exception as e:
497
- return None, f"Text read error: {e}"
498
- else:
499
- return None, f"Unsupported file type: {ext}"
500
-
501
- # ═══════════════════════════════════════════════════════════════════════
502
- # 4. DETERMINISTIC CLAUSE SPLITTING
503
- # FIX v4.1: Bounded cache (max 500 documents) instead of unbounded dict
504
- # ═══════════════════════════════════════════════════════════════════════
505
-
506
- _chunk_cache = BoundedCache(maxsize=500)
507
-
508
- # FIX v4.2: Pre-compile section pattern at module level (was recompiling per call)
509
- _SECTION_PATTERN = re.compile(
510
- r'(?:^|\n\n)'
511
- r'(?='
512
- r'\d+(?:\.\d+)*[.)]\s' # 1. 2. 3.1. 3.1)
513
- r'|[A-Z]{2,}[A-Z\s]*\n' # ALL CAPS HEADERS
514
- r'|\([a-z]\)\s' # (a) (b) (c)
515
- r'|(?:Section|Article|Clause)\s+\d+' # Section 1, Article 2
516
- r')',
517
- re.MULTILINE
518
- )
519
-
520
- def split_clauses(text):
521
- """Deterministic, structure-aware clause splitting.
522
- Same input ALWAYS produces same output. Normalized text is hashed
523
- and cached so repeated runs on identical documents are identical."""
524
- normalized = re.sub(r'\s+', ' ', text.strip())
525
- text_hash = hashlib.sha256(normalized.encode()).hexdigest()
526
- cached = _chunk_cache.get(text_hash)
527
- if cached is not None:
528
- return cached
529
-
530
- text = re.sub(r'\n{3,}', '\n\n', text.strip())
531
-
532
- # First try to detect numbered sections (1., 2., 3.1, (a), etc.)
533
- positions = [m.start() for m in _SECTION_PATTERN.finditer(text)]
534
-
535
- if len(positions) >= 3:
536
- clauses = []
537
- for i, pos in enumerate(positions):
538
- end = positions[i + 1] if i + 1 < len(positions) else len(text)
539
- chunk = text[pos:end].strip()
540
- if len(chunk) > 30:
541
- if len(chunk) > 1500:
542
- sub_parts = chunk.split('\n\n')
543
- current = ""
544
- for sp in sub_parts:
545
- if len(current) + len(sp) < 1200:
546
- current += ("\n\n" + sp if current else sp)
547
- else:
548
- if len(current.strip()) > 30:
549
- clauses.append(current.strip())
550
- current = sp
551
- if len(current.strip()) > 30:
552
- clauses.append(current.strip())
553
- else:
554
- clauses.append(chunk)
555
- if positions and positions[0] > 50:
556
- preamble = text[:positions[0]].strip()
557
- if len(preamble) > 30:
558
- clauses.insert(0, preamble)
559
- result = clauses if clauses else _fallback_split(text)
560
- _chunk_cache.put(text_hash, result)
561
- return result
562
- else:
563
- result = _fallback_split(text)
564
- _chunk_cache.put(text_hash, result)
565
- return result
566
-
567
- def _fallback_split(text):
568
- """Fallback: split on paragraph breaks and sentence boundaries."""
569
- paragraphs = text.split('\n\n')
570
- if len(paragraphs) >= 3:
571
- clauses = []
572
- for p in paragraphs:
573
- p = p.strip()
574
- if len(p) > 30:
575
- if len(p) > 1500:
576
- sents = re.split(r'(?<=[.!?])\s+(?=[A-Z])', p)
577
- current = ""
578
- for s in sents:
579
- if len(current) + len(s) < 1000:
580
- current += (" " + s if current else s)
581
- else:
582
- if len(current.strip()) > 30:
583
- clauses.append(current.strip())
584
- current = s
585
- if len(current.strip()) > 30:
586
- clauses.append(current.strip())
587
- else:
588
- clauses.append(p)
589
- return clauses
590
-
591
- parts = re.split(r'(?<=[.!?])\s+(?=[A-Z0-9(])', text)
592
- return [p.strip() for p in parts if len(p.strip()) > 30]
593
-
594
- # ═══════════════════════════════════════════════════════════════════════
595
- # 5. CLAUSE DETECTION
596
- # FIX v4.1: Use softmax (matching training) instead of sigmoid
597
- # FIX v4.1: max_length raised to 512 (was 256)
598
- # FIX v4.1: Bounded prediction cache
599
- # ═══════════════════════════════════════════════════════════════════════
600
-
601
- _HEADING_RE = re.compile(r'^\d+(?:\.\d+)*\s+[A-Z][A-Z\s&,/]+$', re.MULTILINE)
602
-
603
- def _strip_heading(text):
604
- """Remove leading section headings that confuse the classifier."""
605
- lines = text.split('\n')
606
- if lines and _HEADING_RE.match(lines[0].strip()):
607
- stripped = '\n'.join(lines[1:]).strip()
608
- return stripped if len(stripped) > 20 else text
609
- return text
610
-
611
- _LABEL_GUARDRAILS = {
612
- "Liquidated Damages": re.compile(
613
- r'liquidated|pre-?determined.{0,10}damage|agreed.{0,10}sum|penalty clause|stipulated.{0,10}damage',
614
- re.IGNORECASE
615
- ),
616
- "Uncapped Liability": re.compile(
617
- r'uncapped|unlimited.{0,10}liabilit|no.{0,10}(limit|cap).{0,10}liabilit',
618
- re.IGNORECASE
619
- ),
620
- }
621
-
622
- def _apply_guardrails(label, text, confidence):
623
- guard = _LABEL_GUARDRAILS.get(label)
624
- if guard and not guard.search(text):
625
- return "Other", confidence * 0.3
626
- return label, confidence
627
-
628
- def _text_hash(text):
629
- return hashlib.md5(text.encode()).hexdigest()
630
-
631
- # FIX v4.1: Bounded prediction cache
632
- _prediction_cache = BoundedCache(maxsize=2000)
633
-
634
- def classify_cuad(clause_text):
635
- if cuad_model is None or cuad_tokenizer is None:
636
- return _classify_regex(clause_text)
637
-
638
- clean_text = _strip_heading(clause_text)
639
-
640
- h = _text_hash(clean_text[:512])
641
- cached = _prediction_cache.get(h)
642
- if cached is not None:
643
- return cached
644
-
645
- try:
646
- # FIX v4.1: max_length=512 (was 256 β€” truncating long legal clauses)
647
- inputs = cuad_tokenizer(
648
- clean_text,
649
- return_tensors="pt",
650
- truncation=True,
651
- max_length=512,
652
- padding=True
653
- )
654
- with torch.no_grad():
655
- logits = cuad_model(**inputs).logits
656
-
657
- # FIX v4.1: Use softmax (matching single-label cross-entropy training)
658
- # The model was trained with F.cross_entropy, so softmax is correct.
659
- probs = torch.softmax(logits, dim=-1)[0]
660
-
661
- # Get the top prediction
662
- top_prob, top_idx = torch.max(probs, dim=0)
663
- top_idx = int(top_idx)
664
- top_conf = float(top_prob)
665
-
666
- results = []
667
-
668
- # Primary prediction
669
- threshold = _CUAD_THRESHOLDS.get(top_idx, 0.40)
670
- if top_conf > threshold and top_idx < len(CUAD_LABELS):
671
- label = CUAD_LABELS[top_idx]
672
- conf = top_conf
673
- label, conf = _apply_guardrails(label, clause_text, conf)
674
- if not (label == "Other" and conf < 0.3):
675
- risk = RISK_MAP.get(label, "LOW")
676
- results.append({
677
- "label": label,
678
- "confidence": round(conf, 3),
679
- "risk": risk,
680
- "description": DESC_MAP.get(label, label),
681
- "source": "ml",
682
- })
683
-
684
- # Also check 2nd-best prediction if confident enough
685
- if len(probs) > 1:
686
- sorted_probs, sorted_indices = torch.sort(probs, descending=True)
687
- if len(sorted_probs) > 1:
688
- second_idx = int(sorted_indices[1])
689
- second_conf = float(sorted_probs[1])
690
- second_threshold = _CUAD_THRESHOLDS.get(second_idx, 0.40)
691
- if second_conf > second_threshold and second_idx < len(CUAD_LABELS):
692
- label2 = CUAD_LABELS[second_idx]
693
- conf2 = second_conf
694
- label2, conf2 = _apply_guardrails(label2, clause_text, conf2)
695
- if not (label2 == "Other" and conf2 < 0.3):
696
- # Only add if different from primary
697
- if not results or results[0]["label"] != label2:
698
- risk2 = RISK_MAP.get(label2, "LOW")
699
- results.append({
700
- "label": label2,
701
- "confidence": round(conf2, 3),
702
- "risk": risk2,
703
- "description": DESC_MAP.get(label2, label2),
704
- "source": "ml",
705
- })
706
-
707
- results.sort(key=lambda x: x["confidence"], reverse=True)
708
-
709
- # If no ML results, also try regex to catch what model misses
710
- if not results:
711
- results = _classify_regex(clause_text)
712
-
713
- _prediction_cache.put(h, results)
714
- return results
715
- except Exception as e:
716
- print(f"[ClauseGuard] CUAD inference error: {e}")
717
- return _classify_regex(clause_text)
718
-
719
- # ═══════════════════════════════════════════════════════════════════════
720
- # 5b. BATCHED CLAUSE CLASSIFICATION
721
- # PERF v4.3: Single forward pass for all clauses instead of one-by-one
722
- # ═══════════════════════════════════════════════════════════════════════
723
-
724
- def classify_cuad_batch(clauses, batch_size=8):
725
- """Classify a batch of clauses in a single forward pass.
726
- PERF v4.3: Replaces sequential classify_cuad() loop.
727
- On CPU, batch_size=8 balances memory vs throughput."""
728
- if cuad_model is None or cuad_tokenizer is None:
729
- # Fallback to regex for all clauses
730
- return [_classify_regex(c) for c in clauses]
731
-
732
- all_results = []
733
- # Check cache first, collect uncached clauses
734
- uncached_indices = []
735
- uncached_texts = []
736
- for i, clause in enumerate(clauses):
737
- clean = _strip_heading(clause)
738
- h = _text_hash(clean[:512])
739
- cached = _prediction_cache.get(h)
740
- if cached is not None:
741
- all_results.append((i, cached))
742
- else:
743
- uncached_indices.append(i)
744
- uncached_texts.append(clean)
745
- all_results.append((i, None)) # placeholder
746
-
747
- if not uncached_texts:
748
- return [r for _, r in sorted(all_results)]
749
-
750
- # Process uncached in batches
751
- for batch_start in range(0, len(uncached_texts), batch_size):
752
- batch_texts = uncached_texts[batch_start:batch_start + batch_size]
753
- batch_original = [clauses[uncached_indices[batch_start + j]] for j in range(len(batch_texts))]
754
-
755
- try:
756
- inputs = cuad_tokenizer(
757
- batch_texts,
758
- return_tensors="pt",
759
- truncation=True,
760
- max_length=512,
761
- padding=True,
762
- )
763
- with torch.no_grad():
764
- logits = cuad_model(**inputs).logits
765
-
766
- probs = torch.softmax(logits, dim=-1)
767
-
768
- for j in range(len(batch_texts)):
769
- clause_probs = probs[j]
770
- original_text = batch_original[j]
771
- results = []
772
-
773
- # Primary prediction
774
- top_prob, top_idx = torch.max(clause_probs, dim=0)
775
- top_idx_int = int(top_idx)
776
- top_conf = float(top_prob)
777
-
778
- threshold = _CUAD_THRESHOLDS.get(top_idx_int, 0.40)
779
- if top_conf > threshold and top_idx_int < len(CUAD_LABELS):
780
- label = CUAD_LABELS[top_idx_int]
781
- conf = top_conf
782
- label, conf = _apply_guardrails(label, original_text, conf)
783
- if not (label == "Other" and conf < 0.3):
784
- risk = RISK_MAP.get(label, "LOW")
785
- results.append({
786
- "label": label,
787
- "confidence": round(conf, 3),
788
- "risk": risk,
789
- "description": DESC_MAP.get(label, label),
790
- "source": "ml",
791
- })
792
-
793
- # 2nd-best prediction
794
- sorted_probs, sorted_indices = torch.sort(clause_probs, descending=True)
795
- if len(sorted_probs) > 1:
796
- second_idx = int(sorted_indices[1])
797
- second_conf = float(sorted_probs[1])
798
- second_threshold = _CUAD_THRESHOLDS.get(second_idx, 0.40)
799
- if second_conf > second_threshold and second_idx < len(CUAD_LABELS):
800
- label2 = CUAD_LABELS[second_idx]
801
- conf2 = second_conf
802
- label2, conf2 = _apply_guardrails(label2, original_text, conf2)
803
- if not (label2 == "Other" and conf2 < 0.3):
804
- if not results or results[0]["label"] != label2:
805
- risk2 = RISK_MAP.get(label2, "LOW")
806
- results.append({
807
- "label": label2,
808
- "confidence": round(conf2, 3),
809
- "risk": risk2,
810
- "description": DESC_MAP.get(label2, label2),
811
- "source": "ml",
812
- })
813
-
814
- results.sort(key=lambda x: x["confidence"], reverse=True)
815
-
816
- if not results:
817
- results = _classify_regex(original_text)
818
-
819
- # Cache the result
820
- h = _text_hash(batch_texts[j][:512])
821
- _prediction_cache.put(h, results)
822
-
823
- # Update placeholder in all_results
824
- global_idx = uncached_indices[batch_start + j]
825
- for k, (idx, _) in enumerate(all_results):
826
- if idx == global_idx:
827
- all_results[k] = (idx, results)
828
- break
829
-
830
- except Exception as e:
831
- print(f"[ClauseGuard] Batch CUAD inference error: {e}")
832
- # Fallback to regex for this batch
833
- for j in range(len(batch_texts)):
834
- global_idx = uncached_indices[batch_start + j]
835
- results = _classify_regex(batch_original[j])
836
- for k, (idx, _) in enumerate(all_results):
837
- if idx == global_idx:
838
- all_results[k] = (idx, results)
839
- break
840
-
841
- return [r for _, r in sorted(all_results)]
842
-
843
- # FIX v4.1: Extended regex patterns to cover more CUAD categories
844
# Fallback classification table: maps each clause label to the regex patterns
# that identify it. Used by _classify_regex() when the ML classifier is
# unavailable or abstains. Patterns are compiled case-insensitively once at
# import time into _REGEX_PATTERNS_COMPILED; the first pattern that matches
# wins for a given label.
_REGEX_PATTERNS = {
    # ── Consumer ToS-style red flags ──
    "Limitation of liability": [r"not liable", r"shall not be (liable|responsible)", r"in no event.*liable", r"limitation of liability", r"without warranty", r"disclaim"],
    "Unilateral termination": [r"terminat.*at any time", r"suspend.*account.*without", r"we may (terminat|suspend|discontinu)", r"right to (terminat|suspend)"],
    "Unilateral change": [r"sole discretion", r"reserves? the right to (modify|change|update|amend)", r"at any time.*without (prior )?notice", r"we may (modify|change|update)"],
    "Content removal": [r"remove.*content.*without", r"right to remove", r"we may.*remove"],
    "Contract by using": [r"by (using|accessing).*you agree", r"continued use.*constitutes? acceptance"],
    "Choice of law": [r"governed by.*laws? of", r"shall be governed", r"laws of the state of"],
    "Jurisdiction": [r"exclusive jurisdiction", r"courts? of.*(california|delaware|new york|ireland|england)", r"submit to.*jurisdiction"],
    "Arbitration": [r"arbitrat", r"binding arbitration", r"waive.*right.*court", r"class action waiver"],
    # ── Core CUAD contract categories ──
    "Governing Law": [r"governed by", r"laws of", r"jurisdiction of"],
    "Termination for Convenience": [r"terminat.*for convenience", r"terminat.*without cause", r"terminat.*at any time"],
    "Non-Compete": [r"non-compete", r"shall not compete", r"competition restriction"],
    "Exclusivity": [r"exclusive(?:ly)?(?:\s+(?:deal|relationship|partner|right))", r"exclusivity"],
    "IP Ownership Assignment": [r"assign.*intellectual property", r"ownership of.*ip", r"all rights.*assign", r"work.?for.?hire"],
    "Uncapped Liability": [r"unlimited liability", r"uncapped", r"no.*limit.*liability"],
    "Cap on Liability": [r"cap on liability", r"maximum liability", r"liability.*shall not exceed", r"aggregate liability.*not exceed"],
    "Indemnification": [r"indemnif", r"hold harmless", r"defend.*against.*claim"],
    "Confidentiality": [r"confidential(?:ity)?", r"non-disclosure", r"\bnda\b"],
    "Force Majeure": [r"force majeure", r"act of god", r"beyond.*(?:reasonable\s+)?control"],
    "Penalties": [r"penalt(?:y|ies)", r"late fee", r"default charge", r"interest on overdue"],
    # FIX v4.1: Added missing regex patterns for more CUAD categories
    "Audit Rights": [r"audit rights?", r"right to audit", r"inspect.*records?", r"examination of.*records?", r"access to.*books"],
    "Warranty Duration": [r"warrant(?:y|ies).*(?:period|duration|term|months?|years?)", r"warranty.*shall.*(?:remain|last|continue)", r"limited warranty"],
    "Insurance": [r"(?:shall|must).*maintain.*insurance", r"insurance.*coverage", r"policy of insurance", r"certificate of insurance"],
    "Source Code Escrow": [r"source code escrow", r"escrow.*source code", r"escrow agent"],
    "Post-Termination Services": [r"post.?termination.*(?:service|obligation|support)", r"(?:after|following|upon).*termination.*(?:shall|must|will).*(?:provide|continue)"],
    "Renewal Term": [r"renew(?:al)?.*term", r"auto(?:matic(?:ally)?)?.*renew", r"successive.*(?:term|period)"],
    "Notice Period to Terminate Renewal": [r"notice.*(?:to\s+)?terminat.*renew", r"(?:days?|months?).*(?:prior|advance).*(?:notice|written).*(?:terminat|renew)", r"notice of non.?renewal"],
    "Change of Control": [r"change of control", r"change in.*(?:ownership|control)", r"merger.*acquisition", r"sale of.*(?:all|substantially).*assets"],
    "Anti-Assignment": [r"(?:shall|may)\s+not\s+assign", r"anti.?assignment", r"no.*assignment.*without.*consent"],
    "Revenue/Profit Sharing": [r"revenue.*shar", r"profit.*shar", r"royalt(?:y|ies)"],
    "Liquidated Damages": [r"liquidated.*damages?", r"pre.?determined.*damage", r"stipulated.*damage"],
    "Covenant Not to Sue": [r"covenant not to sue", r"(?:shall|agree).*not.*(?:bring|file|commence).*(?:action|claim|suit)"],
    "Joint IP Ownership": [r"joint(?:ly)?.*own(?:ed|ership)?.*(?:ip|intellectual property)", r"co.?own(?:ed|ership)?"],
    "License Grant": [r"(?:grant|license).*(?:non.?exclusive|exclusive|perpetual|irrevocable).*(?:license|right)", r"hereby grants?.*license"],
    "Non-Transferable License": [r"non.?transferable.*license", r"license.*(?:shall|may)\s+not.*(?:transfer|assign|sublicense)"],
    "ROFR/ROFO/ROFN": [r"right of first.*(?:refusal|offer|negotiation)", r"ROFR", r"ROFO", r"ROFN"],
    "No-Solicit of Customers": [r"(?:shall|must|agree).*not.*solicit.*customer", r"no.?solicit.*customer", r"non.?solicitation.*customer"],
    "No-Solicit of Employees": [r"(?:shall|must|agree).*not.*solicit.*employee", r"no.?solicit.*employee", r"non.?solicitation.*employee", r"no.?hire"],
    "Non-Disparagement": [r"non.?disparagement", r"(?:shall|must|agree).*not.*(?:disparag|defam|make.*negative)", r"not.*make.*derogatory"],
    "Most Favored Nation": [r"most favou?red.*nation", r"MFN", r"most favou?red.*(?:customer|pricing|terms)"],
    "Third Party Beneficiary": [r"third.?party.*beneficiar", r"no.*third.?party.*beneficiar"],
    "Minimum Commitment": [r"minimum.*(?:commitment|purchase|order|volume|spend)", r"(?:shall|must).*(?:purchase|order).*(?:at least|minimum|no less than)"],
    "Volume Restriction": [r"volume.*(?:restriction|limitation|cap|ceiling)", r"(?:shall|may).*not.*exceed.*(?:volume|quantity)"],
    "Price Restriction": [r"price.*(?:restriction|limitation|ceiling|cap|floor)", r"(?:shall|may).*not.*(?:increase|raise|exceed).*price"],
}
890
-
891
# FIX v4.2: Pre-compile regex patterns at module level (was recompiling per call).
# Built with a dict comprehension so the loop variables do not leak into the
# module namespace (the previous for-loop left `_label`/`_pats` behind).
_REGEX_PATTERNS_COMPILED = {
    label: [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
    for label, patterns in _REGEX_PATTERNS.items()
}
895
-
896
def _classify_regex(text):
    """Classify *text* by pattern matching — fallback when the ML model abstains.

    Returns one finding per matched label with ``confidence=None`` (a pattern
    match is not a calibrated probability, so none is faked).

    FIX: removed the dead ``seen`` set — the inner ``break`` after the first
    matching pattern means each label is appended at most once, so the
    membership check could never fire. Also dropped the redundant
    ``text.lower()``: every pattern in _REGEX_PATTERNS_COMPILED is compiled
    with re.IGNORECASE, so matching is already case-insensitive.
    """
    results = []
    for label, patterns in _REGEX_PATTERNS_COMPILED.items():
        for pat in patterns:
            if pat.search(text):
                results.append({
                    "label": label,
                    "confidence": None,
                    "risk": RISK_MAP.get(label, "MEDIUM"),
                    "description": DESC_MAP.get(label, label),
                    "source": "pattern",
                })
                break  # first matching pattern decides this label
    return results
916
-
917
- # ═══════════════════════════════════════════════════════════════════════
918
- # 6. LEGAL NER β€” ML model with regex fallback
919
- # FIX v4.1: Batch all chunks in single pipeline call
920
- # ═══════════════════════════════════════════════════════════════════════
921
-
922
def extract_entities(text):
    """Extract entities via the ML NER model (when loaded), supplemented by regex.

    Regex hits that do not overlap any ML span are merged in, then overlapping
    spans are resolved left-to-right, preferring the longer span at each start.
    """
    collected = []

    if _HAS_NER_MODEL and ner_pipeline is not None:
        try:
            # Overlapping 512-char windows every 450 chars, capped at 10k chars,
            # all submitted to the pipeline in ONE batched call.
            limit = min(len(text), 10000)
            starts = list(range(0, limit, 450))
            windows = [text[s:s + 512] for s in starts]

            batched = ner_pipeline(windows, batch_size=8)

            for window_start, window_hits in zip(starts, batched):
                for hit in window_hits:
                    if hit.get("score", 0) > 0.5:
                        collected.append({
                            "text": hit["word"],
                            "type": _map_ner_label(hit.get("entity_group", hit.get("entity", "MISC"))),
                            "start": hit["start"] + window_start,
                            "end": hit["end"] + window_start,
                            "score": round(hit["score"], 3),
                            "source": "ml",
                        })
        except Exception as e:
            print(f"[ClauseGuard] ML NER error, falling back to regex: {e}")
            collected = _extract_entities_regex(text)
    else:
        collected = _extract_entities_regex(text)

    # Supplement with regex patterns for things NER often misses; only keep
    # candidates whose character span does not touch an existing hit.
    covered = set()
    for ent in collected:
        covered.update(range(ent["start"], ent["end"]))
    for candidate in _extract_entities_regex(text):
        if all(pos not in covered for pos in range(candidate["start"], candidate["end"])):
            collected.append(candidate)

    # Resolve overlaps: sort by start (longest span first on ties), then keep
    # each entity whose start is past the previous kept entity's end.
    collected.sort(key=lambda ent: (ent["start"], -(ent["end"] - ent["start"])))
    kept = []
    cursor = -1
    for ent in collected:
        if ent["start"] >= cursor:
            kept.append(ent)
            cursor = ent["end"]
    return kept
973
-
974
- def _map_ner_label(label):
975
- label = label.upper()
976
- mapping = {
977
- "PER": "PERSON", "PERSON": "PERSON",
978
- "ORG": "PARTY", "ORGANIZATION": "PARTY",
979
- "LOC": "JURISDICTION", "LOCATION": "JURISDICTION",
980
- "GPE": "JURISDICTION", "DATE": "DATE",
981
- "MONEY": "MONEY", "MISC": "MISC", "LAW": "LEGAL_REF",
982
- }
983
- return mapping.get(label, label)
984
-
985
- def _extract_entities_regex(text):
986
- """Regex-based NER fallback."""
987
- entities = []
988
- patterns = [
989
- (r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b', "DATE"),
990
- (r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', "DATE"),
991
- (r'\b\d{1,2}-(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\d{2,4}\b', "DATE"),
992
- (r'\b(?:Effective|Commencement|Expiration|Termination)\s+Date\b', "DATE_REF"),
993
- (r'\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?(?:\s*(?:million|billion|thousand|M|B|K))?', "MONEY"),
994
- (r'\b\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s*(?:USD|EUR|GBP|dollars|euros|pounds)', "MONEY"),
995
- (r'\b(?:USD|EUR|GBP)\s*\d{1,3}(?:,\d{3})*(?:\.\d{2})?', "MONEY"),
996
- (r'\b\d+(?:\.\d+)?%', "PERCENTAGE"),
997
- (r'\b\d+\s*(?:year|month|week|day|business day)s?\b', "DURATION"),
998
- (r'\b[A-Z][A-Za-z0-9\s&,]+?(?:Inc\.?|LLC|Ltd\.?|Limited|Corp\.?|Corporation|PLC|GmbH|AG|S\.A\.?|B\.V\.?|L\.P\.?|LLP)\b', "PARTY"),
999
- (r'\b(?:Party A|Party B|Disclosing Party|Receiving Party|Licensor|Licensee|Buyer|Seller|Tenant|Landlord|Employer|Employee|Customer|Vendor|Client)\b', "PARTY_ROLE"),
1000
- (r'\b(?:State|Commonwealth)\s+of\s+[A-Z][a-zA-Z\s]+', "JURISDICTION"),
1001
- (r'\b(?:California|Delaware|New York|Texas|Florida|England|Ireland|Germany|France|Singapore|Hong Kong|Ontario|British Columbia)\b', "JURISDICTION"),
1002
- (r'"([A-Z][A-Za-z\s]{1,40})"', "DEFINED_TERM"),
1003
- (r'\((?:the\s+)?"([A-Z][A-Za-z\s]{1,40})"\)', "DEFINED_TERM"),
1004
- ]
1005
- for pat, etype in patterns:
1006
- for m in re.finditer(pat, text, re.IGNORECASE if etype in ("DATE", "MONEY", "DURATION", "PERCENTAGE") else 0):
1007
- txt = m.group(1) if m.lastindex else m.group()
1008
- entities.append({
1009
- "text": txt,
1010
- "type": etype,
1011
- "start": m.start(),
1012
- "end": m.end(),
1013
- "source": "pattern",
1014
- })
1015
- return entities
1016
-
1017
- # ═══════════════════════════════════════════════════════════════════════
1018
- # 7. NLI / CONTRADICTION DETECTION
1019
- # FIX v4.1: Pass (text_a, text_b) as dict with proper keys for
1020
- # cross-encoder pipeline, not [SEP]-concatenated string
1021
- # ═══════════════════════════════════════════════════════════════════════
1022
-
1023
def _run_nli(text_a, text_b):
    """Score a premise/hypothesis pair with the NLI cross-encoder.

    CrossEncoder.predict takes a list of (text_a, text_b) tuples and returns
    an array of shape (n_pairs, 3) with columns
    [contradiction, entailment, neutral]. Inputs are truncated to 256 chars.

    Returns a one-element list [{"label": ..., "score": ...}] for the
    highest-scoring class, or None if inference fails.
    """
    try:
        pair_scores = nli_model.predict([(text_a[:256], text_b[:256])])[0]
        classes = ["contradiction", "entailment", "neutral"]
        best = int(pair_scores.argmax())
        return [{"label": classes[best], "score": float(pair_scores[best])}]
    except Exception as e:
        print(f"[ClauseGuard] NLI inference error: {e}")
        return None
1039
-
1040
-
1041
# FIX: these tables were rebuilt — and the regexes re-wrapped in re.compile —
# on every detect_contradictions() call, contradicting the file's v4.2
# convention of pre-compiling all regex at module level. Hoisted here so they
# are constructed exactly once at import time. Behavior is unchanged.

# Label pairs that are mutually exclusive; checked via NLI when the model is
# loaded, or by simple co-occurrence in the heuristic fallback.
_NLI_CONFLICT_PAIRS = [
    ("Uncapped Liability", "Cap on Liability",
     "Liability cannot be both uncapped and capped simultaneously."),
    ("IP Ownership Assignment", "Joint IP Ownership",
     "IP cannot be both fully assigned and jointly owned."),
    ("Exclusivity", "Non-Transferable License",
     "Exclusivity and non-transferable license may conflict."),
]

_HEURISTIC_CONFLICT_PAIRS = [
    (["Uncapped Liability"], ["Cap on Liability"],
     "Liability cannot be both uncapped and capped simultaneously."),
    (["IP Ownership Assignment"], ["Joint IP Ownership"],
     "IP cannot be both fully assigned and jointly owned."),
]

# Labels whose repeated occurrences are cross-checked for internal conflicts.
_INTERNAL_CONFLICT_LABELS = ["Governing Law", "Termination for Convenience"]

# Clauses every contract is expected to contain; absence is flagged MEDIUM.
_REQUIRED_CLAUSE_PATTERNS = {
    "Governing Law": re.compile(
        r'govern(?:ed|ing).{0,15}law|applicable.{0,10}law|laws?\s+of\s+the\s+state',
        re.IGNORECASE
    ),
    "Limitation of liability": re.compile(
        r'limitation.{0,10}liabilit|cap.{0,10}liabilit|liabilit.{0,10}shall\s+not\s+exceed|in\s+no\s+event.{0,20}liable',
        re.IGNORECASE
    ),
    "Arbitration": re.compile(
        r'arbitrat|AAA|JAMS|binding.{0,10}dispute',
        re.IGNORECASE
    ),
    "Termination": re.compile(
        r'terminat(?:e|ion|ed)|cancel(?:lation)?',
        re.IGNORECASE
    ),
}


def detect_contradictions(clause_results, raw_text=""):
    """Find contradictory and missing clauses.

    Three detection layers:
      1. Semantic NLI via the cross-encoder (when loaded) over the known
         mutually-exclusive label pairs and over repeated same-label clauses.
      2. Heuristic label co-occurrence when the NLI model is unavailable.
      3. Structural scan of *raw_text* for absent critical clauses.

    Args:
        clause_results: classification findings; each dict needs "label" and
            optionally "text".
        raw_text: full document text for the missing-clause scan.

    Returns:
        De-duplicated list of dicts with "type", "explanation", "severity",
        "clauses", "source" (and "confidence" for NLI-model findings).
    """
    contradictions = []
    labels_found = set()
    clause_texts_by_label = defaultdict(list)

    for cr in clause_results:
        labels_found.add(cr["label"])
        clause_texts_by_label[cr["label"]].append(cr.get("text", ""))

    # ── 1. Semantic NLI (if model available) ──
    if _HAS_NLI_MODEL and nli_model is not None:
        for label_a, label_b, explanation in _NLI_CONFLICT_PAIRS:
            if label_a in labels_found and label_b in labels_found:
                texts_a = clause_texts_by_label[label_a]
                texts_b = clause_texts_by_label[label_b]
                # Cap at 2x2 pairings per conflict to bound inference cost.
                for ta in texts_a[:2]:
                    for tb in texts_b[:2]:
                        nli_result = _run_nli(ta, tb)
                        if nli_result is None:
                            continue
                        for r in (nli_result if isinstance(nli_result, list) else [nli_result]):
                            if r.get("label", "").lower() == "contradiction" and r.get("score", 0) > 0.6:
                                contradictions.append({
                                    "type": "CONTRADICTION",
                                    "explanation": explanation,
                                    "severity": "HIGH",
                                    "clauses": [label_a, label_b],
                                    "confidence": round(r["score"], 3),
                                    "source": "nli_model",
                                })

        # Internal contradictions within repeated governing-law / termination
        # clauses (each clause vs. up to the next two with the same label).
        for label in _INTERNAL_CONFLICT_LABELS:
            texts = clause_texts_by_label.get(label, [])
            if len(texts) >= 2:
                for i in range(len(texts)):
                    for j in range(i + 1, min(len(texts), i + 3)):
                        nli_result = _run_nli(texts[i], texts[j])
                        if nli_result is None:
                            continue
                        for r in (nli_result if isinstance(nli_result, list) else [nli_result]):
                            if r.get("label", "").lower() == "contradiction" and r.get("score", 0) > 0.6:
                                contradictions.append({
                                    "type": "CONTRADICTION",
                                    "explanation": f"Conflicting {label} provisions detected — clauses contradict each other.",
                                    "severity": "HIGH",
                                    "clauses": [label],
                                    "confidence": round(r["score"], 3),
                                    "source": "nli_model",
                                })
    else:
        # ── Heuristic fallback: flag mere co-occurrence of conflicting labels ──
        for group_a, group_b, explanation in _HEURISTIC_CONFLICT_PAIRS:
            found_a = any(lbl in labels_found for lbl in group_a)
            found_b = any(lbl in labels_found for lbl in group_b)
            if found_a and found_b:
                contradictions.append({
                    "type": "CONTRADICTION",
                    "explanation": explanation,
                    "severity": "HIGH",
                    "clauses": group_a + group_b,
                    "source": "heuristic",
                })

    # ── 2. Missing critical clauses ──
    for clause_name, pattern in _REQUIRED_CLAUSE_PATTERNS.items():
        if not pattern.search(raw_text):
            contradictions.append({
                "type": "MISSING",
                "explanation": f"No '{clause_name}' clause detected in the document.",
                "severity": "MEDIUM",
                "clauses": [clause_name],
                "source": "structural",
            })

    # De-duplicate on (type, explanation) while preserving first-seen order.
    seen = set()
    unique = []
    for c in contradictions:
        key = (c["type"], c["explanation"])
        if key not in seen:
            seen.add(key)
            unique.append(c)

    return unique
1165
-
1166
- # ═══════════════════════════════════════════════════════════════════════
1167
- # 8. RISK SCORING
1168
- # FIX v4.1: Absolute risk based on findings, not normalized by doc length
1169
- # ═══════════════════════════════════════════════════════════════════════
1170
-
1171
def compute_risk_score(clause_results, total_clauses):
    """Aggregate findings into a 0-100 risk score, letter grade, and breakdown.

    Args:
        clause_results: findings with a "risk" severity each.
        total_clauses: clause count; only used to short-circuit empty docs
            (per v4.1 the score is absolute, NOT normalized by length).

    Returns:
        (risk_score, letter_grade, severity_counts) tuple.
    """
    sev_counts = {"CRITICAL": 0, "HIGH": 0, "MEDIUM": 0, "LOW": 0}
    for cr in clause_results:
        sev = cr.get("risk", "LOW")
        # FIX: an unrecognized severity value (e.g. None or a typo from an
        # upstream map) previously raised KeyError; count it as LOW instead.
        if sev not in sev_counts:
            sev = "LOW"
        sev_counts[sev] += 1
    if total_clauses == 0:
        return 0, "A", sev_counts

    # FIX v4.1: Absolute risk — critical findings should always score high
    # regardless of document size. A 200-clause doc with 5 critical findings
    # is just as dangerous as a 10-clause doc with 5 critical findings.
    weighted = sum(sev_counts[s] * RISK_WEIGHTS[s] for s in sev_counts)

    # Diminishing returns formula: starts linear, flattens near 100
    # max theoretical = 100, one CRITICAL finding = ~30, two = ~48, five = ~72
    risk = min(100, round(100 * (1 - (1 / (1 + weighted / 30)))))

    if risk >= 70:
        grade = "F"
    elif risk >= 50:
        grade = "D"
    elif risk >= 30:
        grade = "C"
    elif risk >= 15:
        grade = "B"
    else:
        grade = "A"
    return risk, grade, sev_counts
1194
-
1195
- # ═══════════════════════════════════════════════════════════════════════
1196
- # 9. MAIN ANALYSIS PIPELINE
1197
- # ═══════════════════════════════════════════════════════════════════════
1198
-
1199
def analyze_contract(text):
    """Run the full analysis pipeline over raw contract text.

    Steps: clause splitting, batched CUAD classification, entity extraction,
    contradiction detection, risk scoring, obligation extraction, and
    compliance checks.

    Returns:
        (result_dict, None) on success, or (None, error_message) on failure.
    """
    if not text or len(text.strip()) < 50:
        return None, "Document too short (minimum 50 characters)"
    clauses = split_clauses(text)
    if not clauses:
        return None, "No clauses detected in document"

    # PERF v4.3: single batched forward pass instead of per-clause inference.
    findings = []
    for clause_text, predictions in zip(clauses, classify_cuad_batch(clauses, batch_size=8)):
        for pred in predictions or []:
            findings.append({
                "text": clause_text,
                "label": pred["label"],
                "confidence": pred["confidence"],
                "risk": pred["risk"],
                "description": pred["description"],
                "source": pred.get("source", "unknown"),
            })

    entities = extract_entities(text)
    contradictions = detect_contradictions(findings, text)
    score, grade, breakdown = compute_risk_score(findings, len(clauses))
    obligations = extract_obligations(text)
    compliance = check_compliance(text)

    # Hash over whitespace-normalized text so reformatting doesn't change it.
    normalized = re.sub(r'\s+', ' ', text.strip())
    metadata = {
        "analysis_date": datetime.now().isoformat(),
        "total_clauses": len(clauses),
        "flagged_clauses": len(findings),
        "unique_flagged": len({f["text"] for f in findings}),
        "model": get_model_status_text(),
        "text_hash": hashlib.sha256(normalized.encode()).hexdigest()[:16],
    }
    return {
        "metadata": metadata,
        "risk": {"score": score, "grade": grade, "breakdown": breakdown},
        "clauses": findings,
        "entities": entities,
        "contradictions": contradictions,
        "obligations": obligations,
        "compliance": compliance,
        "raw_text": text,
    }, None
1252
-
1253
- # ═══════════════════════════════════════════════════════════════════════
1254
- # 10. EXPORT FUNCTIONS
1255
- # ═══════════════════════════════════════════════════════════════════════
1256
-
1257
def export_json(result):
    """Serialize an analysis result as pretty-printed JSON (None passes through).

    Non-serializable values (e.g. datetimes) are stringified via default=str.
    """
    return None if result is None else json.dumps(result, indent=2, default=str)
1261
-
1262
def export_csv(result):
    """Render flagged clauses as CSV text: header row plus one row per finding.

    Clause text is truncated to 500 chars; pattern-only findings (no model
    confidence) show "pattern match" in the Confidence column.
    """
    if result is None:
        return None
    buffer = io.StringIO()
    writer = csv.writer(buffer)
    writer.writerow(["Clause Text", "Label", "Risk", "Confidence", "Description", "Source"])
    for finding in result.get("clauses", []):
        confidence = finding.get("confidence")
        writer.writerow([
            finding.get("text", "")[:500],
            finding.get("label", ""),
            finding.get("risk", ""),
            "pattern match" if confidence is None else f"{confidence:.3f}",
            finding.get("description", ""),
            finding.get("source", ""),
        ])
    return buffer.getvalue()
1280
-
1281
- # ═══════════════════════════════════════════════════════════════════════
1282
- # 11. UI RENDERING
1283
- # ═══════════════════════════════════════════════════════════════════════
1284
-
1285
def render_summary(result):
    """Render the risk summary card (score, grade, severity breakdown) as HTML.

    Returns "" when *result* is None so the UI panel clears cleanly.
    """
    if result is None:
        return ""
    risk = result["risk"]
    score = risk["score"]
    grade = risk["grade"]
    breakdown = risk["breakdown"]
    # Traffic-light palette keyed by letter grade (A=green … F=red); the
    # fallback gray should never be hit for grades produced by compute_risk_score.
    grade_color = {
        "A": "#16a34a", "B": "#65a30d", "C": "#ca8a04",
        "D": "#ea580c", "F": "#dc2626",
    }.get(grade, "#6b7280")
    crit, high, med, low = breakdown["CRITICAL"], breakdown["HIGH"], breakdown["MEDIUM"], breakdown["LOW"]
    html = f"""
<div style="font-family:system-ui,sans-serif;padding:16px;border:1px solid #e5e7eb;border-radius:12px;background:#fff;">
<div style="text-align:center;margin-bottom:16px;">
<div style="font-size:48px;font-weight:700;color:{grade_color};">{score}</div>
<div style="font-size:14px;color:#6b7280;">/100 Risk Score</div>
<div style="display:inline-block;margin-top:8px;padding:4px 16px;border-radius:20px;background:{grade_color};color:white;font-weight:600;font-size:14px;">
Grade {grade}
</div>
</div>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:8px;margin-bottom:12px;">
<div style="padding:8px;border-radius:6px;background:#fef2f2;text-align:center;">
<div style="font-size:20px;font-weight:700;color:#dc2626;">{crit}</div>
<div style="font-size:11px;color:#991b1b;">Critical</div>
</div>
<div style="padding:8px;border-radius:6px;background:#fff7ed;text-align:center;">
<div style="font-size:20px;font-weight:700;color:#ea580c;">{high}</div>
<div style="font-size:11px;color:#9a3412;">High</div>
</div>
<div style="padding:8px;border-radius:6px;background:#fefce8;text-align:center;">
<div style="font-size:20px;font-weight:700;color:#ca8a04;">{med}</div>
<div style="font-size:11px;color:#854d0e;">Medium</div>
</div>
<div style="padding:8px;border-radius:6px;background:#f0fdf4;text-align:center;">
<div style="font-size:20px;font-weight:700;color:#16a34a;">{low}</div>
<div style="font-size:11px;color:#166534;">Low</div>
</div>
</div>
<div style="font-size:12px;color:#6b7280;text-align:center;">
{result['metadata']['total_clauses']} clauses analyzed · {result['metadata']['flagged_clauses']} flagged
<br><span style="font-size:10px;">{result['metadata']['model']}</span>
</div>
</div>
"""
    return html
1331
-
1332
def render_clause_cards(result):
    """Render one HTML card per flagged clause, grouped by clause text.

    Returns "" for a None result and a placeholder div when nothing is flagged.
    """
    if result is None:
        return ""
    clauses = result.get("clauses", [])
    if not clauses:
        return '<div style="padding:24px;text-align:center;color:#6b7280;">No clauses detected.</div>'
    # A clause can carry several labels; group findings by clause text so
    # each snippet renders once with all of its tags attached.
    grouped = defaultdict(list)
    for cr in clauses:
        grouped[cr["text"]].append(cr)
    html = '<div style="font-family:system-ui,sans-serif;">'
    for text, items in grouped.items():
        # Card accent color follows the highest severity among this clause's findings.
        max_risk = max(items, key=lambda x: {"CRITICAL":4,"HIGH":3,"MEDIUM":2,"LOW":1}[x["risk"]])["risk"]
        border, bg, icon = RISK_STYLES[max_risk]
        tags = ""
        for item in items:
            tag_bg = RISK_STYLES[item["risk"]][1]
            tag_color = RISK_STYLES[item["risk"]][0]
            conf = item.get("confidence")
            source = item.get("source", "")
            # Pattern-matched findings have no model confidence to display.
            if conf is not None:
                conf_text = f"{conf:.0%}"
            else:
                conf_text = "pattern"
            source_icon = "🤖" if source == "ml" else "📝"
            tags += f'<span style="background:{tag_bg};color:{tag_color};border:1px solid {tag_color}33;padding:2px 8px;border-radius:12px;font-size:11px;font-weight:500;margin-right:4px;">{source_icon} {item["label"]} ({conf_text})</span>'
        descs = "".join(
            f'<p style="font-size:12px;color:#6b7280;margin:4px 0 0 0;">{item["description"]}</p>'
            for item in items
        )
        # Truncate long clauses and escape angle brackets before embedding in HTML.
        preview = text[:300] + ("..." if len(text) > 300 else "")
        preview = preview.replace("<", "&lt;").replace(">", "&gt;")
        html += f"""
<div style="border:1px solid #e5e7eb;border-left:4px solid {border};border-radius:8px;padding:14px;margin-bottom:10px;background:#fafafa;">
<div style="display:flex;align-items:center;gap:6px;margin-bottom:6px;">
<span style="font-size:16px;">{icon}</span>
<span style="font-size:12px;font-weight:600;color:{border};text-transform:uppercase;">{max_risk}</span>
</div>
<p style="font-size:13px;color:#374151;line-height:1.6;margin:0 0 8px 0;">{preview}</p>
<div style="margin-bottom:6px;">{tags}</div>
{descs}
</div>
"""
    html += "</div>"
    return html
1376
-
1377
def render_entities(result):
    """Render detected entities grouped by type as colored chips (HTML)."""
    if result is None:
        return ""
    entities = result.get("entities", [])
    if not entities:
        return '<div style="padding:16px;color:#6b7280;">No entities detected.</div>'

    type_colors = {
        "DATE": "#3b82f6", "DATE_REF": "#60a5fa",
        "MONEY": "#22c55e", "PERCENTAGE": "#10b981",
        "DURATION": "#6366f1",
        "PARTY": "#8b5cf6", "PARTY_ROLE": "#a78bfa",
        "PERSON": "#ec4899",
        "JURISDICTION": "#f59e0b",
        "DEFINED_TERM": "#ec4899",
        "LEGAL_REF": "#6b7280",
        "MISC": "#9ca3af",
    }

    by_type = defaultdict(list)
    for ent in entities:
        by_type[ent["type"]].append(ent["text"])

    sections = ['<div style="font-family:system-ui,sans-serif;">']
    for entity_type, values in by_type.items():
        # De-duplicate preserving first-seen order; cap each group at 20 chips.
        deduped = list(dict.fromkeys(values))[:20]
        color = type_colors.get(entity_type, "#6b7280")
        chips = "".join(
            f'<span style="display:inline-block;background:{color}15;color:{color};border:1px solid {color}40;padding:3px 10px;border-radius:6px;font-size:12px;margin:3px;">{value}</span>'
            for value in deduped
        )
        sections.append(f"""
<div style="margin-bottom:12px;">
<div style="font-size:12px;font-weight:600;color:#374151;margin-bottom:6px;text-transform:uppercase;">{entity_type}</div>
<div>{chips}</div>
</div>
""")
    sections.append("</div>")
    return "".join(sections)
1412
-
1413
def render_contradictions(result):
    """Render contradiction / missing-clause findings as HTML cards.

    Returns "" for a None result and a green all-clear message when the
    findings list is empty.
    """
    if result is None:
        return ""
    contradictions = result.get("contradictions", [])
    if not contradictions:
        return '<div style="padding:16px;color:#16a34a;">✓ No contradictions or missing clauses detected.</div>'
    html = '<div style="font-family:system-ui,sans-serif;">'
    for c in contradictions:
        sev_color = RISK_STYLES[c["severity"]][0]
        # ⚠️ for contradictions, 📋 for structurally missing clauses.
        icon = "⚠️" if c["type"] == "CONTRADICTION" else "📋"
        source = c.get("source", "")
        source_badge = ""
        # Badge distinguishes NLI-model findings (with confidence) from heuristics.
        if source == "nli_model":
            conf = c.get("confidence", 0)
            source_badge = f'<span style="font-size:10px;background:#eff6ff;color:#3b82f6;padding:1px 6px;border-radius:4px;margin-left:8px;">🤖 NLI {conf:.0%}</span>'
        elif source == "heuristic":
            source_badge = '<span style="font-size:10px;background:#fef3c7;color:#92400e;padding:1px 6px;border-radius:4px;margin-left:8px;">📝 Heuristic</span>'
        html += f"""
<div style="border:1px solid #e5e7eb;border-left:4px solid {sev_color};border-radius:8px;padding:12px;margin-bottom:8px;background:#fafafa;">
<div style="display:flex;align-items:center;gap:6px;margin-bottom:4px;">
<span>{icon}</span>
<span style="font-size:12px;font-weight:600;color:{sev_color};">{c["type"]}</span>
{source_badge}
</div>
<p style="font-size:13px;color:#374151;margin:0;">{c["explanation"]}</p>
</div>
"""
    html += "</div>"
    return html
1442
-
1443
def _escape_html(fragment):
    """Escape &, < and > for safe embedding in the viewer markup.

    FIX: the previous inline escaping handled only '<' and '>', so a literal
    '&' in the contract text could be misread by the browser as the start of
    an HTML entity. '&' must be replaced FIRST, otherwise the other escapes
    would be double-escaped.
    """
    return fragment.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")


def render_document_viewer(result):
    """Render the full contract text with entity spans highlighted inline.

    Entities are processed in start order; any span overlapping an
    already-rendered one is skipped.
    """
    if result is None:
        return ""
    text = result.get("raw_text", "")
    entities = sorted(result.get("entities", []), key=lambda x: x["start"])
    entity_colors = {
        "DATE": "#3b82f6", "DATE_REF": "#60a5fa", "MONEY": "#22c55e",
        "PERCENTAGE": "#10b981", "DURATION": "#6366f1", "PARTY": "#8b5cf6",
        "PARTY_ROLE": "#a78bfa", "PERSON": "#ec4899", "JURISDICTION": "#f59e0b",
        "DEFINED_TERM": "#ec4899", "LEGAL_REF": "#6b7280", "MISC": "#9ca3af",
    }
    html_parts = []
    last_end = 0
    for e in entities:
        if e["start"] >= last_end:
            # Plain text between the previous span and this entity.
            html_parts.append(_escape_html(text[last_end:e["start"]]))
            color = entity_colors.get(e["type"], "#6b7280")
            entity_text = _escape_html(text[e["start"]:e["end"]])
            html_parts.append(
                f'<span style="background:{color}20;color:{color};border-bottom:2px solid {color};padding:0 2px;border-radius:2px;" '
                f'title="{e["type"]}">{entity_text}</span>'
            )
            last_end = e["end"]
    if last_end < len(text):
        html_parts.append(_escape_html(text[last_end:]))
    return f'<div style="font-family:ui-monospace,monospace;font-size:13px;line-height:1.8;white-space:pre-wrap;padding:16px;">{"".join(html_parts)}</div>'
 
1
+ ${file:/app/app.py}