Spaces:
Sleeping
Sleeping
fix: upload actual app.py content with all v4.1 fixes
Browse files
app.py
CHANGED
|
@@ -1 +1,1269 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ClauseGuard β World's Best Legal Contract Analysis Tool (v4.1)
|
| 3 |
+
βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
Fixes in v4.1:
|
| 5 |
+
β’ FIX: Bounded LRU caches (chunk_cache, prediction_cache) β no more memory leaks
|
| 6 |
+
β’ FIX: NLI input format β pass (text_a, text_b) tuple, not [SEP]-concatenated string
|
| 7 |
+
β’ FIX: Classifier max_length raised to 512 (was 256 β truncating legal clauses)
|
| 8 |
+
β’ FIX: Risk score formula β absolute risk, not normalized by total_clauses
|
| 9 |
+
β’ FIX: Train/inference alignment β use softmax+argmax for single-label model
|
| 10 |
+
β’ FIX: Added missing regex fallback patterns for more CUAD categories
|
| 11 |
+
β’ FIX: Entity extraction batching β single pipeline call instead of sequential
|
| 12 |
+
β’ PERF: Shared model singleton via models.py module
|
| 13 |
+
β’ PERF: LRU-bounded caches everywhere
|
| 14 |
+
|
| 15 |
+
Carried from v4.0:
|
| 16 |
+
β’ OCR support for scanned PDFs (docTR engine with smart native/scanned routing)
|
| 17 |
+
β’ Contract Q&A Chatbot (RAG: embedding retrieval + HF Inference API streaming)
|
| 18 |
+
β’ Clause Redlining (3-tier: template lookup + RAG + LLM refinement)
|
| 19 |
+
β’ Fixed CUAD label mapping (added missing index 6)
|
| 20 |
+
β’ Structure-aware clause splitting
|
| 21 |
+
β’ Real NLI contradiction detection via cross-encoder model
|
| 22 |
+
β’ ML-based Legal NER with regex fallback
|
| 23 |
+
β’ Semantic compliance checking with negation handling
|
| 24 |
+
β’ Improved obligation extraction with false-positive filtering
|
| 25 |
+
β’ LLM-powered clause explanations
|
| 26 |
+
β’ Per-session temp files (no collision)
|
| 27 |
+
β’ Model health reporting
|
| 28 |
+
|
| 29 |
+
Models:
|
| 30 |
+
β’ Clause classifier: Mokshith31/legalbert-contract-clause-classification
|
| 31 |
+
(LoRA adapter on nlpaueb/legal-bert-base-uncased, 41 CUAD classes)
|
| 32 |
+
β’ Legal NER: matterstack/legal-bert-ner (token classification)
|
| 33 |
+
β’ NLI: cross-encoder/nli-deberta-v3-base (contradiction detection)
|
| 34 |
+
β’ Embeddings: sentence-transformers/all-MiniLM-L6-v2 (RAG retrieval)
|
| 35 |
+
β’ OCR: docTR fast_base + crnn_vgg16_bn (scanned PDF extraction)
|
| 36 |
+
β’ LLM: Qwen/Qwen2.5-7B-Instruct via HF Inference API (chatbot + redlining)
|
| 37 |
+
"""
|
| 38 |
+
|
| 39 |
+
import os
|
| 40 |
+
import re
|
| 41 |
+
import json
|
| 42 |
+
import csv
|
| 43 |
+
import io
|
| 44 |
+
import uuid
|
| 45 |
+
import tempfile
|
| 46 |
+
import hashlib
|
| 47 |
+
from collections import defaultdict, OrderedDict
|
| 48 |
+
from datetime import datetime
|
| 49 |
+
from functools import lru_cache
|
| 50 |
+
|
| 51 |
+
import gradio as gr
|
| 52 |
+
import numpy as np
|
| 53 |
+
|
| 54 |
+
# ββ Document parsers (soft-fail) ββββββββββββββββββββββββββββββββββββ
|
| 55 |
+
try:
|
| 56 |
+
import pdfplumber
|
| 57 |
+
_HAS_PDF = True
|
| 58 |
+
except Exception:
|
| 59 |
+
_HAS_PDF = False
|
| 60 |
+
|
| 61 |
+
try:
|
| 62 |
+
from docx import Document as DocxDocument
|
| 63 |
+
_HAS_DOCX = True
|
| 64 |
+
except Exception:
|
| 65 |
+
_HAS_DOCX = False
|
| 66 |
+
|
| 67 |
+
# ββ PyTorch / Transformers (soft-fail) ββββββββββββββββββββββββββββββββ
|
| 68 |
+
_HAS_TORCH = False
|
| 69 |
+
_HAS_NER_MODEL = False
|
| 70 |
+
_HAS_NLI_MODEL = False
|
| 71 |
+
|
| 72 |
+
try:
|
| 73 |
+
import torch
|
| 74 |
+
from transformers import (
|
| 75 |
+
AutoTokenizer, AutoModelForSequenceClassification,
|
| 76 |
+
AutoModelForTokenClassification, pipeline
|
| 77 |
+
)
|
| 78 |
+
from peft import PeftModel
|
| 79 |
+
_HAS_TORCH = True
|
| 80 |
+
except Exception:
|
| 81 |
+
pass
|
| 82 |
+
|
| 83 |
+
# ββ Import submodules βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 84 |
+
from compare import compare_contracts, render_comparison_html
|
| 85 |
+
from obligations import extract_obligations, render_obligations_html
|
| 86 |
+
from compliance import check_compliance, render_compliance_html
|
| 87 |
+
from ocr_engine import parse_pdf_smart, get_ocr_status
|
| 88 |
+
from chatbot import index_contract, chat_respond, get_chatbot_status
|
| 89 |
+
from redlining import generate_redlines, render_redlines_html
|
| 90 |
+
|
| 91 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 92 |
+
# 1. CONFIGURATION β FIXED label mapping (41 labels, index 6 restored)
|
| 93 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 94 |
+
|
| 95 |
+
CUAD_LABELS = [
|
| 96 |
+
"Document Name", # 0
|
| 97 |
+
"Parties", # 1
|
| 98 |
+
"Agreement Date", # 2
|
| 99 |
+
"Effective Date", # 3
|
| 100 |
+
"Expiration Date", # 4
|
| 101 |
+
"Renewal Term", # 5
|
| 102 |
+
"Notice Period to Terminate Renewal", # 6 β WAS MISSING
|
| 103 |
+
"Governing Law", # 7
|
| 104 |
+
"Most Favored Nation", # 8
|
| 105 |
+
"Non-Compete", # 9
|
| 106 |
+
"Exclusivity", # 10
|
| 107 |
+
"No-Solicit of Customers", # 11
|
| 108 |
+
"No-Solicit of Employees", # 12
|
| 109 |
+
"Non-Disparagement", # 13
|
| 110 |
+
"Termination for Convenience", # 14
|
| 111 |
+
"ROFR/ROFO/ROFN", # 15
|
| 112 |
+
"Change of Control", # 16
|
| 113 |
+
"Anti-Assignment", # 17
|
| 114 |
+
"Revenue/Profit Sharing", # 18
|
| 115 |
+
"Price Restriction", # 19
|
| 116 |
+
"Minimum Commitment", # 20
|
| 117 |
+
"Volume Restriction", # 21
|
| 118 |
+
"IP Ownership Assignment", # 22
|
| 119 |
+
"Joint IP Ownership", # 23
|
| 120 |
+
"License Grant", # 24
|
| 121 |
+
"Non-Transferable License", # 25
|
| 122 |
+
"Affiliate License-Licensor", # 26
|
| 123 |
+
"Affiliate License-Licensee", # 27
|
| 124 |
+
"Unlimited/All-You-Can-Eat License", # 28
|
| 125 |
+
"Irrevocable or Perpetual License", # 29
|
| 126 |
+
"Source Code Escrow", # 30
|
| 127 |
+
"Post-Termination Services", # 31
|
| 128 |
+
"Audit Rights", # 32
|
| 129 |
+
"Uncapped Liability", # 33
|
| 130 |
+
"Cap on Liability", # 34
|
| 131 |
+
"Liquidated Damages", # 35
|
| 132 |
+
"Warranty Duration", # 36
|
| 133 |
+
"Insurance", # 37
|
| 134 |
+
"Covenant Not to Sue", # 38
|
| 135 |
+
"Third Party Beneficiary", # 39
|
| 136 |
+
"Other", # 40
|
| 137 |
+
]
|
| 138 |
+
|
| 139 |
+
_UNFAIR_LABELS = [
|
| 140 |
+
"Limitation of liability", "Unilateral termination", "Unilateral change",
|
| 141 |
+
"Content removal", "Contract by using", "Choice of law",
|
| 142 |
+
"Jurisdiction", "Arbitration"
|
| 143 |
+
]
|
| 144 |
+
|
| 145 |
+
_ALL_LABELS = CUAD_LABELS + _UNFAIR_LABELS
|
| 146 |
+
|
| 147 |
+
RISK_MAP = {
|
| 148 |
+
# Critical
|
| 149 |
+
"Uncapped Liability": "CRITICAL",
|
| 150 |
+
"Arbitration": "CRITICAL",
|
| 151 |
+
"IP Ownership Assignment": "CRITICAL",
|
| 152 |
+
"Termination for Convenience": "CRITICAL",
|
| 153 |
+
"Limitation of liability": "CRITICAL",
|
| 154 |
+
"Unilateral termination": "CRITICAL",
|
| 155 |
+
"Liquidated Damages": "CRITICAL",
|
| 156 |
+
# High
|
| 157 |
+
"Non-Compete": "HIGH",
|
| 158 |
+
"Exclusivity": "HIGH",
|
| 159 |
+
"Change of Control": "HIGH",
|
| 160 |
+
"No-Solicit of Customers": "HIGH",
|
| 161 |
+
"No-Solicit of Employees": "HIGH",
|
| 162 |
+
"Unilateral change": "HIGH",
|
| 163 |
+
"Content removal": "HIGH",
|
| 164 |
+
"Anti-Assignment": "HIGH",
|
| 165 |
+
"Notice Period to Terminate Renewal": "HIGH",
|
| 166 |
+
# Medium
|
| 167 |
+
"Governing Law": "MEDIUM",
|
| 168 |
+
"Jurisdiction": "MEDIUM",
|
| 169 |
+
"Choice of law": "MEDIUM",
|
| 170 |
+
"Price Restriction": "MEDIUM",
|
| 171 |
+
"Minimum Commitment": "MEDIUM",
|
| 172 |
+
"Volume Restriction": "MEDIUM",
|
| 173 |
+
"Non-Disparagement": "MEDIUM",
|
| 174 |
+
"Most Favored Nation": "MEDIUM",
|
| 175 |
+
"Revenue/Profit Sharing": "MEDIUM",
|
| 176 |
+
"Warranty Duration": "MEDIUM",
|
| 177 |
+
# Low
|
| 178 |
+
"Document Name": "LOW",
|
| 179 |
+
"Parties": "LOW",
|
| 180 |
+
"Agreement Date": "LOW",
|
| 181 |
+
"Effective Date": "LOW",
|
| 182 |
+
"Expiration Date": "LOW",
|
| 183 |
+
"Renewal Term": "LOW",
|
| 184 |
+
"Joint IP Ownership": "LOW",
|
| 185 |
+
"License Grant": "LOW",
|
| 186 |
+
"Non-Transferable License": "LOW",
|
| 187 |
+
"Affiliate License-Licensor": "LOW",
|
| 188 |
+
"Affiliate License-Licensee": "LOW",
|
| 189 |
+
"Unlimited/All-You-Can-Eat License": "LOW",
|
| 190 |
+
"Irrevocable or Perpetual License": "LOW",
|
| 191 |
+
"Source Code Escrow": "LOW",
|
| 192 |
+
"Post-Termination Services": "LOW",
|
| 193 |
+
"Audit Rights": "LOW",
|
| 194 |
+
"Cap on Liability": "LOW",
|
| 195 |
+
"Insurance": "LOW",
|
| 196 |
+
"Covenant Not to Sue": "LOW",
|
| 197 |
+
"Third Party Beneficiary": "LOW",
|
| 198 |
+
"Other": "LOW",
|
| 199 |
+
"ROFR/ROFO/ROFN": "LOW",
|
| 200 |
+
"Contract by using": "LOW",
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
+
DESC_MAP = {label: label.replace("_", " ") for label in _ALL_LABELS}
|
| 204 |
+
DESC_MAP.update({
|
| 205 |
+
"Limitation of liability": "Company limits or excludes liability for losses, data breaches, or service failures.",
|
| 206 |
+
"Unilateral termination": "Company can terminate your account at any time without reason.",
|
| 207 |
+
"Unilateral change": "Company can change terms at any time without your consent.",
|
| 208 |
+
"Content removal": "Company can delete your content without notice or justification.",
|
| 209 |
+
"Contract by using": "You are bound to the contract simply by using the service.",
|
| 210 |
+
"Choice of law": "Governing law may differ from your country, reducing your legal protections.",
|
| 211 |
+
"Jurisdiction": "Disputes must be resolved in a jurisdiction that may disadvantage you.",
|
| 212 |
+
"Arbitration": "Forces disputes to arbitration instead of court. You waive your right to sue.",
|
| 213 |
+
"Uncapped Liability": "No financial limit on damages the party may be liable for.",
|
| 214 |
+
"Cap on Liability": "Maximum financial liability is explicitly capped.",
|
| 215 |
+
"Non-Compete": "Restrictions on competing with the counter-party.",
|
| 216 |
+
"Exclusivity": "Obligation to deal exclusively with one party.",
|
| 217 |
+
"IP Ownership Assignment": "Intellectual property rights are transferred entirely.",
|
| 218 |
+
"Termination for Convenience": "Either party may terminate without cause or notice.",
|
| 219 |
+
"Governing Law": "Specifies which jurisdiction's laws apply.",
|
| 220 |
+
"Non-Disparagement": "Agreement not to speak negatively about the other party.",
|
| 221 |
+
"ROFR/ROFO/ROFN": "Right of First Refusal / Offer / Negotiation clause.",
|
| 222 |
+
"Change of Control": "Provisions triggered by ownership or control changes.",
|
| 223 |
+
"Anti-Assignment": "Restrictions on transferring contract rights to third parties.",
|
| 224 |
+
"Liquidated Damages": "Pre-determined damages amount for breach of contract.",
|
| 225 |
+
"Source Code Escrow": "Third-party holds source code for release under defined conditions.",
|
| 226 |
+
"Post-Termination Services": "Services to be provided after the contract ends.",
|
| 227 |
+
"Audit Rights": "Right to inspect records or verify compliance.",
|
| 228 |
+
"Warranty Duration": "Length of time warranties remain in effect.",
|
| 229 |
+
"Covenant Not to Sue": "Agreement not to bring legal action against a party.",
|
| 230 |
+
"Third Party Beneficiary": "Non-party who benefits from the contract terms.",
|
| 231 |
+
"Insurance": "Insurance coverage requirements.",
|
| 232 |
+
"Revenue/Profit Sharing": "Revenue or profit sharing arrangements between parties.",
|
| 233 |
+
"Price Restriction": "Restrictions on pricing or discounting.",
|
| 234 |
+
"Minimum Commitment": "Minimum purchase or usage commitment.",
|
| 235 |
+
"Volume Restriction": "Limits on volume of goods or services.",
|
| 236 |
+
"License Grant": "Permission to use intellectual property.",
|
| 237 |
+
"Non-Transferable License": "License that cannot be transferred to third parties.",
|
| 238 |
+
"Irrevocable or Perpetual License": "License that cannot be revoked or lasts indefinitely.",
|
| 239 |
+
"Unlimited/All-You-Can-Eat License": "License with no usage limits.",
|
| 240 |
+
"Notice Period to Terminate Renewal": "Required notice period before automatic renewal.",
|
| 241 |
+
})
|
| 242 |
+
|
| 243 |
+
RISK_WEIGHTS = {"CRITICAL": 40, "HIGH": 20, "MEDIUM": 10, "LOW": 3}
|
| 244 |
+
|
| 245 |
+
RISK_STYLES = {
|
| 246 |
+
"CRITICAL": ("#dc2626", "#fef2f2", "β οΈ"),
|
| 247 |
+
"HIGH": ("#ea580c", "#fff7ed", "β‘"),
|
| 248 |
+
"MEDIUM": ("#ca8a04", "#fefce8", "π"),
|
| 249 |
+
"LOW": ("#16a34a", "#f0fdf4", "β"),
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 253 |
+
# FIX v4.1: Per-class thresholds aligned with single-label softmax
|
| 254 |
+
# The model was trained with cross-entropy (single-label), so inference
|
| 255 |
+
# now uses softmax+argmax, not sigmoid. Thresholds apply to softmax probs.
|
| 256 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 257 |
+
_CUAD_THRESHOLDS = {}
|
| 258 |
+
_WEAK_CLASSES = {0, 1, 2, 7, 9, 21, 22, 27, 37, 38}
|
| 259 |
+
for _i in range(41):
|
| 260 |
+
if _i in _WEAK_CLASSES:
|
| 261 |
+
_CUAD_THRESHOLDS[_i] = 0.85 # Only flag if very confident (these classes are unreliable)
|
| 262 |
+
else:
|
| 263 |
+
_CUAD_THRESHOLDS[_i] = 0.40 # Reasonable threshold for softmax outputs
|
| 264 |
+
|
| 265 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 266 |
+
# FIX v4.1: Bounded LRU Cache utility (replaces unbounded dicts)
|
| 267 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 268 |
+
|
| 269 |
+
class BoundedCache:
    """Thread-safe bounded LRU cache using OrderedDict.

    FIX: the docstring promised thread-safety, but the original
    implementation held no lock — concurrent Gradio worker threads could
    corrupt the OrderedDict mid-mutation (move_to_end/popitem are not
    atomic). All operations now hold an RLock.

    Eviction policy: least-recently-used entry is dropped when a new key
    would exceed ``maxsize``. ``get`` and overwriting ``put`` both refresh
    recency.
    """

    def __init__(self, maxsize=1000):
        # Local import keeps the class self-contained in this module's
        # soft-fail import style (threading is stdlib, always available).
        import threading
        self._cache = OrderedDict()   # insertion order == recency order
        self._maxsize = maxsize       # hard bound on number of entries
        self._lock = threading.RLock()

    def get(self, key, default=None):
        """Return the cached value for *key* (marking it most-recently-used),
        or *default* if absent."""
        with self._lock:
            if key in self._cache:
                self._cache.move_to_end(key)  # refresh recency
                return self._cache[key]
            return default

    def put(self, key, value):
        """Insert or overwrite *key*; evict the LRU entry when at capacity."""
        with self._lock:
            if key in self._cache:
                self._cache.move_to_end(key)
                self._cache[key] = value
            else:
                if len(self._cache) >= self._maxsize:
                    # last=False pops the oldest (least recently used) entry.
                    self._cache.popitem(last=False)
                self._cache[key] = value

    def __contains__(self, key):
        # NOTE: membership test does NOT refresh recency (matches original).
        with self._lock:
            return key in self._cache

    def __len__(self):
        with self._lock:
            return len(self._cache)
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 298 |
+
# 2. MODEL LOADING
|
| 299 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 300 |
+
|
| 301 |
+
cuad_tokenizer = None
|
| 302 |
+
cuad_model = None
|
| 303 |
+
ner_pipeline = None
|
| 304 |
+
nli_pipeline = None
|
| 305 |
+
_model_status = {"cuad": "not_loaded", "ner": "not_loaded", "nli": "not_loaded"}
|
| 306 |
+
|
| 307 |
+
def _load_cuad_model():
    """Load the CUAD clause classifier (LoRA adapter on Legal-BERT).

    Soft-fails: on any error both globals are reset to None so
    classify_cuad() takes the regex fallback path, and the failure
    reason is recorded in _model_status for health reporting.
    """
    global cuad_tokenizer, cuad_model, _model_status
    if not _HAS_TORCH:
        print("[ClauseGuard] PyTorch not available — using regex fallback")
        _model_status["cuad"] = "unavailable"
        return
    try:
        base = "nlpaueb/legal-bert-base-uncased"
        adapter = "Mokshith31/legalbert-contract-clause-classification"
        print(f"[ClauseGuard] Loading CUAD classifier: {adapter}")
        # Tokenizer comes from the base checkpoint; the adapter repo only
        # holds the LoRA weights.
        cuad_tokenizer = AutoTokenizer.from_pretrained(base)
        # num_labels=41 replaces the pretrained head, so a size mismatch
        # with the base checkpoint is expected and silenced.
        base_model = AutoModelForSequenceClassification.from_pretrained(
            base, num_labels=41, ignore_mismatched_sizes=True
        )
        # Attach the fine-tuned LoRA adapter on top of the base model.
        cuad_model = PeftModel.from_pretrained(base_model, adapter)
        cuad_model.eval()  # inference mode: disables dropout
        _model_status["cuad"] = "loaded"
        print("[ClauseGuard] CUAD model loaded successfully")
    except Exception as e:
        print(f"[ClauseGuard] CUAD model load failed: {e}")
        # Reset globals so classify_cuad() falls back to regex matching.
        cuad_tokenizer = None
        cuad_model = None
        _model_status["cuad"] = f"failed: {e}"
|
| 330 |
+
|
| 331 |
+
def _load_ner_model():
    """Load the Legal NER token-classification pipeline on CPU.

    Soft-fails: if loading raises, entity extraction uses the regex
    fallback instead and the reason is recorded in _model_status.
    """
    global ner_pipeline, _model_status, _HAS_NER_MODEL
    if not _HAS_TORCH:
        _model_status["ner"] = "unavailable"
        return
    try:
        print("[ClauseGuard] Loading Legal NER model: matterstack/legal-bert-ner")
        ner_pipeline = pipeline(
            "ner",
            model="matterstack/legal-bert-ner",
            # Merge wordpiece sub-tokens into whole entity spans.
            aggregation_strategy="simple",
            device=-1,  # CPU
        )
        _HAS_NER_MODEL = True
        _model_status["ner"] = "loaded"
        print("[ClauseGuard] Legal NER model loaded successfully")
    except Exception as e:
        print(f"[ClauseGuard] Legal NER model load failed (using regex fallback): {e}")
        _model_status["ner"] = f"failed: {e}"
|
| 350 |
+
|
| 351 |
+
def _load_nli_model():
    """Load the NLI cross-encoder used for contradiction detection (CPU).

    Soft-fails: if loading raises, contradiction detection falls back to
    heuristics and the reason is recorded in _model_status.
    """
    global nli_pipeline, _model_status, _HAS_NLI_MODEL
    if not _HAS_TORCH:
        _model_status["nli"] = "unavailable"
        return
    try:
        print("[ClauseGuard] Loading NLI model: cross-encoder/nli-deberta-v3-base")
        # Cross-encoder exposed as a text-classification pipeline; callers
        # pass sentence pairs (see v4.1 fix notes in the module docstring).
        nli_pipeline = pipeline(
            "text-classification",
            model="cross-encoder/nli-deberta-v3-base",
            device=-1,  # CPU
        )
        _HAS_NLI_MODEL = True
        _model_status["nli"] = "loaded"
        print("[ClauseGuard] NLI model loaded successfully")
    except Exception as e:
        print(f"[ClauseGuard] NLI model load failed (using heuristic fallback): {e}")
        _model_status["nli"] = f"failed: {e}"
|
| 369 |
+
|
| 370 |
+
def get_model_status_text():
    """Return human-readable model status (one icon + label per model).

    NOTE(review): status icons reconstructed from mojibake in the source
    rendering — confirm against the deployed file.
    """
    parts = []
    for name, status in _model_status.items():
        # loaded -> check mark; any "failed: ..." -> warning; else cross.
        icon = "✅" if status == "loaded" else "⚠️" if "failed" in status else "❌"
        label = {"cuad": "Clause Classifier", "ner": "Legal NER", "nli": "NLI Contradiction"}[name]
        parts.append(f"{icon} {label}: {status}")
    return " · ".join(parts)
|
| 378 |
+
|
| 379 |
+
# Load models at startup
|
| 380 |
+
_load_cuad_model()
|
| 381 |
+
_load_ner_model()
|
| 382 |
+
_load_nli_model()
|
| 383 |
+
|
| 384 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 385 |
+
# 3. DOCUMENT PARSING
|
| 386 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 387 |
+
|
| 388 |
+
def parse_pdf(file_path):
    """Smart PDF parser: native text extraction with OCR fallback for scanned PDFs."""
    extracted, err, how = parse_pdf_smart(file_path)
    if not extracted:
        # Prefer the engine's own error; otherwise give actionable guidance.
        if err:
            return None, err
        return None, "Could not extract text from PDF. Try uploading a clearer scan or digital PDF."
    if how == "ocr":
        print(f"[ClauseGuard] PDF extracted via OCR ({len(extracted)} chars)")
    return extracted, None
|
| 398 |
+
|
| 399 |
+
def parse_docx(file_path):
    """Extract text from a .docx file, joining non-empty paragraphs with blank lines."""
    if not _HAS_DOCX:
        return None, "DOCX parsing not available (python-docx not installed)"
    try:
        document = DocxDocument(file_path)
        non_empty = (para.text for para in document.paragraphs if para.text.strip())
        return "\n\n".join(non_empty), None
    except Exception as e:
        return None, f"DOCX parse error: {e}"
|
| 408 |
+
|
| 409 |
+
def parse_document(file_path):
    """Dispatch an uploaded file to the parser matching its extension.

    Returns a (text, error) pair; exactly one side is None.
    """
    if file_path is None:
        return None, "No file uploaded"

    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        return parse_pdf(file_path)
    if ext in (".docx", ".doc"):
        return parse_docx(file_path)
    if ext in (".txt", ".md", ".rst"):
        # Plain-text family: read directly, tolerating bad bytes.
        try:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
                return fh.read(), None
        except Exception as e:
            return None, f"Text read error: {e}"

    return None, f"Unsupported file type: {ext}"
|
| 425 |
+
|
| 426 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 427 |
+
# 4. DETERMINISTIC CLAUSE SPLITTING
|
| 428 |
+
# FIX v4.1: Bounded cache (max 500 documents) instead of unbounded dict
|
| 429 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 430 |
+
|
| 431 |
+
# FIX v4.1: bounded cache (max 500 documents) instead of an unbounded dict.
_chunk_cache = BoundedCache(maxsize=500)

def split_clauses(text):
    """Deterministic, structure-aware clause splitting.
    Same input ALWAYS produces same output. Normalized text is hashed
    and cached so repeated runs on identical documents are identical."""
    # Cache key: whitespace-collapsed text hashed with SHA-256, so
    # formatting-only differences still hit the same cache entry.
    normalized = re.sub(r'\s+', ' ', text.strip())
    text_hash = hashlib.sha256(normalized.encode()).hexdigest()
    cached = _chunk_cache.get(text_hash)
    if cached is not None:
        return cached

    # Collapse runs of 3+ newlines to a single paragraph break.
    text = re.sub(r'\n{3,}', '\n\n', text.strip())

    # First try to detect numbered sections (1., 2., 3.1, (a), etc.)
    section_pattern = re.compile(
        r'(?:^|\n\n)'
        r'(?='
        r'\d+(?:\.\d+)*[.)]\s'                # 1. 2. 3.1. 3.1)
        r'|[A-Z]{2,}[A-Z\s]*\n'               # ALL CAPS HEADERS
        r'|\([a-z]\)\s'                       # (a) (b) (c)
        r'|(?:Section|Article|Clause)\s+\d+'  # Section 1, Article 2
        r')',
        re.MULTILINE
    )

    positions = [m.start() for m in section_pattern.finditer(text)]

    # Require at least 3 section markers before trusting the structure.
    if len(positions) >= 3:
        clauses = []
        for i, pos in enumerate(positions):
            end = positions[i + 1] if i + 1 < len(positions) else len(text)
            chunk = text[pos:end].strip()
            if len(chunk) > 30:  # drop trivially short fragments
                if len(chunk) > 1500:
                    # Over-long section: re-pack its paragraphs into
                    # ~1200-char chunks so the classifier sees full clauses.
                    sub_parts = chunk.split('\n\n')
                    current = ""
                    for sp in sub_parts:
                        if len(current) + len(sp) < 1200:
                            current += ("\n\n" + sp if current else sp)
                        else:
                            if len(current.strip()) > 30:
                                clauses.append(current.strip())
                            current = sp
                    # Flush the trailing accumulator.
                    if len(current.strip()) > 30:
                        clauses.append(current.strip())
                else:
                    clauses.append(chunk)
        # Preserve any substantial preamble before the first section marker.
        if positions and positions[0] > 50:
            preamble = text[:positions[0]].strip()
            if len(preamble) > 30:
                clauses.insert(0, preamble)
        result = clauses if clauses else _fallback_split(text)
        _chunk_cache.put(text_hash, result)
        return result
    else:
        # Too few section markers: fall back to paragraph/sentence splitting.
        result = _fallback_split(text)
        _chunk_cache.put(text_hash, result)
        return result
|
| 490 |
+
|
| 491 |
+
def _fallback_split(text):
|
| 492 |
+
"""Fallback: split on paragraph breaks and sentence boundaries."""
|
| 493 |
+
paragraphs = text.split('\n\n')
|
| 494 |
+
if len(paragraphs) >= 3:
|
| 495 |
+
clauses = []
|
| 496 |
+
for p in paragraphs:
|
| 497 |
+
p = p.strip()
|
| 498 |
+
if len(p) > 30:
|
| 499 |
+
if len(p) > 1500:
|
| 500 |
+
sents = re.split(r'(?<=[.!?])\s+(?=[A-Z])', p)
|
| 501 |
+
current = ""
|
| 502 |
+
for s in sents:
|
| 503 |
+
if len(current) + len(s) < 1000:
|
| 504 |
+
current += (" " + s if current else s)
|
| 505 |
+
else:
|
| 506 |
+
if len(current.strip()) > 30:
|
| 507 |
+
clauses.append(current.strip())
|
| 508 |
+
current = s
|
| 509 |
+
if len(current.strip()) > 30:
|
| 510 |
+
clauses.append(current.strip())
|
| 511 |
+
else:
|
| 512 |
+
clauses.append(p)
|
| 513 |
+
return clauses
|
| 514 |
+
|
| 515 |
+
parts = re.split(r'(?<=[.!?])\s+(?=[A-Z0-9(])', text)
|
| 516 |
+
return [p.strip() for p in parts if len(p.strip()) > 30]
|
| 517 |
+
|
| 518 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 519 |
+
# 5. CLAUSE DETECTION
|
| 520 |
+
# FIX v4.1: Use softmax (matching training) instead of sigmoid
|
| 521 |
+
# FIX v4.1: max_length raised to 512 (was 256)
|
| 522 |
+
# FIX v4.1: Bounded prediction cache
|
| 523 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 524 |
+
|
| 525 |
+
_HEADING_RE = re.compile(r'^\d+(?:\.\d+)*\s+[A-Z][A-Z\s&,/]+$', re.MULTILINE)
|
| 526 |
+
|
| 527 |
+
def _strip_heading(text):
|
| 528 |
+
"""Remove leading section headings that confuse the classifier."""
|
| 529 |
+
lines = text.split('\n')
|
| 530 |
+
if lines and _HEADING_RE.match(lines[0].strip()):
|
| 531 |
+
stripped = '\n'.join(lines[1:]).strip()
|
| 532 |
+
return stripped if len(stripped) > 20 else text
|
| 533 |
+
return text
|
| 534 |
+
|
| 535 |
+
_LABEL_GUARDRAILS = {
|
| 536 |
+
"Liquidated Damages": re.compile(
|
| 537 |
+
r'liquidated|pre-?determined.{0,10}damage|agreed.{0,10}sum|penalty clause|stipulated.{0,10}damage',
|
| 538 |
+
re.IGNORECASE
|
| 539 |
+
),
|
| 540 |
+
"Uncapped Liability": re.compile(
|
| 541 |
+
r'uncapped|unlimited.{0,10}liabilit|no.{0,10}(limit|cap).{0,10}liabilit',
|
| 542 |
+
re.IGNORECASE
|
| 543 |
+
),
|
| 544 |
+
}
|
| 545 |
+
|
| 546 |
+
def _apply_guardrails(label, text, confidence):
|
| 547 |
+
guard = _LABEL_GUARDRAILS.get(label)
|
| 548 |
+
if guard and not guard.search(text):
|
| 549 |
+
return "Other", confidence * 0.3
|
| 550 |
+
return label, confidence
|
| 551 |
+
|
| 552 |
+
def _text_hash(text):
|
| 553 |
+
return hashlib.md5(text.encode()).hexdigest()
|
| 554 |
+
|
| 555 |
+
# FIX v4.1: Bounded prediction cache
_prediction_cache = BoundedCache(maxsize=2000)

def classify_cuad(clause_text):
    """Classify a clause with the CUAD model, falling back to regex patterns.

    Returns a list of result dicts with keys: label, confidence, risk,
    description, source.  Results are cached in a bounded LRU keyed on the
    FULL original clause text — hashing only a 512-char prefix of the
    heading-stripped text (as before) let distinct long clauses collide, and
    the guardrails below inspect the raw clause_text, which was not part of
    the old key.
    """
    if cuad_model is None or cuad_tokenizer is None:
        return _classify_regex(clause_text)

    clean_text = _strip_heading(clause_text)

    h = _text_hash(clause_text)
    cached = _prediction_cache.get(h)
    if cached is not None:
        return cached

    try:
        # FIX v4.1: max_length=512 (was 256 — truncating long legal clauses)
        inputs = cuad_tokenizer(
            clean_text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True,
        )
        with torch.no_grad():
            logits = cuad_model(**inputs).logits

        # FIX v4.1: Use softmax (matching single-label cross-entropy training)
        # The model was trained with F.cross_entropy, so softmax is correct.
        probs = torch.softmax(logits, dim=-1)[0]

        # Single topk call replaces the previous torch.max + full torch.sort:
        # we only ever consider the top-2 predictions.
        k = min(2, int(probs.shape[0]))
        top_probs, top_indices = torch.topk(probs, k)

        results = []
        for rank in range(k):
            idx = int(top_indices[rank])
            conf = float(top_probs[rank])
            # Per-class confidence thresholds; 0.40 is the default cutoff.
            if conf <= _CUAD_THRESHOLDS.get(idx, 0.40) or idx >= len(CUAD_LABELS):
                continue
            label, conf = _apply_guardrails(CUAD_LABELS[idx], clause_text, conf)
            # Drop demoted predictions that fell below the floor.
            if label == "Other" and conf < 0.3:
                continue
            # Skip the runner-up when it duplicates the primary label.
            if results and results[0]["label"] == label:
                continue
            results.append({
                "label": label,
                "confidence": round(conf, 3),
                "risk": RISK_MAP.get(label, "LOW"),
                "description": DESC_MAP.get(label, label),
                "source": "ml",
            })

        results.sort(key=lambda x: x["confidence"], reverse=True)

        # If no ML results, also try regex to catch what the model misses.
        if not results:
            results = _classify_regex(clause_text)

        _prediction_cache.put(h, results)
        return results
    except Exception as e:
        print(f"[ClauseGuard] CUAD inference error: {e}")
        return _classify_regex(clause_text)
|
| 642 |
+
|
| 643 |
+
# FIX v4.1: Extended regex patterns to cover more CUAD categories
# Fallback clause-detection patterns used by _classify_regex when the CUAD
# model is unavailable or produces no confident label.  Keys are clause
# labels; values are lists of alternative regexes (first match wins per label).
# NOTE(review): _classify_regex lowercases the text before searching, so the
# uppercase-literal alternatives below (r"ROFR", r"ROFO", r"ROFN", r"MFN")
# cannot match as written — verify intended behavior.
_REGEX_PATTERNS = {
    "Limitation of liability": [r"not liable", r"shall not be (liable|responsible)", r"in no event.*liable", r"limitation of liability", r"without warranty", r"disclaim"],
    "Unilateral termination": [r"terminat.*at any time", r"suspend.*account.*without", r"we may (terminat|suspend|discontinu)", r"right to (terminat|suspend)"],
    "Unilateral change": [r"sole discretion", r"reserves? the right to (modify|change|update|amend)", r"at any time.*without (prior )?notice", r"we may (modify|change|update)"],
    "Content removal": [r"remove.*content.*without", r"right to remove", r"we may.*remove"],
    "Contract by using": [r"by (using|accessing).*you agree", r"continued use.*constitutes? acceptance"],
    "Choice of law": [r"governed by.*laws? of", r"shall be governed", r"laws of the state of"],
    "Jurisdiction": [r"exclusive jurisdiction", r"courts? of.*(california|delaware|new york|ireland|england)", r"submit to.*jurisdiction"],
    "Arbitration": [r"arbitrat", r"binding arbitration", r"waive.*right.*court", r"class action waiver"],
    "Governing Law": [r"governed by", r"laws of", r"jurisdiction of"],
    "Termination for Convenience": [r"terminat.*for convenience", r"terminat.*without cause", r"terminat.*at any time"],
    "Non-Compete": [r"non-compete", r"shall not compete", r"competition restriction"],
    "Exclusivity": [r"exclusive(?:ly)?(?:\s+(?:deal|relationship|partner|right))", r"exclusivity"],
    "IP Ownership Assignment": [r"assign.*intellectual property", r"ownership of.*ip", r"all rights.*assign", r"work.?for.?hire"],
    "Uncapped Liability": [r"unlimited liability", r"uncapped", r"no.*limit.*liability"],
    "Cap on Liability": [r"cap on liability", r"maximum liability", r"liability.*shall not exceed", r"aggregate liability.*not exceed"],
    "Indemnification": [r"indemnif", r"hold harmless", r"defend.*against.*claim"],
    "Confidentiality": [r"confidential(?:ity)?", r"non-disclosure", r"\bnda\b"],
    "Force Majeure": [r"force majeure", r"act of god", r"beyond.*(?:reasonable\s+)?control"],
    "Penalties": [r"penalt(?:y|ies)", r"late fee", r"default charge", r"interest on overdue"],
    # FIX v4.1: Added missing regex patterns for more CUAD categories
    "Audit Rights": [r"audit rights?", r"right to audit", r"inspect.*records?", r"examination of.*records?", r"access to.*books"],
    "Warranty Duration": [r"warrant(?:y|ies).*(?:period|duration|term|months?|years?)", r"warranty.*shall.*(?:remain|last|continue)", r"limited warranty"],
    "Insurance": [r"(?:shall|must).*maintain.*insurance", r"insurance.*coverage", r"policy of insurance", r"certificate of insurance"],
    "Source Code Escrow": [r"source code escrow", r"escrow.*source code", r"escrow agent"],
    "Post-Termination Services": [r"post.?termination.*(?:service|obligation|support)", r"(?:after|following|upon).*termination.*(?:shall|must|will).*(?:provide|continue)"],
    "Renewal Term": [r"renew(?:al)?.*term", r"auto(?:matic(?:ally)?)?.*renew", r"successive.*(?:term|period)"],
    "Notice Period to Terminate Renewal": [r"notice.*(?:to\s+)?terminat.*renew", r"(?:days?|months?).*(?:prior|advance).*(?:notice|written).*(?:terminat|renew)", r"notice of non.?renewal"],
    "Change of Control": [r"change of control", r"change in.*(?:ownership|control)", r"merger.*acquisition", r"sale of.*(?:all|substantially).*assets"],
    "Anti-Assignment": [r"(?:shall|may)\s+not\s+assign", r"anti.?assignment", r"no.*assignment.*without.*consent"],
    "Revenue/Profit Sharing": [r"revenue.*shar", r"profit.*shar", r"royalt(?:y|ies)"],
    "Liquidated Damages": [r"liquidated.*damages?", r"pre.?determined.*damage", r"stipulated.*damage"],
    "Covenant Not to Sue": [r"covenant not to sue", r"(?:shall|agree).*not.*(?:bring|file|commence).*(?:action|claim|suit)"],
    "Joint IP Ownership": [r"joint(?:ly)?.*own(?:ed|ership)?.*(?:ip|intellectual property)", r"co.?own(?:ed|ership)?"],
    "License Grant": [r"(?:grant|license).*(?:non.?exclusive|exclusive|perpetual|irrevocable).*(?:license|right)", r"hereby grants?.*license"],
    "Non-Transferable License": [r"non.?transferable.*license", r"license.*(?:shall|may)\s+not.*(?:transfer|assign|sublicense)"],
    "ROFR/ROFO/ROFN": [r"right of first.*(?:refusal|offer|negotiation)", r"ROFR", r"ROFO", r"ROFN"],
    "No-Solicit of Customers": [r"(?:shall|must|agree).*not.*solicit.*customer", r"no.?solicit.*customer", r"non.?solicitation.*customer"],
    "No-Solicit of Employees": [r"(?:shall|must|agree).*not.*solicit.*employee", r"no.?solicit.*employee", r"non.?solicitation.*employee", r"no.?hire"],
    "Non-Disparagement": [r"non.?disparagement", r"(?:shall|must|agree).*not.*(?:disparag|defam|make.*negative)", r"not.*make.*derogatory"],
    "Most Favored Nation": [r"most favou?red.*nation", r"MFN", r"most favou?red.*(?:customer|pricing|terms)"],
    "Third Party Beneficiary": [r"third.?party.*beneficiar", r"no.*third.?party.*beneficiar"],
    "Minimum Commitment": [r"minimum.*(?:commitment|purchase|order|volume|spend)", r"(?:shall|must).*(?:purchase|order).*(?:at least|minimum|no less than)"],
    "Volume Restriction": [r"volume.*(?:restriction|limitation|cap|ceiling)", r"(?:shall|may).*not.*exceed.*(?:volume|quantity)"],
    "Price Restriction": [r"price.*(?:restriction|limitation|ceiling|cap|floor)", r"(?:shall|may).*not.*(?:increase|raise|exceed).*price"],
}
|
| 690 |
+
|
| 691 |
+
def _classify_regex(text):
    """Regex fallback — returns pattern match, NOT fake confidence.

    FIX: match case-insensitively against the ORIGINAL text instead of
    pre-lowercasing it.  Lowercasing made the uppercase-literal patterns in
    _REGEX_PATTERNS (r"ROFR", r"ROFO", r"ROFN", r"MFN") permanently
    unmatchable; re.IGNORECASE preserves the old behavior for every
    lowercase pattern while reviving those.  The previous `seen` set was
    dead code — dict keys are unique, so each label is visited exactly once.
    """
    results = []
    for label, patterns in _REGEX_PATTERNS.items():
        # First matching alternative flags the label (same as the old break).
        if any(re.search(pat, text, re.IGNORECASE) for pat in patterns):
            results.append({
                "label": label,
                "confidence": None,   # pattern matches carry no model score
                "risk": RISK_MAP.get(label, "MEDIUM"),
                "description": DESC_MAP.get(label, label),
                "source": "pattern",
            })
    return results
|
| 711 |
+
|
| 712 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 713 |
+
# 6. LEGAL NER β ML model with regex fallback
|
| 714 |
+
# FIX v4.1: Batch all chunks in single pipeline call
|
| 715 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 716 |
+
|
| 717 |
+
def extract_entities(text):
    """Extract entities using ML model (matterstack/legal-bert-ner) with regex fallback.

    Returns a list of non-overlapping entity dicts sorted by start offset:
    {text, type, start, end, source} (ML entities also carry "score").
    """
    entities = []

    if _HAS_NER_MODEL and ner_pipeline is not None:
        try:
            # FIX v4.1: Create overlapping chunks but batch them in a SINGLE pipeline call
            # Only the first 10k characters are scanned; 512-char windows with a
            # 450-char stride give a 62-char overlap so entities on chunk
            # boundaries are not lost.
            max_text = min(len(text), 10000)
            chunks = [text[i:i+512] for i in range(0, max_text, 450)]
            offsets = list(range(0, max_text, 450))

            # Single batched pipeline call instead of sequential.
            # NOTE(review): assumes the pipeline returns one result list per
            # input chunk, in order — confirm for the installed version.
            all_ner_results = ner_pipeline(chunks, batch_size=8)

            for chunk_idx, ner_results in enumerate(all_ner_results):
                offset = offsets[chunk_idx]
                for ent in ner_results:
                    # Keep only confident predictions (score > 0.5) and
                    # translate chunk-local offsets back to document offsets.
                    if ent.get("score", 0) > 0.5:
                        entities.append({
                            "text": ent["word"],
                            "type": _map_ner_label(ent.get("entity_group", ent.get("entity", "MISC"))),
                            "start": ent["start"] + offset,
                            "end": ent["end"] + offset,
                            "score": round(ent["score"], 3),
                            "source": "ml",
                        })
        except Exception as e:
            print(f"[ClauseGuard] ML NER error, falling back to regex: {e}")
            entities = _extract_entities_regex(text)
    else:
        entities = _extract_entities_regex(text)

    # Always supplement with regex patterns for things NER often misses.
    # A regex entity is added only if it does not overlap any character
    # position already covered by an existing entity.
    regex_ents = _extract_entities_regex(text)
    ml_spans = set()
    for e in entities:
        for pos in range(e["start"], e["end"]):
            ml_spans.add(pos)
    for re_ent in regex_ents:
        if not any(pos in ml_spans for pos in range(re_ent["start"], re_ent["end"])):
            entities.append(re_ent)

    # Deduplicate and sort: greedy sweep keeps the earliest-starting entity,
    # preferring the longest span on ties, and drops anything overlapping it.
    entities.sort(key=lambda x: (x["start"], -(x["end"] - x["start"])))
    filtered = []
    last_end = -1
    for e in entities:
        if e["start"] >= last_end:
            filtered.append(e)
            last_end = e["end"]
    return filtered
|
| 768 |
+
|
| 769 |
+
def _map_ner_label(label):
|
| 770 |
+
label = label.upper()
|
| 771 |
+
mapping = {
|
| 772 |
+
"PER": "PERSON", "PERSON": "PERSON",
|
| 773 |
+
"ORG": "PARTY", "ORGANIZATION": "PARTY",
|
| 774 |
+
"LOC": "JURISDICTION", "LOCATION": "JURISDICTION",
|
| 775 |
+
"GPE": "JURISDICTION", "DATE": "DATE",
|
| 776 |
+
"MONEY": "MONEY", "MISC": "MISC", "LAW": "LEGAL_REF",
|
| 777 |
+
}
|
| 778 |
+
return mapping.get(label, label)
|
| 779 |
+
|
| 780 |
+
def _extract_entities_regex(text):
|
| 781 |
+
"""Regex-based NER fallback."""
|
| 782 |
+
entities = []
|
| 783 |
+
patterns = [
|
| 784 |
+
(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b', "DATE"),
|
| 785 |
+
(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', "DATE"),
|
| 786 |
+
(r'\b\d{1,2}-(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-\d{2,4}\b', "DATE"),
|
| 787 |
+
(r'\b(?:Effective|Commencement|Expiration|Termination)\s+Date\b', "DATE_REF"),
|
| 788 |
+
(r'\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?(?:\s*(?:million|billion|thousand|M|B|K))?', "MONEY"),
|
| 789 |
+
(r'\b\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s*(?:USD|EUR|GBP|dollars|euros|pounds)', "MONEY"),
|
| 790 |
+
(r'\b(?:USD|EUR|GBP)\s*\d{1,3}(?:,\d{3})*(?:\.\d{2})?', "MONEY"),
|
| 791 |
+
(r'\b\d+(?:\.\d+)?%', "PERCENTAGE"),
|
| 792 |
+
(r'\b\d+\s*(?:year|month|week|day|business day)s?\b', "DURATION"),
|
| 793 |
+
(r'\b[A-Z][A-Za-z0-9\s&,]+?(?:Inc\.?|LLC|Ltd\.?|Limited|Corp\.?|Corporation|PLC|GmbH|AG|S\.A\.?|B\.V\.?|L\.P\.?|LLP)\b', "PARTY"),
|
| 794 |
+
(r'\b(?:Party A|Party B|Disclosing Party|Receiving Party|Licensor|Licensee|Buyer|Seller|Tenant|Landlord|Employer|Employee|Customer|Vendor|Client)\b', "PARTY_ROLE"),
|
| 795 |
+
(r'\b(?:State|Commonwealth)\s+of\s+[A-Z][a-zA-Z\s]+', "JURISDICTION"),
|
| 796 |
+
(r'\b(?:California|Delaware|New York|Texas|Florida|England|Ireland|Germany|France|Singapore|Hong Kong|Ontario|British Columbia)\b', "JURISDICTION"),
|
| 797 |
+
(r'"([A-Z][A-Za-z\s]{1,40})"', "DEFINED_TERM"),
|
| 798 |
+
(r'\((?:the\s+)?"([A-Z][A-Za-z\s]{1,40})"\)', "DEFINED_TERM"),
|
| 799 |
+
]
|
| 800 |
+
for pat, etype in patterns:
|
| 801 |
+
for m in re.finditer(pat, text, re.IGNORECASE if etype in ("DATE", "MONEY", "DURATION", "PERCENTAGE") else 0):
|
| 802 |
+
txt = m.group(1) if m.lastindex else m.group()
|
| 803 |
+
entities.append({
|
| 804 |
+
"text": txt,
|
| 805 |
+
"type": etype,
|
| 806 |
+
"start": m.start(),
|
| 807 |
+
"end": m.end(),
|
| 808 |
+
"source": "pattern",
|
| 809 |
+
})
|
| 810 |
+
return entities
|
| 811 |
+
|
| 812 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 813 |
+
# 7. NLI / CONTRADICTION DETECTION
|
| 814 |
+
# FIX v4.1: Pass (text_a, text_b) as dict with proper keys for
|
| 815 |
+
# cross-encoder pipeline, not [SEP]-concatenated string
|
| 816 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββοΏ½οΏ½ββββββββββββββββββββ
|
| 817 |
+
|
| 818 |
+
def _run_nli(text_a, text_b):
    """Run NLI pipeline with correct input format for cross-encoder.
    FIX v4.1: cross-encoder expects {'text': a, 'text_pair': b} or a dict,
    but the HF pipeline for text-classification with cross-encoder accepts
    a dict input: {"text": text_a, "text_pair": text_b}.
    The simplest correct way is to pass them as a list of dicts.

    Both inputs are truncated to 256 characters before being sent to the
    pipeline.  Returns whatever the pipeline yields (a label/score dict or a
    list of them, depending on pipeline version), or None if every input
    format fails.
    """
    try:
        # The cross-encoder/nli-deberta-v3-base pipeline expects two texts.
        # Passing as a dict with text and text_pair is the correct format.
        result = nli_pipeline(
            {"text": text_a[:256], "text_pair": text_b[:256]},
            truncation=True,
        )
        return result
    except Exception:
        # Some pipeline versions accept positional (text, text_pair) as tuple
        try:
            return nli_pipeline(
                text_a[:256],
                text_pair=text_b[:256],
                truncation=True,
            )
        except Exception:
            # Neither calling convention worked — caller treats None as
            # "no NLI verdict" and skips the pair.
            return None
|
| 842 |
+
|
| 843 |
+
|
| 844 |
+
def detect_contradictions(clause_results, raw_text=""):
    """
    Detect contradictions using:
    1. NLI cross-encoder model (semantic contradiction detection)
    2. Structural conflict detection (mutually exclusive labels)
    3. Missing critical clause detection

    Returns a deduplicated list of finding dicts with keys:
    type ("CONTRADICTION" | "MISSING"), explanation, severity, clauses,
    source, and (NLI findings only) confidence.
    """
    contradictions = []
    labels_found = set()
    clause_texts_by_label = defaultdict(list)

    # Index clause texts by predicted label for pairwise comparison below.
    for cr in clause_results:
        labels_found.add(cr["label"])
        clause_texts_by_label[cr["label"]].append(cr.get("text", ""))

    # ── 1. Semantic NLI (if model available) ──
    if _HAS_NLI_MODEL and nli_pipeline is not None:
        # Known mutually-exclusive label pairs; only checked when both labels
        # actually appear in the document.
        conflict_pairs = [
            ("Uncapped Liability", "Cap on Liability",
             "Liability cannot be both uncapped and capped simultaneously."),
            ("IP Ownership Assignment", "Joint IP Ownership",
             "IP cannot be both fully assigned and jointly owned."),
            ("Exclusivity", "Non-Transferable License",
             "Exclusivity and non-transferable license may conflict."),
        ]
        for label_a, label_b, explanation in conflict_pairs:
            if label_a in labels_found and label_b in labels_found:
                texts_a = clause_texts_by_label[label_a]
                texts_b = clause_texts_by_label[label_b]
                # Cap the cross-product at 2x2 clause texts to bound runtime.
                for ta in texts_a[:2]:
                    for tb in texts_b[:2]:
                        # FIX v4.1: Use proper NLI input format
                        nli_result = _run_nli(ta, tb)
                        if nli_result is None:
                            continue
                        for r in (nli_result if isinstance(nli_result, list) else [nli_result]):
                            # Only high-confidence contradiction verdicts count.
                            if r.get("label", "").lower() == "contradiction" and r.get("score", 0) > 0.6:
                                contradictions.append({
                                    "type": "CONTRADICTION",
                                    "explanation": explanation,
                                    "severity": "HIGH",
                                    "clauses": [label_a, label_b],
                                    "confidence": round(r["score"], 3),
                                    "source": "nli_model",
                                })

        # Also check for internal contradictions within governing law / termination
        # (a document with two conflicting Governing Law clauses, etc.).
        for label in ["Governing Law", "Termination for Convenience"]:
            texts = clause_texts_by_label.get(label, [])
            if len(texts) >= 2:
                # Compare each clause against at most the next two of its kind.
                for i in range(len(texts)):
                    for j in range(i + 1, min(len(texts), i + 3)):
                        nli_result = _run_nli(texts[i], texts[j])
                        if nli_result is None:
                            continue
                        for r in (nli_result if isinstance(nli_result, list) else [nli_result]):
                            if r.get("label", "").lower() == "contradiction" and r.get("score", 0) > 0.6:
                                contradictions.append({
                                    "type": "CONTRADICTION",
                                    "explanation": f"Conflicting {label} provisions detected — clauses contradict each other.",
                                    "severity": "HIGH",
                                    "clauses": [label],
                                    "confidence": round(r["score"], 3),
                                    "source": "nli_model",
                                })
    else:
        # ── Heuristic fallback (improved) ──
        # Without the NLI model, flag label co-occurrence alone.
        _heuristic_pairs = [
            (["Uncapped Liability"], ["Cap on Liability"],
             "Liability cannot be both uncapped and capped simultaneously."),
            (["IP Ownership Assignment"], ["Joint IP Ownership"],
             "IP cannot be both fully assigned and jointly owned."),
        ]
        for group_a, group_b, explanation in _heuristic_pairs:
            found_a = any(l in labels_found for l in group_a)
            found_b = any(l in labels_found for l in group_b)
            if found_a and found_b:
                contradictions.append({
                    "type": "CONTRADICTION",
                    "explanation": explanation,
                    "severity": "HIGH",
                    "clauses": group_a + group_b,
                    "source": "heuristic",
                })

    # ── 2. Missing critical clauses ──
    # NOTE(review): this dict is rebuilt (and the regexes recompiled) on every
    # call; could be hoisted to module level.
    _REQUIRED_CLAUSE_PATTERNS = {
        "Governing Law": re.compile(
            r'govern(?:ed|ing).{0,15}law|applicable.{0,10}law|laws?\s+of\s+the\s+state',
            re.IGNORECASE
        ),
        "Limitation of liability": re.compile(
            r'limitation.{0,10}liabilit|cap.{0,10}liabilit|liabilit.{0,10}shall\s+not\s+exceed|in\s+no\s+event.{0,20}liable',
            re.IGNORECASE
        ),
        "Arbitration": re.compile(
            r'arbitrat|AAA|JAMS|binding.{0,10}dispute',
            re.IGNORECASE
        ),
        "Termination": re.compile(
            r'terminat(?:e|ion|ed)|cancel(?:lation)?',
            re.IGNORECASE
        ),
    }
    for clause_name, pattern in _REQUIRED_CLAUSE_PATTERNS.items():
        if not pattern.search(raw_text):
            contradictions.append({
                "type": "MISSING",
                "explanation": f"No '{clause_name}' clause detected in the document.",
                "severity": "MEDIUM",
                "clauses": [clause_name],
                "source": "structural",
            })

    # Deduplicate on (type, explanation) — repeated NLI hits for the same
    # pair collapse to one finding, keeping first-seen order.
    seen = set()
    unique = []
    for c in contradictions:
        key = (c["type"], c["explanation"])
        if key not in seen:
            seen.add(key)
            unique.append(c)

    return unique
|
| 968 |
+
|
| 969 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 970 |
+
# 8. RISK SCORING
|
| 971 |
+
# FIX v4.1: Absolute risk based on findings, not normalized by doc length
|
| 972 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 973 |
+
|
| 974 |
+
def compute_risk_score(clause_results, total_clauses):
    """Compute an absolute 0-100 risk score, letter grade and severity counts.

    Returns (score, grade, sev_counts) where sev_counts maps
    CRITICAL/HIGH/MEDIUM/LOW to finding counts.
    """
    sev_counts = {"CRITICAL": 0, "HIGH": 0, "MEDIUM": 0, "LOW": 0}
    for cr in clause_results:
        sev = cr.get("risk", "LOW")
        # Robustness fix: an unrecognized severity string previously raised
        # KeyError here; treat anything outside the four known levels as LOW.
        if sev not in sev_counts:
            sev = "LOW"
        sev_counts[sev] += 1
    if total_clauses == 0:
        return 0, "A", sev_counts

    # FIX v4.1: Absolute risk — critical findings should always score high
    # regardless of document size. A 200-clause doc with 5 critical findings
    # is just as dangerous as a 10-clause doc with 5 critical findings.
    weighted = sum(sev_counts[s] * RISK_WEIGHTS[s] for s in sev_counts)

    # Diminishing returns formula: starts linear, flattens near 100
    # max theoretical = 100, one CRITICAL finding = ~30, two = ~48, five = ~72
    risk = min(100, round(100 * (1 - (1 / (1 + weighted / 30)))))

    if risk >= 70:
        grade = "F"
    elif risk >= 50:
        grade = "D"
    elif risk >= 30:
        grade = "C"
    elif risk >= 15:
        grade = "B"
    else:
        grade = "A"
    return risk, grade, sev_counts
|
| 997 |
+
|
| 998 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 999 |
+
# 9. MAIN ANALYSIS PIPELINE
|
| 1000 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1001 |
+
|
| 1002 |
+
def analyze_contract(text):
    """Run the full analysis pipeline over a contract's raw text.

    Returns (result, None) on success or (None, error_message) on failure.
    The result dict bundles metadata, risk score, flagged clauses, entities,
    contradictions, obligations and compliance findings.
    """
    # Reject degenerate inputs before spending any model time.
    if not text or len(text.strip()) < 50:
        return None, "Document too short (minimum 50 characters)"
    clauses = split_clauses(text)
    if not clauses:
        return None, "No clauses detected in document"
    clause_results = []
    for clause in clauses:
        predictions = classify_cuad(clause)
        if predictions:
            # One entry per (clause, predicted label) pair — a clause with two
            # labels contributes two rows sharing the same text.
            for pred in predictions:
                clause_results.append({
                    "text": clause,
                    "label": pred["label"],
                    "confidence": pred["confidence"],
                    "risk": pred["risk"],
                    "description": pred["description"],
                    "source": pred.get("source", "unknown"),
                })
    entities = extract_entities(text)
    contradictions = detect_contradictions(clause_results, text)
    risk, grade, sev_counts = compute_risk_score(clause_results, len(clauses))
    obligations = extract_obligations(text)
    compliance = check_compliance(text)

    # flagged_clauses counts (clause, label) pairs; unique_flagged counts
    # distinct clause texts.
    flagged_clause_count = len(clause_results)
    unique_flagged_texts = len(set(cr["text"] for cr in clause_results))

    result = {
        "metadata": {
            "analysis_date": datetime.now().isoformat(),
            "total_clauses": len(clauses),
            "flagged_clauses": flagged_clause_count,
            "unique_flagged": unique_flagged_texts,
            "model": get_model_status_text(),
            # Whitespace-normalized SHA-256 prefix: identical documents with
            # different spacing hash the same.
            "text_hash": hashlib.sha256(re.sub(r'\s+', ' ', text.strip()).encode()).hexdigest()[:16],
        },
        "risk": {
            "score": risk,
            "grade": grade,
            "breakdown": sev_counts,
        },
        "clauses": clause_results,
        "entities": entities,
        "contradictions": contradictions,
        "obligations": obligations,
        "compliance": compliance,
        "raw_text": text,
    }
    return result, None
|
| 1052 |
+
|
| 1053 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1054 |
+
# 10. EXPORT FUNCTIONS
|
| 1055 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1056 |
+
|
| 1057 |
+
def export_json(result):
    """Serialize an analysis result as pretty-printed JSON; None passes through."""
    return None if result is None else json.dumps(result, indent=2, default=str)
|
| 1061 |
+
|
| 1062 |
+
def export_csv(result):
    """Render flagged clauses as CSV text; returns None for a None result."""
    if result is None:
        return None
    buffer = io.StringIO()
    rows = [["Clause Text", "Label", "Risk", "Confidence", "Description", "Source"]]
    for cr in result.get("clauses", []):
        conf = cr.get("confidence")
        rows.append([
            cr.get("text", "")[:500],   # cap clause text at 500 chars per cell
            cr.get("label", ""),
            cr.get("risk", ""),
            "pattern match" if conf is None else f"{conf:.3f}",
            cr.get("description", ""),
            cr.get("source", ""),
        ])
    csv.writer(buffer).writerows(rows)
    return buffer.getvalue()
|
| 1080 |
+
|
| 1081 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1082 |
+
# 11. UI RENDERING
|
| 1083 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1084 |
+
|
| 1085 |
+
def render_summary(result):
    """Render the risk-score summary panel as an inline-styled HTML fragment.

    Expects the result dict produced by analyze_contract; returns "" for None.
    """
    if result is None:
        return ""
    risk = result["risk"]
    score = risk["score"]
    grade = risk["grade"]
    breakdown = risk["breakdown"]
    # Grade → accent color (green A through red F); gray fallback.
    grade_color = {
        "A": "#16a34a", "B": "#65a30d", "C": "#ca8a04",
        "D": "#ea580c", "F": "#dc2626",
    }.get(grade, "#6b7280")
    crit, high, med, low = breakdown["CRITICAL"], breakdown["HIGH"], breakdown["MEDIUM"], breakdown["LOW"]
    # Score header, 2x2 severity grid, then a metadata footer line.
    html = f"""
    <div style="font-family:system-ui,sans-serif;padding:16px;border:1px solid #e5e7eb;border-radius:12px;background:#fff;">
      <div style="text-align:center;margin-bottom:16px;">
        <div style="font-size:48px;font-weight:700;color:{grade_color};">{score}</div>
        <div style="font-size:14px;color:#6b7280;">/100 Risk Score</div>
        <div style="display:inline-block;margin-top:8px;padding:4px 16px;border-radius:20px;background:{grade_color};color:white;font-weight:600;font-size:14px;">
          Grade {grade}
        </div>
      </div>
      <div style="display:grid;grid-template-columns:1fr 1fr;gap:8px;margin-bottom:12px;">
        <div style="padding:8px;border-radius:6px;background:#fef2f2;text-align:center;">
          <div style="font-size:20px;font-weight:700;color:#dc2626;">{crit}</div>
          <div style="font-size:11px;color:#991b1b;">Critical</div>
        </div>
        <div style="padding:8px;border-radius:6px;background:#fff7ed;text-align:center;">
          <div style="font-size:20px;font-weight:700;color:#ea580c;">{high}</div>
          <div style="font-size:11px;color:#9a3412;">High</div>
        </div>
        <div style="padding:8px;border-radius:6px;background:#fefce8;text-align:center;">
          <div style="font-size:20px;font-weight:700;color:#ca8a04;">{med}</div>
          <div style="font-size:11px;color:#854d0e;">Medium</div>
        </div>
        <div style="padding:8px;border-radius:6px;background:#f0fdf4;text-align:center;">
          <div style="font-size:20px;font-weight:700;color:#16a34a;">{low}</div>
          <div style="font-size:11px;color:#166534;">Low</div>
        </div>
      </div>
      <div style="font-size:12px;color:#6b7280;text-align:center;">
        {result['metadata']['total_clauses']} clauses analyzed · {result['metadata']['flagged_clauses']} flagged
        <br><span style="font-size:10px;">{result['metadata']['model']}</span>
      </div>
    </div>
    """
    return html
|
| 1131 |
+
|
| 1132 |
+
def render_clause_cards(result):
    """Render every flagged clause as a risk-coloured HTML card.

    Clauses with identical text are merged into one card that carries a
    tag per detector hit (coloured by that hit's risk) plus each hit's
    description.

    Args:
        result: Analysis dict with a "clauses" list; each item provides
            "text", "risk" (CRITICAL/HIGH/MEDIUM/LOW), "label",
            "description", and optionally "confidence" (float 0-1) and
            "source" ("ml" or pattern-based). May be None before a
            document has been analyzed.

    Returns:
        An HTML string: "" for None input, a placeholder <div> when no
        clauses were detected.
    """
    if result is None:
        return ""
    clauses = result.get("clauses", [])
    if not clauses:
        return '<div style="padding:24px;text-align:center;color:#6b7280;">No clauses detected.</div>'
    # Several detectors may flag the same clause text; group so a single
    # card shows all labels for that clause.
    grouped = defaultdict(list)
    for cr in clauses:
        grouped[cr["text"]].append(cr)
    severity_rank = {"CRITICAL": 4, "HIGH": 3, "MEDIUM": 2, "LOW": 1}
    html = '<div style="font-family:system-ui,sans-serif;">'
    for text, items in grouped.items():
        # Card border and icon follow the most severe risk in the group.
        max_risk = max(items, key=lambda x: severity_rank[x["risk"]])["risk"]
        border, _, icon = RISK_STYLES[max_risk]
        tags = ""
        for item in items:
            tag_bg = RISK_STYLES[item["risk"]][1]
            tag_color = RISK_STYLES[item["risk"]][0]
            conf = item.get("confidence")
            source = item.get("source", "")
            # ML hits show a confidence percentage; pattern hits say "pattern".
            conf_text = f"{conf:.0%}" if conf is not None else "pattern"
            source_icon = "π€" if source == "ml" else "π"
            tags += f'<span style="background:{tag_bg};color:{tag_color};border:1px solid {tag_color}33;padding:2px 8px;border-radius:12px;font-size:11px;font-weight:500;margin-right:4px;">{source_icon} {item["label"]} ({conf_text})</span>'
        descs = "".join(
            f'<p style="font-size:12px;color:#6b7280;margin:4px 0 0 0;">{item["description"]}</p>'
            for item in items
        )
        preview = text[:300] + ("..." if len(text) > 300 else "")
        # BUG FIX: the previous replace("<", "<").replace(">", ">") calls were
        # no-ops, so clause text reached the page unescaped. Escape HTML for
        # real (ampersand first so the later escapes are not double-escaped).
        preview = preview.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
        html += f"""
    <div style="border:1px solid #e5e7eb;border-left:4px solid {border};border-radius:8px;padding:14px;margin-bottom:10px;background:#fafafa;">
        <div style="display:flex;align-items:center;gap:6px;margin-bottom:6px;">
            <span style="font-size:16px;">{icon}</span>
            <span style="font-size:12px;font-weight:600;color:{border};text-transform:uppercase;">{max_risk}</span>
        </div>
        <p style="font-size:13px;color:#374151;line-height:1.6;margin:0 0 8px 0;">{preview}</p>
        <div style="margin-bottom:6px;">{tags}</div>
        {descs}
    </div>
    """
    html += "</div>"
    return html
|
| 1176 |
+
|
| 1177 |
+
def render_entities(result):
    """Render extracted entities as coloured chips grouped by entity type.

    Args:
        result: Analysis dict with an "entities" list of items carrying
            "type" and "text" keys, or None before any analysis has run.

    Returns:
        An HTML string: "" for None input, a placeholder <div> when no
        entities were detected.
    """
    if result is None:
        return ""
    entities = result.get("entities", [])
    if not entities:
        return '<div style="padding:16px;color:#6b7280;">No entities detected.</div>'
    # Fixed colour per entity type; unknown types fall back to neutral grey.
    palette = {
        "DATE": "#3b82f6", "DATE_REF": "#60a5fa",
        "MONEY": "#22c55e", "PERCENTAGE": "#10b981",
        "DURATION": "#6366f1",
        "PARTY": "#8b5cf6", "PARTY_ROLE": "#a78bfa",
        "PERSON": "#ec4899",
        "JURISDICTION": "#f59e0b",
        "DEFINED_TERM": "#ec4899",
        "LEGAL_REF": "#6b7280",
        "MISC": "#9ca3af",
    }
    by_type = defaultdict(list)
    for ent in entities:
        by_type[ent["type"]].append(ent["text"])
    sections = ['<div style="font-family:system-ui,sans-serif;">']
    for etype, texts in by_type.items():
        # Dedupe while preserving first-seen order; cap at 20 chips per type.
        unique = list(dict.fromkeys(texts))[:20]
        color = palette.get(etype, "#6b7280")
        chips = "".join(
            f'<span style="display:inline-block;background:{color}15;color:{color};border:1px solid {color}40;padding:3px 10px;border-radius:6px;font-size:12px;margin:3px;">{t}</span>'
            for t in unique
        )
        sections.append(f"""
    <div style="margin-bottom:12px;">
        <div style="font-size:12px;font-weight:600;color:#374151;margin-bottom:6px;text-transform:uppercase;">{etype}</div>
        <div>{chips}</div>
    </div>
    """)
    sections.append("</div>")
    return "".join(sections)
|
| 1212 |
+
|
| 1213 |
+
def render_contradictions(result):
    """Render detected contradictions and missing clauses as HTML cards.

    Args:
        result: Analysis dict with a "contradictions" list, where each
            item carries "type", "severity", "explanation", and
            optionally "source" ("nli_model" or "heuristic") plus
            "confidence". May be None before analysis.

    Returns:
        An HTML string: "" for None input, a green all-clear <div> when
        the list is empty.
    """
    if result is None:
        return ""
    issues = result.get("contradictions", [])
    if not issues:
        return '<div style="padding:16px;color:#16a34a;">β No contradictions or missing clauses detected.</div>'
    cards = ['<div style="font-family:system-ui,sans-serif;">']
    for issue in issues:
        sev_color = RISK_STYLES[issue["severity"]][0]
        icon = "β οΈ" if issue["type"] == "CONTRADICTION" else "π"
        # Badge credits the detector: NLI model hits show confidence,
        # heuristic hits get a plain label, unknown sources get nothing.
        src = issue.get("source", "")
        badge = ""
        if src == "nli_model":
            conf = issue.get("confidence", 0)
            badge = f'<span style="font-size:10px;background:#eff6ff;color:#3b82f6;padding:1px 6px;border-radius:4px;margin-left:8px;">π€ NLI {conf:.0%}</span>'
        elif src == "heuristic":
            badge = '<span style="font-size:10px;background:#fef3c7;color:#92400e;padding:1px 6px;border-radius:4px;margin-left:8px;">π Heuristic</span>'
        cards.append(f"""
    <div style="border:1px solid #e5e7eb;border-left:4px solid {sev_color};border-radius:8px;padding:12px;margin-bottom:8px;background:#fafafa;">
        <div style="display:flex;align-items:center;gap:6px;margin-bottom:4px;">
            <span>{icon}</span>
            <span style="font-size:12px;font-weight:600;color:{sev_color};">{issue["type"]}</span>
            {badge}
        </div>
        <p style="font-size:13px;color:#374151;margin:0;">{issue["explanation"]}</p>
    </div>
    """)
    cards.append("</div>")
    return "".join(cards)
|
| 1242 |
+
|
| 1243 |
+
def render_document_viewer(result):
|
| 1244 |
+
if result is None:
|
| 1245 |
+
return ""
|
| 1246 |
+
text = result.get("raw_text", "")
|
| 1247 |
+
entities = sorted(result.get("entities", []), key=lambda x: x["start"])
|
| 1248 |
+
html_parts = []
|
| 1249 |
+
last_end = 0
|
| 1250 |
+
entity_colors = {
|
| 1251 |
+
"DATE": "#3b82f6", "DATE_REF": "#60a5fa", "MONEY": "#22c55e",
|
| 1252 |
+
"PERCENTAGE": "#10b981", "DURATION": "#6366f1", "PARTY": "#8b5cf6",
|
| 1253 |
+
"PARTY_ROLE": "#a78bfa", "PERSON": "#ec4899", "JURISDICTION": "#f59e0b",
|
| 1254 |
+
"DEFINED_TERM": "#ec4899", "LEGAL_REF": "#6b7280", "MISC": "#9ca3af",
|
| 1255 |
+
}
|
| 1256 |
+
for e in entities:
|
| 1257 |
+
if e["start"] >= last_end:
|
| 1258 |
+
plain = text[last_end:e["start"]].replace("<", "<").replace(">", ">")
|
| 1259 |
+
html_parts.append(plain)
|
| 1260 |
+
color = entity_colors.get(e["type"], "#6b7280")
|
| 1261 |
+
entity_text = text[e["start"]:e["end"]].replace("<", "<").replace(">", ">")
|
| 1262 |
+
html_parts.append(
|
| 1263 |
+
f'<span style="background:{color}20;color:{color};border-bottom:2px solid {color};padding:0 2px;border-radius:2px;" '
|
| 1264 |
+
f'title="{e["type"]}">{entity_text}</span>'
|
| 1265 |
+
)
|
| 1266 |
+
last_end = e["end"]
|
| 1267 |
+
if last_end < len(text):
|
| 1268 |
+
html_parts.append(text[last_end:].replace("<", "<").replace(">", ">"))
|
| 1269 |
+
return f'<div style="font-family:ui-monospace,monospace;font-size:13px;line-height:1.8;white-space:pre-wrap;padding:16px;">{"".join(html_parts)}</div>'
|