Spaces:

gaurv007
/

ClauseGuard

Sleeping

App Files Community

gaurv007 committed 17 days ago

Commit

3d79dff

verified ·

1 Parent(s): 25d0271

v3.0: Fix API - use shared modules, fix schema mismatch, add rate limiting, fix CORS

Browse files

Files changed (1) hide show

api/main.py +1 -667

api/main.py CHANGED Viewed

@@ -1,667 +1 @@
-"""
-ClauseGuard — FastAPI Backend v2.0
-══════════════════════════════════
-Features:
-  • 41 CUAD clause categories via fine-tuned Legal-BERT
-  • 4-tier risk scoring (Critical / High / Medium / Low)
-  • Legal NER: parties, dates, monetary values, jurisdictions, defined terms
-  • NLI contradiction & missing-clause detection
-  • Contract comparison engine
-  • Obligation tracker
-  • Compliance checker (GDPR, CCPA, SOX, HIPAA, FINRA)
-"""
-import os
-import re
-import json
-import time
-from contextlib import asynccontextmanager
-from typing import Optional
-from collections import defaultdict
-from datetime import datetime
-import httpx
-import numpy as np
-from fastapi import FastAPI, HTTPException, Depends, Body
-from fastapi.middleware.cors import CORSMiddleware
-from pydantic import BaseModel, Field
-from auth import get_current_user, require_auth
-# ─── Config ───
-MODEL_PATH = os.environ.get("MODEL_PATH", "./clauseguard-model/final")
-ONNX_MODEL_PATH = os.environ.get("ONNX_MODEL_PATH", "./clauseguard-model-onnx")
-USE_ONNX = os.environ.get("USE_ONNX", "true").lower() == "true"
-SUPABASE_URL = os.environ.get("SUPABASE_URL", "")
-SUPABASE_SERVICE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY", "")
-HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
-SAULLM_ENDPOINT = os.environ.get("SAULLM_ENDPOINT", "")
-# ─── CUAD Labels (41 categories) ───
-CUAD_LABELS = [
-    "Document Name", "Parties", "Agreement Date", "Effective Date",
-    "Expiration Date", "Renewal Term", "Governing Law", "Most Favored Nation",
-    "Non-Compete", "Exclusivity", "No-Solicit of Customers",
-    "No-Solicit of Employees", "Non-Disparagement",
-    "Termination for Convenience", "ROFR/ROFO/ROFN", "Change of Control",
-    "Anti-Assignment", "Revenue/Profit Sharing", "Price Restriction",
-    "Minimum Commitment", "Volume Restriction", "IP Ownership Assignment",
-    "Joint IP Ownership", "License Grant", "Non-Transferable License",
-    "Affiliate License-Licensor", "Affiliate License-Licensee",
-    "Unlimited/All-You-Can-Eat License", "Irrevocable or Perpetual License",
-    "Source Code Escrow", "Post-Termination Services", "Audit Rights",
-    "Uncapped Liability", "Cap on Liability", "Liquidated Damages",
-    "Warranty Duration", "Insurance", "Covenant Not to Sue",
-    "Third Party Beneficiary", "Other"
-]
-RISK_MAP = {
-    "Uncapped Liability": "CRITICAL", "Arbitration": "CRITICAL",
-    "IP Ownership Assignment": "CRITICAL", "Termination for Convenience": "CRITICAL",
-    "Limitation of liability": "CRITICAL", "Unilateral termination": "CRITICAL",
-    "Liquidated Damages": "CRITICAL",
-    "Non-Compete": "HIGH", "Exclusivity": "HIGH", "Change of Control": "HIGH",
-    "No-Solicit of Customers": "HIGH", "No-Solicit of Employees": "HIGH",
-    "Unilateral change": "HIGH", "Content removal": "HIGH", "Anti-Assignment": "HIGH",
-    "Governing Law": "MEDIUM", "Jurisdiction": "MEDIUM", "Choice of law": "MEDIUM",
-    "Price Restriction": "MEDIUM", "Minimum Commitment": "MEDIUM",
-    "Volume Restriction": "MEDIUM", "Non-Disparagement": "MEDIUM",
-    "Most Favored Nation": "MEDIUM", "Revenue/Profit Sharing": "MEDIUM",
-    "Warranty Duration": "MEDIUM",
-    "Document Name": "LOW", "Parties": "LOW", "Agreement Date": "LOW",
-    "Effective Date": "LOW", "Expiration Date": "LOW", "Renewal Term": "LOW",
-    "Joint IP Ownership": "LOW", "License Grant": "LOW",
-    "Non-Transferable License": "LOW", "Affiliate License-Licensor": "LOW",
-    "Affiliate License-Licensee": "LOW", "Unlimited/All-You-Can-Eat License": "LOW",
-    "Irrevocable or Perpetual License": "LOW", "Source Code Escrow": "LOW",
-    "Post-Termination Services": "LOW", "Audit Rights": "LOW",
-    "Cap on Liability": "LOW", "Insurance": "LOW",
-    "Covenant Not to Sue": "LOW", "Third Party Beneficiary": "LOW",
-    "Other": "LOW", "ROFR/ROFO/ROFN": "LOW", "Contract by using": "LOW",
-}
-DESC_MAP = {
-    "Limitation of liability": "Company limits or excludes liability for losses, data breaches, or service failures.",
-    "Unilateral termination": "Company can terminate your account at any time without reason.",
-    "Unilateral change": "Company can change terms at any time without your consent.",
-    "Content removal": "Company can delete your content without notice or justification.",
-    "Contract by using": "You are bound to the contract simply by using the service.",
-    "Choice of law": "Governing law may differ from your country, reducing your legal protections.",
-    "Jurisdiction": "Disputes must be resolved in a jurisdiction that may disadvantage you.",
-    "Arbitration": "Forces disputes to arbitration instead of court. You waive your right to sue.",
-    "Uncapped Liability": "No financial limit on damages the party may be liable for.",
-    "Cap on Liability": "Maximum financial liability is explicitly capped.",
-    "Non-Compete": "Restrictions on competing with the counter-party.",
-    "Exclusivity": "Obligation to deal exclusively with one party.",
-    "IP Ownership Assignment": "Intellectual property rights are transferred entirely.",
-    "Termination for Convenience": "Either party may terminate without cause or notice.",
-    "Governing Law": "Specifies which jurisdiction's laws apply.",
-    "Non-Disparagement": "Agreement not to speak negatively about the other party.",
-    "ROFR/ROFO/ROFN": "Right of First Refusal / Offer / Negotiation clause.",
-    "Change of Control": "Provisions triggered by ownership or control changes.",
-    "Anti-Assignment": "Restrictions on transferring contract rights to third parties.",
-    "Liquidated Damages": "Pre-determined damages amount for breach of contract.",
-    "Source Code Escrow": "Third-party holds source code for release under defined conditions.",
-    "Post-Termination Services": "Services to be provided after the contract ends.",
-    "Audit Rights": "Right to inspect records or verify compliance.",
-    "Warranty Duration": "Length of time warranties remain in effect.",
-    "Covenant Not to Sue": "Agreement not to bring legal action against a party.",
-    "Third Party Beneficiary": "Non-party who benefits from the contract terms.",
-    "Insurance": "Insurance coverage requirements.",
-    "Revenue/Profit Sharing": "Revenue or profit sharing arrangements between parties.",
-    "Price Restriction": "Restrictions on pricing or discounting.",
-    "Minimum Commitment": "Minimum purchase or usage commitment.",
-    "Volume Restriction": "Limits on volume of goods or services.",
-    "License Grant": "Permission to use intellectual property.",
-    "Non-Transferable License": "License that cannot be transferred to third parties.",
-    "Irrevocable or Perpetual License": "License that cannot be revoked or lasts indefinitely.",
-    "Unlimited/All-You-Can-Eat License": "License with no usage limits.",
-}
-RISK_WEIGHTS = {"CRITICAL": 40, "HIGH": 20, "MEDIUM": 10, "LOW": 3}
-# ─── Regex patterns (fallback) ───
-REGEX_PATTERNS = {
-    "Limitation of liability": [r"not liable", r"shall not be (liable|responsible)", r"in no event.*liable", r"limitation of liability", r"without warranty", r"disclaim"],
-    "Unilateral termination": [r"terminat.*at any time", r"suspend.*account.*without", r"we may (terminat|suspend|discontinu)", r"right to (terminat|suspend)"],
-    "Unilateral change": [r"sole discretion", r"reserves? the right to (modify|change|update|amend)", r"at any time.*without (prior )?notice", r"we may (modify|change|update)"],
-    "Content removal": [r"remove.*content.*without", r"right to remove", r"we may.*remove"],
-    "Contract by using": [r"by (using|accessing).*you agree", r"continued use.*constitutes? acceptance"],
-    "Choice of law": [r"governed by.*laws? of", r"shall be governed", r"laws of the state of"],
-    "Jurisdiction": [r"exclusive jurisdiction", r"courts? of.*(california|delaware|new york|ireland|england)", r"submit to.*jurisdiction"],
-    "Arbitration": [r"arbitrat", r"binding arbitration", r"waive.*right.*court", r"class action waiver"],
-    "Governing Law": [r"governed by", r"laws of", r"jurisdiction of"],
-    "Termination for Convenience": [r"terminat.*for convenience", r"terminat.*without cause", r"terminat.*at any time"],
-    "Non-Compete": [r"non-compete", r"shall not compete", r"competition"],
-    "Exclusivity": [r"exclusive", r"exclusivity"],
-    "IP Ownership Assignment": [r"assign.*intellectual property", r"ownership of.*ip", r"all rights.*assign"],
-    "Uncapped Liability": [r"unlimited liability", r"uncapped", r"no.*limit.*liability"],
-    "Cap on Liability": [r"cap on liability", r"maximum liability", r"liability.*shall not exceed"],
-    "Indemnification": [r"indemnif", r"hold harmless", r"defend"],
-    "Confidentiality": [r"confidential", r"non-disclosure", r"nda"],
-    "Force Majeure": [r"force majeure", r"act of god", r"beyond.*control"],
-    "Penalties": [r"penalt", r"late fee", r"default charge", r"interest on overdue"],
-}
-# ─── Model Loading ───
-cuad_tokenizer = None
-cuad_model = None
-_HAS_TORCH = False
-try:
-    import torch
-    from transformers import AutoTokenizer, AutoModelForSequenceClassification
-    from peft import PeftModel
-    _HAS_TORCH = True
-except Exception:
-    pass
-def load_model():
-    global cuad_tokenizer, cuad_model, classifier
-    if not _HAS_TORCH:
-        print("[ClauseGuard] PyTorch not available")
-        return
-    try:
-        base = "nlpaueb/legal-bert-base-uncased"
-        adapter = "Mokshith31/legalbert-contract-clause-classification"
-        print(f"[ClauseGuard] Loading CUAD classifier: {adapter}")
-        cuad_tokenizer = AutoTokenizer.from_pretrained(base)
-        base_model = AutoModelForSequenceClassification.from_pretrained(
-            base, num_labels=41, ignore_mismatched_sizes=True
-        )
-        cuad_model = PeftModel.from_pretrained(base_model, adapter)
-        cuad_model.eval()
-        print("[ClauseGuard] CUAD model loaded successfully")
-    except Exception as e:
-        print(f"[ClauseGuard] CUAD model load failed: {e}")
-        cuad_tokenizer = None
-        cuad_model = None
-# ─── Supabase helper ───
-async def supabase_insert(table: str, data: dict):
-    if not SUPABASE_URL or not SUPABASE_SERVICE_KEY:
-        return
-    async with httpx.AsyncClient() as client:
-        await client.post(
-            f"{SUPABASE_URL}/rest/v1/{table}",
-            json=data,
-            headers={"apikey": SUPABASE_SERVICE_KEY, "Authorization": f"Bearer {SUPABASE_SERVICE_KEY}",
-                      "Content-Type": "application/json", "Prefer": "return=minimal"},
-        )
-async def supabase_query(table: str, params: dict, headers_extra: dict = {}):
-    if not SUPABASE_URL or not SUPABASE_SERVICE_KEY:
-        return []
-    async with httpx.AsyncClient() as client:
-        resp = await client.get(
-            f"{SUPABASE_URL}/rest/v1/{table}",
-            params=params,
-            headers={"apikey": SUPABASE_SERVICE_KEY, "Authorization": f"Bearer {SUPABASE_SERVICE_KEY}", **headers_extra},
-        )
-        return resp.json() if resp.status_code == 200 else []
-# ─── Clause Processing ───
-def split_clauses(text):
-    text = re.sub(r'\n{3,}', '\n\n', text.strip())
-    parts = re.split(r'(?<=[.!?])\s+(?=[A-Z0-9(])|(?:\n\n)(?=\d+[.)]\s|\([a-z]\)\s|[A-Z][A-Z\s]{2,})', text)
-    return [p.strip() for p in parts if len(p.strip()) > 30]
-def classify_regex(text):
-    text_lower = text.lower()
-    results = []
-    seen = set()
-    for label, patterns in REGEX_PATTERNS.items():
-        for pat in patterns:
-            if re.search(pat, text_lower):
-                if label not in seen:
-                    risk = RISK_MAP.get(label, "MEDIUM")
-                    results.append({
-                        "label": label,
-                        "confidence": 0.7,
-                        "risk": risk,
-                        "description": DESC_MAP.get(label, label),
-                    })
-                    seen.add(label)
-                break
-    return results
-def classify_cuad(clause_text):
-    if cuad_model is None or cuad_tokenizer is None:
-        return classify_regex(clause_text)
-    try:
-        inputs = cuad_tokenizer(clause_text, return_tensors="pt", truncation=True, max_length=256, padding=True)
-        with torch.no_grad():
-            logits = cuad_model(**inputs).logits
-        probs = torch.softmax(logits, dim=-1)[0]
-        threshold = 0.15
-        results = []
-        for i, prob in enumerate(probs):
-            if prob > threshold and i < len(CUAD_LABELS):
-                label = CUAD_LABELS[i]
-                results.append({
-                    "label": label,
-                    "confidence": round(float(prob), 3),
-                    "risk": RISK_MAP.get(label, "LOW"),
-                    "description": DESC_MAP.get(label, label),
-                })
-        results.sort(key=lambda x: x["confidence"], reverse=True)
-        if not results:
-            top_idx = int(probs.argmax())
-            label = CUAD_LABELS[top_idx] if top_idx < len(CUAD_LABELS) else "Other"
-            results.append({
-                "label": label,
-                "confidence": round(float(probs[top_idx]), 3),
-                "risk": RISK_MAP.get(label, "LOW"),
-                "description": DESC_MAP.get(label, label),
-            })
-        return results
-    except Exception:
-        return classify_regex(clause_text)
-# ─── NER ───
-def extract_entities(text):
-    entities = []
-    # Dates
-    for pat, etype in [
-        (r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b', "DATE"),
-        (r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', "DATE"),
-        (r'\b\d{1,2}-\d{1,2}-\d{2,4}\b', "DATE"),
-        (r'\b(?:Effective|Commencement|Expiration|Termination)\s+Date\b', "DATE_REF"),
-    ]:
-        for m in re.finditer(pat, text, re.IGNORECASE):
-            entities.append({"text": m.group(), "type": etype, "start": m.start(), "end": m.end()})
-    # Money
-    for pat, etype in [
-        (r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?(?:\s*(?:million|billion|thousand|M|B|K))?', "MONEY"),
-        (r'\b\d{1,3}(?:,\d{3})*(?:\.\d{2})?\s*(?:USD|EUR|GBP|dollars|euros)', "MONEY"),
-    ]:
-        for m in re.finditer(pat, text, re.IGNORECASE):
-            entities.append({"text": m.group(), "type": etype, "start": m.start(), "end": m.end()})
-    # Parties
-    for pat, etype in [
-        (r'\b[A-Z][A-Za-z0-9\s&]+(?:Inc\.|LLC|Ltd\.|Limited|Corp\.|Corporation|PLC|GmbH|AG|S\.A\.|B\.V\.)\b', "PARTY"),
-        (r'\b(?:Party A|Party B|Disclosing Party|Receiving Party|Licensor|Licensee|Buyer|Seller|Tenant|Landlord|Employer|Employee|Company|Customer|Vendor|Client)\b', "PARTY_ROLE"),
-    ]:
-        for m in re.finditer(pat, text):
-            entities.append({"text": m.group(), "type": etype, "start": m.start(), "end": m.end()})
-    # Jurisdictions
-    for pat, etype in [
-        (r'\b(?:State|Laws?) of [A-Z][a-zA-Z\s]+', "JURISDICTION"),
-        (r'\b(?:California|Delaware|New York|Texas|Florida|England|Ireland|Germany|France|Singapore|Hong Kong)\b', "JURISDICTION"),
-    ]:
-        for m in re.finditer(pat, text, re.IGNORECASE):
-            entities.append({"text": m.group(), "type": etype, "start": m.start(), "end": m.end()})
-    # Defined Terms
-    for pat, etype in [
-        (r'"([A-Z][A-Z\s]+)"', "DEFINED_TERM"),
-        (r'\(([A-Z][A-Z\s]+)\)', "DEFINED_TERM"),
-    ]:
-        for m in re.finditer(pat, text):
-            entities.append({"text": m.group(1), "type": etype, "start": m.start(), "end": m.end()})
-    # Deduplicate
-    entities.sort(key=lambda x: (x["start"], -(x["end"] - x["start"])))
-    filtered = []
-    last_end = -1
-    for e in entities:
-        if e["start"] >= last_end:
-            filtered.append(e)
-            last_end = e["end"]
-    return filtered
-# ─── Contradictions ───
-CONTRADICTION_PAIRS = [
-    (["Uncapped Liability", "unlimited liability"], ["Cap on Liability", "cap on liability"],
-     "Liability cannot be both uncapped and capped simultaneously."),
-    (["Governing Law"], ["Governing Law"],
-     "Multiple governing law provisions detected — verify consistency."),
-    (["Termination for Convenience", "terminat.*convenience"], ["Fixed Term", "fixed term"],
-     "Contract has both fixed term and termination for convenience — review carefully."),
-    (["IP Ownership Assignment", "assign.*ip"], ["Joint IP Ownership", "joint ownership"],
-     "IP cannot be both fully assigned and jointly owned."),
-]
-def detect_contradictions(clause_results):
-    contradictions = []
-    labels_found = set()
-    for cr in clause_results:
-        labels_found.add(cr["label"])
-    for group_a, group_b, explanation in CONTRADICTION_PAIRS:
-        found_a = any(l in labels_found for l in group_a)
-        found_b = any(l in labels_found for l in group_b)
-        if found_a and found_b:
-            contradictions.append({"type": "CONTRADICTION", "explanation": explanation, "severity": "HIGH", "clauses": list(set(group_a + group_b))})
-    for cc in ["Governing Law", "Termination for Convenience", "Limitation of liability", "Arbitration"]:
-        if cc not in labels_found:
-            contradictions.append({"type": "MISSING", "explanation": f"Critical clause '{cc}' not detected.", "severity": "MEDIUM", "clauses": [cc]})
-    return contradictions
-# ─── Risk Scoring ───
-def compute_risk_score(clause_results, total_clauses):
-    sev_counts = {"CRITICAL": 0, "HIGH": 0, "MEDIUM": 0, "LOW": 0}
-    for cr in clause_results:
-        sev = cr.get("risk", "LOW")
-        sev_counts[sev] += 1
-    if total_clauses == 0:
-        return 0, "A", sev_counts
-    weighted = sum(sev_counts[s] * RISK_WEIGHTS[s] for s in sev_counts)
-    risk = min(100, round(weighted / max(1, total_clauses) * 10))
-    if risk >= 70: grade = "F"
-    elif risk >= 50: grade = "D"
-    elif risk >= 30: grade = "C"
-    elif risk >= 15: grade = "B"
-    else: grade = "A"
-    return risk, grade, sev_counts
-# ─── Obligations ───
-OBLIGATION_PATTERNS = {
-    "monetary": [r"(?:shall|must|will|agrees? to)\s+pay\s+(?:\$?[\d,]+)", r"(?:fee|payment|compensation|reimburs(?:e|ement))\s+of\s+(?:\$?[\d,]+)", r"(?:shall|must|will)\s+remit\s+(?:\$?[\d,]+)", r"(?:annual|monthly|quarterly)\s+(?:fee|payment)\s+of", r"(?:liquidated damages|penalty)\s+of\s+(?:\$?[\d,]+)"],
-    "compliance": [r"(?:shall|must|will)\s+comply\s+with", r"(?:shall|must|will)\s+adhere\s+to", r"(?:shall|must|will)\s+conform\s+to", r"(?:GDPR|CCPA|HIPAA|SOX|PCI-DSS|ISO\s+\d+)", r"(?:confidential|privacy|data protection)", r"(?:shall|must|will)\s+maintain\s+(?:insurance|coverage|bond)"],
-    "reporting": [r"(?:shall|must|will)\s+report", r"(?:shall|must|will)\s+provide\s+(?:regular|monthly|quarterly|annual)\s+(?:reports?|updates?|status)", r"(?:shall|must|will)\s+notify", r"(?:shall|must|will)\s+inform"],
-    "delivery": [r"(?:shall|must|will)\s+deliver", r"(?:shall|must|will)\s+provide", r"(?:shall|must|will)\s+furnish", r"(?:shall|must|will)\s+supply", r"(?:shall|must|will)\s+submit"],
-    "termination": [r"(?:shall|must|will)\s+return", r"(?:shall|must|will)\s+destroy", r"(?:shall|must|will)\s+cease", r"(?:upon|after)\s+termination"],
-}
-def extract_obligations(text):
-    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
-    obligations = []
-    for sentence in sentences:
-        sentence = sentence.strip()
-        if len(sentence) < 30:
-            continue
-        found_types = set()
-        for otype, patterns in OBLIGATION_PATTERNS.items():
-            for pat in patterns:
-                if re.search(pat, sentence, re.IGNORECASE):
-                    found_types.add(otype)
-                    break
-        if not found_types:
-            continue
-        party = "Unknown"
-        for pp in [r'\b(?:Party A|Party B|Disclosing Party|Receiving Party|Licensor|Licensee|Buyer|Seller|Tenant|Landlord|Employer|Employee|Company|Customer|Vendor|Client)\b', r'\b[A-Z][A-Za-z0-9\s&]+(?:Inc\.|LLC|Ltd\.|Limited|Corp\.|Corporation|PLC|GmbH|AG|S\.A\.|B\.V\.)\b']:
-            m = re.search(pp, sentence)
-            if m:
-                party = m.group(0)
-                break
-        deadline = "Not specified"
-        for pat, ptype in [
-            (r"within\s+(\d+)\s+(day|week|month|year)s?", "relative"),
-            (r"no\s+later\s+than\s+(\d+)\s+(day|week|month|year)s?", "relative"),
-            (r"within\s+(\d+)\s+business\s+days?", "business_days"),
-            (r"by\s+([A-Z][a-z]+\s+\d{1,2},?\s+\d{4})", "absolute"),
-            (r"on\s+or\s+before\s+([A-Z][a-z]+\s+\d{1,2},?\s+\d{4})", "absolute"),
-        ]:
-            m = re.search(pat, sentence, re.IGNORECASE)
-            if m:
-                deadline = m.group(0)
-                break
-        for otype in found_types:
-            obligations.append({"type": otype, "party": party, "description": sentence[:250] + ("..." if len(sentence) > 250 else ""), "deadline": deadline})
-    return obligations
-# ─── Compliance ───
-REGULATIONS = {
-    "GDPR": {
-        "description": "EU General Data Protection Regulation (Regulation 2016/679)",
-        "requirements": {
-            "lawful_basis": {"keywords": ["lawful basis", "legal basis", "legitimate interest", "consent", "performance of contract", "legal obligation"], "description": "Must specify lawful basis for data processing (Art. 6)", "severity": "HIGH"},
-            "data_subject_rights": {"keywords": ["right to access", "right to erasure", "right to be forgotten", "data portability", "rectification", "object to processing"], "description": "Must acknowledge data subject rights (Arts. 15-22)", "severity": "HIGH"},
-            "data_breach_notification": {"keywords": ["data breach", "breach notification", "notify supervisory authority", "72 hours"], "description": "Must include data breach notification obligations (Art. 33)", "severity": "MEDIUM"},
-            "cross_border_transfer": {"keywords": ["standard contractual clauses", "SCCs", "adequacy decision", "transfer mechanism", "third country"], "description": "Must specify transfer safeguards for cross-border data (Arts. 44-49)", "severity": "HIGH"},
-        },
-    },
-    "CCPA": {
-        "description": "California Consumer Privacy Act (Cal. Civ. Code § 1798.100 et seq.)",
-        "requirements": {
-            "consumer_rights": {"keywords": ["right to know", "right to delete", "right to opt out", "right to non-discrimination", "consumer rights"], "description": "Must acknowledge California consumer rights", "severity": "HIGH"},
-            "data_categories": {"keywords": ["categories of personal information", "personal information categories", "identifiers", "commercial information"], "description": "Must disclose categories of personal information collected", "severity": "HIGH"},
-            "sale_of_data": {"keywords": ["do not sell my personal information", "opt-out of sale", "sale of personal information"], "description": "Must provide opt-out mechanism for data sales", "severity": "HIGH"},
-        },
-    },
-    "SOX": {
-        "description": "Sarbanes-Oxley Act (US, 2002)",
-        "requirements": {
-            "internal_controls": {"keywords": ["internal controls", "internal control over financial reporting", "ICFR"], "description": "Must reference internal controls over financial reporting (§ 404)", "severity": "HIGH"},
-            "whistleblower": {"keywords": ["whistleblower", "anonymous reporting", "reporting hotline", "retaliation"], "description": "Should protect whistleblower provisions (§ 806)", "severity": "HIGH"},
-            "document_retention": {"keywords": ["document retention", "record retention", "retention policy", "preserve records"], "description": "Must include document retention obligations (§ 802)", "severity": "HIGH"},
-        },
-    },
-    "HIPAA": {
-        "description": "Health Insurance Portability and Accountability Act (US, 1996)",
-        "requirements": {
-            "phi_protection": {"keywords": ["protected health information", "PHI", "health information", "ePHI"], "description": "Must protect PHI and limit uses/disclosures", "severity": "CRITICAL"},
-            "security_safeguards": {"keywords": ["administrative safeguards", "technical safeguards", "physical safeguards", "encryption", "access controls"], "description": "Must implement security safeguards (§ 164.308-312)", "severity": "HIGH"},
-            "breach_notification": {"keywords": ["breach notification", "notification of breach", "unauthorized access"], "description": "Must include breach notification obligations (§ 164.400-414)", "severity": "HIGH"},
-        },
-    },
-    "FINRA": {
-        "description": "Financial Industry Regulatory Authority (US)",
-        "requirements": {
-            "recordkeeping": {"keywords": ["recordkeeping", "books and records", "retain records", "SEC Rule 17a-4"], "description": "Must comply with recordkeeping rules (FINRA Rule 4511)", "severity": "HIGH"},
-            "anti_money_laundering": {"keywords": ["anti-money laundering", "AML", "suspicious activity", "SAR", "OFAC"], "description": "Must reference AML compliance (FINRA Rule 3310)", "severity": "HIGH"},
-            "privacy": {"keywords": ["privacy policy", "customer information", "Regulation S-P", "nonpublic personal information"], "description": "Must protect customer information (Regulation S-P)", "severity": "HIGH"},
-        },
-    },
-}
-def check_compliance(text):
-    text_lower = text.lower()
-    results = {}
-    for reg_name, reg_data in REGULATIONS.items():
-        checks = []
-        for req_name, req_data in reg_data["requirements"].items():
-            matched = False
-            matched_keywords = []
-            for kw in req_data["keywords"]:
-                if kw.lower() in text_lower:
-                    matched = True
-                    matched_keywords.append(kw)
-            checks.append({"requirement": req_name, "description": req_data["description"], "severity": req_data["severity"], "status": "PASS" if matched else "MISSING", "matched_keywords": matched_keywords})
-        passed = sum(1 for c in checks if c["status"] == "PASS")
-        total = len(checks)
-        compliance_rate = round(passed / total * 100) if total > 0 else 0
-        results[reg_name] = {"description": reg_data["description"], "compliance_rate": compliance_rate, "checks": checks, "overall_status": "COMPLIANT" if compliance_rate >= 80 else "PARTIAL" if compliance_rate >= 40 else "NON-COMPLIANT"}
-    return results
-# ─── Comparison ───
-from difflib import SequenceMatcher
-def _normalize(text):
-    text = text.lower()
-    text = re.sub(r'[^a-z0-9\s]', ' ', text)
-    text = re.sub(r'\s+', ' ', text).strip()
-    return text
-def _clause_type(text):
-    text_lower = text.lower()
-    type_keywords = {
-        "governing law": ["govern", "law", "jurisdiction"],
-        "termination": ["terminat", "cancel", "end"],
-        "indemnification": ["indemnif", "hold harmless"],
-        "confidentiality": ["confidential", "non-disclosure"],
-        "liability": ["liability", "liable", "damages"],
-        "payment": ["payment", "fee", "price", "compensat"],
-        "intellectual property": ["intellectual", "ip", "copyright", "patent"],
-        "warranty": ["warrant", "guarantee"],
-        "force majeure": ["force majeure", "act of god"],
-        "arbitration": ["arbitrat", "mediation"],
-        "assignment": ["assign", "transfer"],
-        "non-compete": ["compete", "competition"],
-        "renewal": ["renew", "extend"],
-    }
-    for ctype, keywords in type_keywords.items():
-        if any(kw in text_lower for kw in keywords):
-            return ctype
-    return "general"
-def compare_contracts(text_a, text_b):
-    clauses_a = split_clauses(text_a)
-    clauses_b = split_clauses(text_b)
-    matched_a = set()
-    matched_b = set()
-    modified = []
-    for i, ca in enumerate(clauses_a):
-        best_sim, best_j = 0, -1
-        for j, cb in enumerate(clauses_b):
-            if j in matched_b:
-                continue
-            sim = SequenceMatcher(None, _normalize(ca), _normalize(cb)).ratio()
-            if sim > best_sim:
-                best_sim = sim
-                best_j = j
-        if best_sim >= 0.75:
-            matched_a.add(i)
-            matched_b.add(best_j)
-            if best_sim < 0.95:
-                modified.append({"type": "modified", "similarity": round(best_sim, 3), "clause_a": ca[:200], "clause_b": clauses_b[best_j][:200], "clause_type": _clause_type(ca)})
-        elif best_sim >= 0.45:
-            modified.append({"type": "partial", "similarity": round(best_sim, 3), "clause_a": ca[:200], "clause_b": clauses_b[best_j][:200] if best_j >= 0 else "", "clause_type": _clause_type(ca)})
-    removed = [clauses_a[i] for i in range(len(clauses_a)) if i not in matched_a]
-    added = [clauses_b[j] for j in range(len(clauses_b)) if j not in matched_b]
-    total_pairs = max(len(clauses_a), len(clauses_b))
-    alignment = len(matched_a) / total_pairs if total_pairs > 0 else 0.0
-    risk_keywords = ["unlimited", "unilateral", "waive", "arbitration", "indemnif", "not liable", "no warranty", "sole discretion"]
-    risk_a = sum(1 for kw in risk_keywords if kw in text_a.lower())
-    risk_b = sum(1 for kw in risk_keywords if kw in text_b.lower())
-    if risk_a > risk_b + 2:
-        risk_delta, risk_winner = "Contract A is significantly riskier", "B"
-    elif risk_b > risk_a + 2:
-        risk_delta, risk_winner = "Contract B is significantly riskier", "A"
-    else:
-        risk_delta, risk_winner = "Similar risk profiles", "tie"
-    return {
-        "alignment_score": round(alignment, 3),
-        "contract_a_clauses": len(clauses_a), "contract_b_clauses": len(clauses_b),
-        "added_clauses": [{"text": c[:200], "type": _clause_type(c)} for c in added[:50]],
-        "removed_clauses": [{"text": c[:200], "type": _clause_type(c)} for c in removed[:50]],
-        "modified_clauses": modified[:50],
-        "risk_delta": risk_delta, "risk_winner": risk_winner,
-        "type_map_a": {k: len(v) for k, v in defaultdict(list, [("general", [])]).items()},
-        "type_map_b": {k: len(v) for k, v in defaultdict(list, [("general", [])]).items()},
-    }
-# ─── Models ───
-class AnalyzeRequest(BaseModel):
-    text: str = Field(..., min_length=50)
-    source_url: Optional[str] = None
-class AnalyzeResponse(BaseModel):
-    risk_score: int
-    grade: str
-    total_clauses: int
-    flagged_count: int
-    results: list[dict]
-    entities: list[dict]
-    contradictions: list[dict]
-    obligations: list[dict]
-    compliance: dict
-    model: str
-    latency_ms: int
-class CompareRequest(BaseModel):
-    text_a: str = Field(..., min_length=50)
-    text_b: str = Field(..., min_length=50)
-class ExplainRequest(BaseModel):
-    clause: str = Field(..., min_length=10, max_length=2000)
-    category: str
-class ExplainResponse(BaseModel):
-    clause: str
-    category: str
-    explanation: str
-    legal_basis: str
-    recommendation: str
-# ─── App ───
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    load_model()
-    yield
-app = FastAPI(title="ClauseGuard API", version="2.0.0", lifespan=lifespan)
-# CORS policy.
-# * Origins are listed explicitly: combining the "*" wildcard with
-#   allow_credentials=True lets any website make credentialed requests,
-#   so the wildcard entry is dropped.
-# * The Netlify origin was listed twice; it appears once here.
-# * allow_origins entries are compared as literal strings, so
-#   "chrome-extension://*" never matched a real extension origin.
-#   Browser-extension origins are matched with allow_origin_regex instead.
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=[
-        "https://clauseguardweb.netlify.app",
-        "http://localhost:3000",
-    ],
-    allow_origin_regex=r"chrome-extension://.+",
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-@app.get("/health")
-async def health():
-    # Health probe: reports "ml" when cuad_model is truthy and "regex"
-    # otherwise, together with the API version.
-    backend = "regex"
-    if cuad_model:
-        backend = "ml"
-    return {"status": "ok", "model": backend, "version": "2.0.0"}
-@app.post("/api/analyze", response_model=AnalyzeResponse)
-async def analyze(req: AnalyzeRequest, user: Optional[dict] = Depends(get_current_user)):
-    """Analyze a contract and return its risk report.
-
-    Splits the text into clauses, classifies each clause, then runs entity
-    extraction, contradiction detection, risk scoring, obligation extraction
-    and the compliance check. When ``user`` is truthy the report is also
-    written to the ``analyses`` table.
-
-    Raises:
-        HTTPException: 400 when no clauses are detected in the document.
-    """
-    # perf_counter() is monotonic, so the measured latency cannot be skewed
-    # (or go negative) if the wall clock is adjusted during the request.
-    started = time.perf_counter()
-    clauses = split_clauses(req.text)
-    if not clauses:
-        raise HTTPException(status_code=400, detail="No clauses detected in document")
-    # One entry per (clause, prediction) pair; clauses with no predictions
-    # contribute nothing. "or ()" keeps the original tolerance for a falsy
-    # (None / empty) classifier result.
-    clause_results = []
-    for clause in clauses:
-        for pred in classify_cuad(clause) or ():
-            clause_results.append({
-                "text": clause,
-                "label": pred["label"],
-                "confidence": pred["confidence"],
-                "risk": pred["risk"],
-                "description": pred["description"],
-            })
-    entities = extract_entities(req.text)
-    contradictions = detect_contradictions(clause_results)
-    # The third element (per-severity counts) is not used by this endpoint.
-    risk, grade, _sev_counts = compute_risk_score(clause_results, len(clauses))
-    obligations = extract_obligations(req.text)
-    compliance = check_compliance(req.text)
-    # Latency covers the analysis only; the optional database write below is excluded.
-    latency = int((time.perf_counter() - started) * 1000)
-    # A clause counts as flagged once, however many categories matched it.
-    flagged_count = len({cr["text"] for cr in clause_results})
-    results_for_db = [
-        {
-            "text": cr["text"],
-            "categories": [{
-                "name": cr["label"],
-                "severity": cr["risk"],
-                "confidence": cr["confidence"],
-                "description": cr["description"],
-            }],
-        }
-        for cr in clause_results
-    ]
-    if user:
-        await supabase_insert("analyses", {
-            "user_id": user["id"], "source_url": req.source_url, "total_clauses": len(clauses),
-            "flagged_count": flagged_count, "risk_score": risk, "grade": grade,
-            "clauses": results_for_db, "entities": entities, "contradictions": contradictions,
-            "obligations": obligations, "compliance": compliance,
-        })
-    return AnalyzeResponse(
-        risk_score=risk, grade=grade, total_clauses=len(clauses),
-        flagged_count=flagged_count,
-        results=results_for_db, entities=entities, contradictions=contradictions,
-        obligations=obligations, compliance=compliance,
-        model="ml" if cuad_model else "regex", latency_ms=latency,
-    )
-@app.post("/api/compare")
-async def compare(req: CompareRequest):
-    # Thin wrapper: hands both texts to compare_contracts() and returns its
-    # result unchanged.
-    return compare_contracts(req.text_a, req.text_b)
-@app.post("/api/explain", response_model=ExplainResponse)
-async def explain(req: ExplainRequest, user: dict = Depends(require_auth)):
-    """Explain why a clause may be problematic (authenticated users only).
-
-    Starts from static fallbacks: the DESC_MAP description for the category
-    plus generic legal-basis and recommendation strings. When both
-    SAULLM_ENDPOINT and HF_API_TOKEN are configured, the LLM is asked for a
-    richer answer. Any failure of that call is logged and the fallbacks are
-    returned, so the endpoint never fails because of the LLM.
-    """
-    import logging  # local import: used only for the best-effort LLM failure log
-    desc = DESC_MAP.get(req.category, "Unknown category.")
-    legal = "Consult local consumer protection laws."
-    recommendation = "Review this clause carefully. Consider negotiating or seeking legal advice before agreeing."
-    if SAULLM_ENDPOINT and HF_API_TOKEN:
-        prompt = f"You are a consumer protection legal analyst. Analyze this clause and explain why it may be unfair.\n\nClause: \"{req.clause}\"\nCategory: {req.category}\n\nProvide:\n1. A plain-English explanation\n2. The specific legal basis\n3. A practical recommendation\n\nBe concise. 3-4 sentences per section."
-        try:
-            async with httpx.AsyncClient(timeout=30.0) as client:
-                resp = await client.post(SAULLM_ENDPOINT, json={"inputs": prompt, "parameters": {"max_new_tokens": 300, "temperature": 0.3}}, headers={"Authorization": f"Bearer {HF_API_TOKEN}"})
-                if resp.status_code == 200:
-                    output = resp.json()
-                    # The payload may be a list of generations or a single object;
-                    # an empty list is treated as "no output" instead of raising.
-                    if isinstance(output, list):
-                        output = output[0] if output else {}
-                    generated = (output.get("generated_text") or "") if isinstance(output, dict) else ""
-                    # Some text-generation endpoints echo the prompt in front of the
-                    # completion; strip it so the prompt's own paragraphs are never
-                    # returned as the explanation.
-                    generated = generated.removeprefix(prompt).strip()
-                    if len(generated) > 50:
-                        # Sections are expected to be separated by blank lines:
-                        # explanation, legal basis, recommendation, in that order.
-                        parts = [p.strip() for p in generated.split("\n\n") if p.strip()]
-                        desc = parts[0]
-                        if len(parts) > 1:
-                            legal = parts[1]
-                        if len(parts) > 2:
-                            recommendation = parts[2]
-                else:
-                    logging.getLogger(__name__).warning(
-                        "Explain LLM endpoint returned HTTP %s; using fallback text", resp.status_code
-                    )
-        except Exception:
-            # Deliberately best-effort: keep the fallbacks, but record the failure
-            # instead of swallowing it silently.
-            logging.getLogger(__name__).warning(
-                "Explain LLM call failed; using fallback text", exc_info=True
-            )
-    return ExplainResponse(clause=req.clause, category=req.category, explanation=desc, legal_basis=legal, recommendation=recommendation)
-@app.get("/api/history")
-async def history(user: dict = Depends(require_auth), limit: int = 20, offset: int = 0):
-    """Return the authenticated user's saved analyses, newest first.
-
-    Args:
-        user: Authenticated user injected by ``require_auth``; ``user["id"]`` filters the rows.
-        limit: Page size, clamped to the range 0..100.
-        offset: Number of rows to skip; negative values are treated as 0.
-
-    Returns:
-        A dict with the rows under ``"analyses"`` plus the effective ``limit`` and ``offset``.
-    """
-    # Clamp both ends: the original only capped the upper bound, so negative
-    # values were forwarded to the database query as-is.
-    limit = max(0, min(limit, 100))
-    offset = max(offset, 0)
-    data = await supabase_query("analyses", {"user_id": f"eq.{user['id']}", "select": "*", "order": "created_at.desc", "limit": str(limit), "offset": str(offset)})
-    return {"analyses": data, "limit": limit, "offset": offset}
-# Script entry point: serve the app with uvicorn when this file is run directly.
-if __name__ == "__main__":
-    # Imported here, so uvicorn is only required when running as a script.
-    import uvicorn
-    # Listen on all interfaces, port 8000.
-    uvicorn.run(app, host="0.0.0.0", port=8000)


1	+ /app/clauseguard/api/main.py