"""
build_pinecone_legal.py
Builds the legal framework vector DB in Pinecone using bge-small-en-v1.5 (384 dims).
Sources: constitution_qa.json, ipc_sections.csv, bsa_sections.csv, crpc_sections.csv

Before running:
1. pip install pinecone-client sentence-transformers tqdm torch
2. Place all 4 source files in the ./constitution/ folder
3. Provide your Pinecone API key as PINECONE_API_KEY (see the Configuration section below)

Run with: python build_pinecone_legal.py
"""

import os
import csv
import json
import time
from pathlib import Path
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
import torch

# ── Configuration ─────────────────────────────────────────────────────────────
# The API key is read from the environment so the secret never lives in source
# control. Set it before running:  export PINECONE_API_KEY="..."
# SECURITY: a literal key used to be committed on this line — treat that key as
# compromised and rotate it in the Pinecone console.
PINECONE_API_KEY  = os.environ.get("PINECONE_API_KEY", "")
PINECONE_INDEX    = "legal-framework"
PINECONE_CLOUD    = "aws"
PINECONE_REGION   = "us-east-1"

CONSTITUTION_DIR  = "./constitution"
LOCAL_MODEL_DIR   = "./models/bge-small"            # shared with judgements builder
EMBED_MODEL_NAME  = "BAAI/bge-small-en-v1.5"
DEVICE            = "cuda" if torch.cuda.is_available() else "cpu"
UPSERT_BATCH      = 100                             # vectors sent per upsert call
# NOTE(review): the BGE model card presents this instruction as a query-side
# prefix; this script also prepends it to the indexed documents. Confirm the
# query path uses the same convention before changing it, since the stored
# vectors depend on it.
BGE_PREFIX        = "Represent this sentence for searching relevant passages: "

# File names inside CONSTITUTION_DIR
CONSTITUTION_FILE = "constitution_qa.json"
IPC_FILE          = "ipc_sections.csv"
BSA_FILE          = "bsa_sections.csv"
CRPC_FILE         = "crpc_sections.csv"
# ─────────────────────────────────────────────────────────────────────────────


def load_model() -> SentenceTransformer:
    """Return the embedding model on ``DEVICE``, caching it under ``LOCAL_MODEL_DIR``.

    If ``LOCAL_MODEL_DIR`` is a non-empty directory the model is loaded from
    it. Otherwise ``EMBED_MODEL_NAME`` is downloaded, saved to that directory
    for later runs, and the freshly downloaded instance is returned directly
    (it is not re-read from disk).

    Returns:
        The loaded ``SentenceTransformer`` placed on ``DEVICE``.
    """
    local = Path(LOCAL_MODEL_DIR)
    if local.is_dir() and any(local.iterdir()):
        print(f"✅ Loading bge-small from '{LOCAL_MODEL_DIR}'")
        model = SentenceTransformer(str(local), device=DEVICE)
    else:
        print(f"📥 Downloading {EMBED_MODEL_NAME} (~130 MB)…")
        local.mkdir(parents=True, exist_ok=True)
        model = SentenceTransformer(EMBED_MODEL_NAME, device=DEVICE)
        # Persist the weights so the next run takes the local branch above.
        model.save(str(local))
        print(f"✅ Saved to '{LOCAL_MODEL_DIR}'")
    print(f"   Device: {DEVICE} | Dim: {model.get_sentence_embedding_dimension()}")
    return model


# ── Document loaders ──────────────────────────────────────────────────────────

def load_constitution(base: Path) -> list[dict]:
    """Load constitution Q&A pairs from ``CONSTITUTION_FILE`` under *base*.

    The file is parsed as JSON and is expected to hold a list of objects with
    ``"question"`` and ``"answer"`` keys. Entries without a non-empty answer
    are skipped. JSON ``null`` values and non-object entries are tolerated
    instead of raising.

    Args:
        base: Directory containing the constitution JSON file.

    Returns:
        One document dict per kept entry, with keys ``id``, ``text``,
        ``source``, ``type`` and ``section``. Empty when the file is missing
        or does not contain a list.
    """
    path = base / CONSTITUTION_FILE
    if not path.exists():
        print(f"⚠️  Not found: {path}")
        return []
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        print(f"⚠️  Expected a JSON list in {path}, got {type(data).__name__}")
        return []

    docs = []
    for i, item in enumerate(data):
        if not isinstance(item, dict):
            continue
        # `or ""` covers both a missing key and an explicit JSON null.
        q = str(item.get("question") or "").strip()
        a = str(item.get("answer") or "").strip()
        if not a:
            continue
        docs.append({
            # The index is the position in the file, so IDs stay stable even
            # when earlier entries are skipped.
            "id":      f"const_{i}",
            "text":    f"Q: {q}\nA: {a}" if q else a,
            "source":  "Indian Constitution",
            "type":    "constitution_qa",
            "section": "",
        })
    print(f"   📜 Constitution: {len(docs)} Q&A pairs")
    return docs


def load_ipc(base: Path) -> list[dict]:
    """Load Indian Penal Code sections from ``IPC_FILE`` under *base*.

    Reads the ``Section``, ``Description``, ``Offense`` and ``Punishment``
    columns. Rows with an empty description are skipped. Rows shorter than
    the header are handled without raising.

    Args:
        base: Directory containing the IPC CSV file.

    Returns:
        One document dict per kept row, with keys ``id``, ``text``,
        ``source``, ``type`` and ``section``. Empty when the file is missing.
    """
    path = base / IPC_FILE
    if not path.exists():
        print(f"⚠️  Not found: {path}")
        return []

    def cell(row: dict, key: str) -> str:
        # DictReader stores None for cells that a short row does not provide,
        # so a plain .get(key, "") default is not enough.
        return (row.get(key) or "").strip()

    docs = []
    # utf-8-sig drops a leading BOM (otherwise the first header would read
    # "\ufeffSection") and decodes BOM-less UTF-8 files identically.
    with open(path, encoding="utf-8-sig", errors="replace") as f:
        for i, row in enumerate(csv.DictReader(f)):
            sec  = cell(row, "Section")
            desc = cell(row, "Description")
            off  = cell(row, "Offense")
            pun  = cell(row, "Punishment")
            if not desc:
                continue
            docs.append({
                "id":      f"ipc_{i}",
                "text":    f"Section: {sec}\nOffense: {off}\nPunishment: {pun}\n\n{desc}",
                "source":  "Indian Penal Code",
                "type":    "ipc_section",
                "section": sec,
            })
    print(f"   ⚖️  IPC: {len(docs)} sections")
    return docs


def load_generic_csv(base: Path, filename: str, source_name: str, id_prefix: str) -> list[dict]:
    """Load sections of an act from a CSV file under *base*.

    Reads the ``Section``, ``Section _name`` (or ``Section_name``),
    ``Chapter_name`` and ``Description`` columns. Rows with an empty
    description are skipped. Rows shorter than the header are handled
    without raising.

    Args:
        base: Directory containing the CSV file.
        filename: Name of the CSV file inside *base*.
        source_name: Human-readable act name stored in each document.
        id_prefix: Prefix for document IDs and for the ``type`` field.

    Returns:
        One document dict per kept row, with keys ``id``, ``text``,
        ``source``, ``type`` and ``section``. Empty when the file is missing.
    """
    path = base / filename
    if not path.exists():
        print(f"⚠️  Not found: {path}")
        return []

    def cell(row: dict, *keys: str) -> str:
        # Return the first non-empty value among *keys*. DictReader stores
        # None for cells a short row does not provide, hence the `or`.
        for key in keys:
            value = row.get(key)
            if value:
                return str(value).strip()
        return ""

    docs = []
    # utf-8-sig drops a leading BOM (otherwise the first header would read
    # "\ufeffSection") and decodes BOM-less UTF-8 files identically.
    with open(path, encoding="utf-8-sig", errors="replace") as f:
        for i, row in enumerate(csv.DictReader(f)):
            sec      = cell(row, "Section")
            # The header is spelled with a stray space in some files.
            sec_name = cell(row, "Section _name", "Section_name")
            chap     = cell(row, "Chapter_name")
            desc     = cell(row, "Description")
            if not desc:
                continue
            docs.append({
                "id":      f"{id_prefix}_{i}",
                "text":    f"Act: {source_name}\nChapter: {chap}\nSection {sec}: {sec_name}\n\n{desc}",
                "source":  source_name,
                "type":    f"{id_prefix}_section",
                "section": sec,
            })
    print(f"   📋 {source_name}: {len(docs)} sections")
    return docs


def load_all_docs() -> list[dict]:
    """Gather documents from every source file in ``CONSTITUTION_DIR``.

    Loaders run in a fixed order: constitution Q&A, IPC, BSA, then CrPC.

    Returns:
        The concatenated document dicts from all loaders.

    Raises:
        FileNotFoundError: If ``CONSTITUTION_DIR`` does not exist.
    """
    root = Path(CONSTITUTION_DIR)
    if not root.exists():
        raise FileNotFoundError(f"Constitution folder not found: '{CONSTITUTION_DIR}'")

    # List-display unpacking evaluates the loaders top to bottom, so the
    # resulting document order matches the order written here.
    combined = [
        *load_constitution(root),
        *load_ipc(root),
        *load_generic_csv(root, BSA_FILE, "Bharatiya Sakshya Adhiniyam 2023", "bsa"),
        *load_generic_csv(root, CRPC_FILE, "Code of Criminal Procedure 1973", "crpc"),
    ]
    print(f"\n   Total: {len(combined)} documents")
    return combined


# ── Pinecone connection ───────────────────────────────────────────────────────

def connect_pinecone(ready_timeout: float = 300.0):
    """Connect to Pinecone and return a handle to ``PINECONE_INDEX``.

    If the index does not exist it is created as a 384-dimension cosine
    serverless index in ``PINECONE_CLOUD`` / ``PINECONE_REGION``, and this
    function polls until the index reports ready.

    Args:
        ready_timeout: Maximum number of seconds to wait for a newly created
            index to become ready.

    Returns:
        The index handle returned by ``pc.Index(PINECONE_INDEX)``.

    Raises:
        TimeoutError: If a newly created index is not ready within
            *ready_timeout* seconds.
    """
    print("🔌 Connecting to Pinecone…")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing = [i.name for i in pc.list_indexes()]
    if PINECONE_INDEX not in existing:
        print(f"   Index '{PINECONE_INDEX}' not found — creating it…")
        pc.create_index(
            name=PINECONE_INDEX,
            dimension=384,  # embedding size named in the script header (bge-small-en-v1.5)
            metric="cosine",
            spec=ServerlessSpec(cloud=PINECONE_CLOUD, region=PINECONE_REGION)
        )
        # Bound the wait so a stuck provisioning request cannot hang forever.
        deadline = time.monotonic() + ready_timeout
        while not pc.describe_index(PINECONE_INDEX).status['ready']:
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Index '{PINECONE_INDEX}' not ready after {ready_timeout:.0f}s"
                )
            print("   Waiting for index to be ready…")
            time.sleep(3)
        print(f"   ✅ Index '{PINECONE_INDEX}' created.")
    index = pc.Index(PINECONE_INDEX)
    stats = index.describe_index_stats()
    print(f"✅ Connected | Existing vectors: {stats.total_vector_count}")
    return index


# ── Main build ────────────────────────────────────────────────────────────────

def build():
    """Embed every legal document and upload the vectors to Pinecone.

    Steps: load the embedding model, load all source documents, connect to
    the index (asking for confirmation before clearing existing vectors),
    then embed each document and upsert in batches of ``UPSERT_BATCH``.

    Embedding and upsert errors do not abort the run. The IDs of every
    document that could not be embedded or uploaded are collected and
    reported in the final summary.
    """
    print("=" * 70)
    print("  LEGAL FRAMEWORK → PINECONE  (bge-small-en-v1.5 · 384 dims)")
    print(f"  Device : {DEVICE}")
    print("=" * 70)

    model = load_model()

    print("\n📚 Loading documents…")
    docs = load_all_docs()
    if not docs:
        print("❌ No documents loaded. Check your constitution/ folder.")
        return

    index = connect_pinecone()

    # Ask before overwriting
    stats = index.describe_index_stats()
    if stats.total_vector_count > 0:
        ans = input(f"\n⚠️  Index already has {stats.total_vector_count} vectors. Overwrite? (y/n): ").strip().lower()
        if ans != 'y':
            print("Aborted.")
            return
        index.delete(delete_all=True)
        print("   Cleared existing vectors.")

    print(f"\n📥 Embedding and uploading {len(docs)} documents…")

    pending = []   # vectors waiting for the next upsert call
    failed = []    # IDs that could not be embedded or uploaded

    for doc in tqdm(docs, desc="Embedding & uploading"):
        try:
            # Only the first 1000 characters are embedded; the stored
            # metadata below keeps up to 4000 characters of the text.
            emb = model.encode(
                BGE_PREFIX + doc["text"][:1000],
                normalize_embeddings=True,
                device=DEVICE
            ).tolist()
        except Exception as e:
            print(f"\n  ⚠️  Embed failed for {doc['id']}: {e}")
            failed.append(doc["id"])
            continue

        pending.append({
            "id":     doc["id"],
            "values": emb,
            "metadata": {
                "source":  doc["source"],
                "type":    doc["type"],
                "section": doc["section"],
                "content": doc["text"][:4000],
            }
        })

        if len(pending) >= UPSERT_BATCH:
            _upsert_or_record(index, pending, failed, "Upsert")
            pending = []

    # Upload whatever is left over after the last full batch.
    _upsert_or_record(index, pending, failed, "Final upsert")

    # NOTE(review): the count below is whatever the service reports right
    # after the uploads; it may not yet reflect the latest upserts — confirm.
    stats = index.describe_index_stats()
    print(f"\n{'=' * 70}")
    print(f"  ✅ DONE  |  Pinecone vectors: {stats.total_vector_count}")
    if failed:
        print(f"  ⚠️  {len(failed)} failed: {failed[:5]}")
    print(f"  Index  : {PINECONE_INDEX}")
    print(f"  Model  : {EMBED_MODEL_NAME}")
    print("=" * 70)


def _upsert_or_record(index, batch: list, failed: list, label: str) -> None:
    """Upsert *batch*; on error, log it and add every vector ID to *failed*."""
    if not batch:
        return
    try:
        index.upsert(vectors=batch)
    except Exception as e:
        # Best-effort upload: keep going, but remember which IDs were lost so
        # the final summary does not under-report failures.
        print(f"\n  ⚠️  {label} failed: {e}")
        failed.extend(vec["id"] for vec in batch)


# Run the full build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    build()