"""
build_pinecone_judgements.py

Rebuilds the judgements vector DB in Pinecone using bge-small-en-v1.5 (384 dims).

Before running:
  1. pip install pinecone-client sentence-transformers tqdm pymupdf torch
  2. Create a Pinecone index named 'legal-judgements' with dimension=384,
     metric=cosine OR let this script create it automatically (serverless).
  3. Export PINECONE_API_KEY in your environment (preferred), or paste your
     key into PINECONE_API_KEY below.

Run with:
  python build_pinecone_judgements.py
"""

import os
import time
import zipfile
from pathlib import Path

import torch
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# ── Configuration ─────────────────────────────────────────────────────────────
_API_KEY_PLACEHOLDER = "Enter your Pincone_API"
# The environment variable wins so the key never has to be committed; the
# placeholder is only a fallback for people who paste the key here instead.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or _API_KEY_PLACEHOLDER
PINECONE_INDEX = "legal-judgements"
PINECONE_CLOUD = "aws"
PINECONE_REGION = "us-east-1"

ZIP_PATH = "./Judgements.zip"
EXTRACT_DIR = "./judgements_extracted"

LOCAL_MODEL_DIR = "./models/bge-small"
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
EMBED_DIM = 384  # output dimension of bge-small-en-v1.5; must match the index
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

UPSERT_BATCH = 100
# NOTE(review): this is BGE's *query* instruction. The BGE model card says
# passages should be embedded without it. It is kept here so document vectors
# stay consistent with existing behaviour — confirm what the query side does
# before removing it, then rebuild the index.
BGE_PREFIX = "Represent this sentence for searching relevant passages: "
# ─────────────────────────────────────────────────────────────────────────────


def load_model() -> SentenceTransformer:
    """Load the embedding model, downloading and caching it on first use.

    If LOCAL_MODEL_DIR exists and is non-empty the model is loaded from
    there; otherwise EMBED_MODEL_NAME is downloaded and saved to that
    directory first.

    Returns:
        The SentenceTransformer moved to DEVICE.
    """
    local = Path(LOCAL_MODEL_DIR)
    if local.exists() and any(local.iterdir()):
        print(f"✅ Loading bge-small from '{LOCAL_MODEL_DIR}'")
    else:
        print(f"📥 Downloading {EMBED_MODEL_NAME} (~130 MB)…")
        local.mkdir(parents=True, exist_ok=True)
        m = SentenceTransformer(EMBED_MODEL_NAME)
        m.save(str(local))
        print(f"✅ Saved to '{LOCAL_MODEL_DIR}'")
    model = SentenceTransformer(str(local))
    model = model.to(DEVICE)
    print(f" Device: {DEVICE} | Dim: {model.get_sentence_embedding_dimension()}")
    return model


def extract_zip() -> None:
    """Extract ZIP_PATH into EXTRACT_DIR unless it was already extracted.

    Extraction is skipped only when EXTRACT_DIR exists *and* contains at
    least one entry, so an empty directory left behind by a failed run does
    not permanently block extraction.

    Raises:
        FileNotFoundError: if ZIP_PATH does not exist.
        zipfile.BadZipFile: if ZIP_PATH is not a valid archive.
    """
    target = Path(EXTRACT_DIR)
    if target.is_dir() and any(target.iterdir()):
        print(f"📂 '{EXTRACT_DIR}' already exists — skipping extraction.")
        return
    print(f"📦 Extracting {ZIP_PATH}…")
    # Open the archive before creating the directory: a missing or corrupt
    # ZIP must not leave an empty EXTRACT_DIR behind.
    with zipfile.ZipFile(ZIP_PATH, "r") as z:
        target.mkdir(parents=True, exist_ok=True)
        z.extractall(target)
    print("✅ Extraction complete.")


def find_pdfs() -> list[Path]:
    """Return every PDF file under EXTRACT_DIR in a stable sorted order.

    The extension match is case-insensitive (.pdf, .PDF, .Pdf, …). Paths that
    resolve to the same file are returned once. The order matters: document
    IDs and ``resume_from`` are positions in this list.
    """
    root = Path(EXTRACT_DIR)
    unique = {
        p.resolve(): p
        for p in root.rglob("*")
        if p.suffix.lower() == ".pdf" and p.is_file()
    }
    pdfs = sorted(unique.values())
    print(f"📄 Found {len(pdfs)} PDF files.")
    return pdfs


def extract_text(pdf_path: Path) -> str:
    """Return the text of all pages of *pdf_path*, or "" on any failure.

    Pages are joined with a blank line and the result is stripped. Errors
    (including PyMuPDF not being installed) are reported and swallowed so a
    single bad file does not abort the whole build.
    """
    try:
        import fitz  # PyMuPDF; imported lazily so a missing install is reported per file

        # The context manager closes the document even if get_text() raises.
        with fitz.open(str(pdf_path)) as doc:
            return "\n\n".join(page.get_text() for page in doc).strip()
    except Exception as e:
        print(f" ⚠️ {pdf_path.name}: {e}")
        return ""


def get_year(pdf_path: Path) -> str:
    """Return the first path component that is exactly four digits.

    Returns "unknown" when no component of *pdf_path* looks like a year.
    """
    for part in pdf_path.parts:
        if part.isdigit() and len(part) == 4:
            return part
    return "unknown"


def connect_pinecone():
    """Connect to Pinecone and return a handle to PINECONE_INDEX.

    Creates the index (serverless, cosine, EMBED_DIM dimensions) when it does
    not exist yet and waits until Pinecone reports it ready.

    Raises:
        RuntimeError: if no API key was configured.
    """
    if PINECONE_API_KEY == _API_KEY_PLACEHOLDER:
        raise RuntimeError(
            "No Pinecone API key configured: set the PINECONE_API_KEY "
            "environment variable or edit PINECONE_API_KEY in this script."
        )
    print("🔌 Connecting to Pinecone…")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing = [i.name for i in pc.list_indexes()]
    if PINECONE_INDEX not in existing:
        print(f" Index '{PINECONE_INDEX}' not found — creating it…")
        pc.create_index(
            name=PINECONE_INDEX,
            dimension=EMBED_DIM,
            metric="cosine",
            spec=ServerlessSpec(cloud=PINECONE_CLOUD, region=PINECONE_REGION),
        )
        while not pc.describe_index(PINECONE_INDEX).status['ready']:
            print(" Waiting for index to be ready…")
            time.sleep(3)
        print(f" ✅ Index '{PINECONE_INDEX}' created.")
    index = pc.Index(PINECONE_INDEX)
    stats = index.describe_index_stats()
    print(f"✅ Connected | Existing vectors: {stats.total_vector_count}")
    return index


def _upsert_batch(index, batch: list[dict], batch_indices: list[int],
                  failed: list[int]) -> None:
    """Upsert *batch*; on failure record every document index it contained.

    Recording the whole batch (not just its last element) keeps the failure
    count and the resume hint accurate, because a failed upsert loses every
    vector in the batch.
    """
    if not batch:
        return
    try:
        index.upsert(vectors=batch)
    except Exception as e:
        print(f"\n ⚠️ Upsert failed for #{batch_indices[0]}–#{batch_indices[-1]}: {e}")
        failed.extend(batch_indices)


def build(resume_from: int = 0) -> None:
    """Embed every judgement PDF and upload the vectors to Pinecone.

    Args:
        resume_from: position in the sorted PDF list to start from. IDs are
            derived from that position (``j_<position>``), so re-running over
            already uploaded documents overwrites them rather than
            duplicating them.

    Raises:
        ValueError: if *resume_from* is negative.
    """
    if resume_from < 0:
        # A negative value would slice from the end of the list and produce
        # IDs that do not match the documents' real positions.
        raise ValueError(f"resume_from must be >= 0, got {resume_from}")

    print("=" * 70)
    print(" JUDGEMENTS → PINECONE (bge-small-en-v1.5 · 384 dims)")
    print(f" Device : {DEVICE}")
    print("=" * 70)

    model = load_model()
    extract_zip()
    pdfs = find_pdfs()
    if not pdfs:
        print("❌ No PDFs found. Check ZIP_PATH.")
        return

    index = connect_pinecone()
    to_process = pdfs[resume_from:]
    print(f"\n📥 Processing {len(to_process)} PDFs from index #{resume_from}…")

    batch: list[dict] = []
    batch_indices: list[int] = []  # global positions of the docs in `batch`
    failed: list[int] = []

    for offset, pdf_path in enumerate(tqdm(to_process, desc="Embedding & uploading")):
        gidx = resume_from + offset
        text = extract_text(pdf_path)
        if not text.strip():
            continue

        try:
            emb = model.encode(
                BGE_PREFIX + text[:3000],
                normalize_embeddings=True,
                device=DEVICE,
            ).tolist()
        except Exception as e:
            print(f"\n ⚠️ Embed failed #{gidx} ({pdf_path.name}): {e}")
            failed.append(gidx)
            continue

        batch.append({
            "id": f"j_{gidx}",
            "values": emb,
            "metadata": {
                "file_name": pdf_path.stem,
                "year": get_year(pdf_path),
                "source": str(pdf_path),
                "content": text[:8000],  # stored for retrieval display
            },
        })
        batch_indices.append(gidx)

        if len(batch) >= UPSERT_BATCH:
            _upsert_batch(index, batch, batch_indices, failed)
            batch, batch_indices = [], []

    # Flush remainder
    _upsert_batch(index, batch, batch_indices, failed)

    stats = index.describe_index_stats()
    print(f"\n{'=' * 70}")
    print(f" ✅ DONE | Pinecone vectors: {stats.total_vector_count}")
    if failed:
        # Failures are not recorded in order, so resume from the smallest one.
        print(f" ⚠️ {len(failed)} failed. Resume with: build(resume_from={min(failed)})")
    print(f" Index : {PINECONE_INDEX}")
    print(f" Model : {EMBED_MODEL_NAME}")
    print("=" * 70)


if __name__ == "__main__":
    # If a previous run failed at e.g. doc #5000, set resume_from=5000
    build(resume_from=0)