Spaces:
Running
Running
Amogh Gupta committed on
Commit ·
bc4e23f
1
Parent(s): a01e1da
feat(backend): add vector DB integration
Browse files- vectordb.py +294 -0
vectordb.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import zipfile
|
| 3 |
+
import shutil
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from langchain_core.documents import Document
|
| 6 |
+
from langchain_chroma import Chroma
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
import torch
|
| 9 |
+
import time
|
| 10 |
+
|
| 11 |
+
# ── Fix: use langchain_huggingface instead of deprecated langchain_community ──
|
| 12 |
+
try:
|
| 13 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 14 |
+
except ImportError:
|
| 15 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 16 |
+
|
| 17 |
+
# ── Configuration ────────────────────────────────────────────────────────────
ZIP_PATH = "Judgements.zip"            # input archive of judgement PDFs
EXTRACT_DIR = "judgements_extracted"   # temporary extraction folder (deletable)
PERSIST_DIR = "legal_db"               # on-disk location of the Chroma store
COLLECTION_NAME = "LegalJudgements"    # Chroma collection name
LOCAL_MODEL_DIR = "./models/bge-large"  # ← local model, no download needed
BATCH_SIZE = 10  # reduced from 50 to avoid ChromaDB compaction errors
RETRY_ATTEMPTS = 3  # retry failed batches this many times
RETRY_DELAY = 5  # seconds to wait between retries
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # prefer GPU when present
# ─────────────────────────────────────────────────────────────────────────────
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def get_embeddings():
    """Load the embedding model from local disk — no internet needed.

    Returns:
        A ``HuggingFaceEmbeddings`` instance bound to ``DEVICE`` with
        ``normalize_embeddings=True`` (normalized vectors make cosine
        similarity equivalent to a dot product).

    Raises:
        FileNotFoundError: if ``LOCAL_MODEL_DIR`` does not exist, is not a
            directory, or is empty.
    """
    local_path = Path(LOCAL_MODEL_DIR)
    # Use is_dir() rather than exists(): if the path exists but is a plain
    # file, iterdir() would raise NotADirectoryError instead of the friendly
    # FileNotFoundError below.
    if not local_path.is_dir() or not any(local_path.iterdir()):
        raise FileNotFoundError(
            f"Local embedding model not found at '{LOCAL_MODEL_DIR}'.\n"
            f"Make sure the folder exists and contains the model files.\n"
            f"Expected path: {local_path.resolve()}"
        )
    print(f"✅ Loading embedding model from local disk: {local_path.resolve()}")
    return HuggingFaceEmbeddings(
        model_name=str(local_path.resolve()),
        model_kwargs={"device": DEVICE},
        encode_kwargs={"normalize_embeddings": True},
    )
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def extract_zip(zip_path: str, extract_to: str):
    """Unpack the judgement archive into a freshly created destination folder."""
    print(f"📦 Extracting {zip_path} to {extract_to}...")
    # Wipe any stale extraction first so old files never linger in the output.
    destination = Path(extract_to)
    if destination.exists():
        shutil.rmtree(destination)
    destination.mkdir(parents=True)
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(destination)
    print("✅ Extraction complete.")
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def find_pdfs(root_dir: str) -> list[Path]:
    """Recursively collect every PDF beneath root_dir, however deeply nested."""
    root = Path(root_dir)

    # Dump the extracted tree so the user can sanity-check the zip layout.
    print("\n📂 Directory tree after extraction:")
    for entry in sorted(root.rglob("*")):
        depth = len(entry.relative_to(root).parts) - 1
        icon = "📄" if entry.is_file() else "📁"
        print(f" {' ' * depth}{icon} {entry.name}")

    # Match both extensions, then de-duplicate by resolved path (keeps the
    # last occurrence — matters only on case-insensitive filesystems where
    # both globs can return the same file).
    matches = list(root.rglob("*.pdf")) + list(root.rglob("*.PDF"))
    unique = {}
    for candidate in matches:
        unique[candidate.resolve()] = candidate
    pdfs = sorted(unique.values())

    print(f"\n📄 Found {len(pdfs)} PDF files.")
    for pdf in pdfs:
        print(f" → {pdf.relative_to(root)}")

    return pdfs
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def extract_text_from_pdf(pdf_path: Path) -> str:
    """Pull the full text out of one PDF via PyMuPDF; "" on any failure."""
    try:
        # Imported lazily so a missing PyMuPDF degrades to "unreadable PDF"
        # instead of killing the whole run at module import time.
        import fitz
        document = fitz.open(str(pdf_path))
        page_chunks = []
        for page in document:
            page_chunks.append(page.get_text())
        document.close()
        return "\n\n".join(page_chunks).strip()
    except Exception as e:
        print(f" ⚠️ Could not read {pdf_path.name}: {e}")
        return ""
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def build_documents(pdf_paths: list[Path]) -> list[Document]:
    """One PDF = one Document (one chunk)."""
    docs = []
    print("\n📚 Building documents from PDFs...")

    for path in tqdm(pdf_paths, desc="Reading PDFs"):
        text = extract_text_from_pdf(path)
        if not text:
            print(f" ⚠️ Skipping empty PDF: {path.name}")
            continue

        # Derive the judgement year from the first 4-digit path component
        # (the zip is assumed to be organised into year folders).
        year = next(
            (part for part in path.parts if part.isdigit() and len(part) == 4),
            "unknown",
        )

        metadata = {
            "source": str(path),
            "file_name": path.stem,
            "year": year,
            "full_path": str(path.resolve()),
        }
        docs.append(Document(page_content=text, metadata=metadata))

    print(f"✅ Created {len(docs)} document(s) — one per judgement.")
    return docs
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def build_vector_db(documents: list[Document], start_from: int = 0) -> Chroma | None:
    """
    Embed documents and persist the Chroma vector store.
    start_from: resume from this document index if a previous run failed.
    Returns the populated Chroma store, or None if the user aborts at the
    overwrite prompt.
    """
    print("\n🔨 Building vector database...")
    print(f" Embedding model : {LOCAL_MODEL_DIR}")
    print(f" Device : {DEVICE}")
    print(f" Persist directory: {PERSIST_DIR}")
    print(f" Batch size : {BATCH_SIZE}")
    if start_from > 0:
        print(f" Resuming from document #{start_from}")

    # Only wipe DB if starting fresh — a resume run must keep the partial DB.
    if start_from == 0 and os.path.exists(PERSIST_DIR):
        print(f"\n⚠️ Vector DB already exists at '{PERSIST_DIR}'.")
        # Interactive confirmation: this function blocks on stdin here.
        answer = input(" Overwrite? (y/n): ").strip().lower()
        if answer != "y":
            print(" Aborted.")
            return None
        shutil.rmtree(PERSIST_DIR)
        print(" Removed existing DB.")

    embeddings = get_embeddings()

    # Chroma persists automatically when a persist_directory is given.
    vector_store = Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=embeddings,
        persist_directory=PERSIST_DIR,
    )

    docs_to_insert = documents[start_from:]
    failed_batches: list[int] = []  # absolute document indices of failed batch starts

    print(f"\n📥 Inserting {len(docs_to_insert)} documents in batches of {BATCH_SIZE}...")

    with tqdm(total=len(docs_to_insert), desc="Inserting") as pbar:
        for i in range(0, len(docs_to_insert), BATCH_SIZE):
            batch = docs_to_insert[i : i + BATCH_SIZE]
            batch_num = i // BATCH_SIZE + 1
            # NOTE(review): `success` is set but never read afterwards — the
            # retry loop's break/else handles the outcome on its own.
            success = False

            # Retry each batch up to RETRY_ATTEMPTS times with a fixed delay;
            # a batch that never succeeds is skipped (not fatal) and recorded
            # so the user can resume from its absolute index later.
            for attempt in range(1, RETRY_ATTEMPTS + 1):
                try:
                    vector_store.add_documents(batch)
                    success = True
                    break
                except Exception as e:
                    print(f"\n ⚠️ Batch {batch_num} attempt {attempt} failed: {e}")
                    if attempt < RETRY_ATTEMPTS:
                        print(f" Retrying in {RETRY_DELAY}s…")
                        time.sleep(RETRY_DELAY)
                    else:
                        print(f" ❌ Batch {batch_num} failed after {RETRY_ATTEMPTS} attempts. Skipping.")
                        # start_from + i converts batch-local offset back to an
                        # index into the ORIGINAL documents list.
                        failed_batches.append(start_from + i)

            # Progress advances even for skipped batches so the bar completes.
            pbar.update(len(batch))

    if failed_batches:
        print(f"\n⚠️ {len(failed_batches)} batch(es) failed and were skipped:")
        for idx in failed_batches:
            print(f" → Starting at document #{idx} (resume with START_FROM={idx})")
    else:
        print("✅ Vector DB build complete — all batches inserted successfully!")

    return vector_store
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def verify_vector_db():
    """Sanity-check the persisted store by running a handful of sample searches."""
    print("\n🔍 Verifying vector database...")

    # Re-open the persisted collection with the same embedding model.
    store = Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=get_embeddings(),
        persist_directory=PERSIST_DIR,
    )

    # BGE retrieval models expect queries to carry this instruction prefix.
    prefix = "Represent this sentence for searching relevant passages: "

    sample_queries = [
        "mortgage deed property",
        "cheating IPC section 420",
        "partition of land revenue",
    ]

    print("-" * 70)
    for q in sample_queries:
        hits = store.similarity_search(prefix + q, k=2)
        print(f"\n🔎 Query : '{q}'")
        print(f" Hits : {len(hits)}")
        if hits:
            top = hits[0]
            snippet = top.page_content[:200].replace("\n", " ")
            name = top.metadata.get("file_name", "?")
            year = top.metadata.get("year", "?")
            print(f" Best : [{year}] {name}")
            print(f" Preview: {snippet}...")
    print("-" * 70)
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def cleanup_extracted(extract_dir: str):
    """Delete the temporary extraction folder if it is still present."""
    target = Path(extract_dir)
    if target.exists():
        shutil.rmtree(target)
        print(f"🗑️ Removed temporary folder '{extract_dir}'.")
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def main():
    """Drive the full pipeline: extract zip → read PDFs → embed → verify → clean up."""
    print("=" * 70)
    print(" LEGAL JUDGEMENTS — VECTOR DB BUILDER")
    print(" (1 PDF = 1 chunk = 1 judgement)")
    print(f" Device : {DEVICE}")
    print(f" Local model : {Path(LOCAL_MODEL_DIR).resolve()}")
    print("=" * 70)

    # ── RESUME CONTROL ────────────────────────────────────────────────────
    # If the script crashed mid-way, set START_FROM to the failed document
    # index printed in the error output — it will skip re-extraction and
    # resume inserting from that point without wiping the existing DB.
    # Set to 0 for a fresh run.
    START_FROM = 0
    # ──────────────────────────────────────────────────────────────────────

    if START_FROM == 0:
        # Fresh run: drop any previous DB so the build starts from nothing.
        if os.path.exists(PERSIST_DIR):
            print(f"\n🗑️ Removing old vector DB at '{PERSIST_DIR}'...")
            shutil.rmtree(PERSIST_DIR)
            print(" Done.")

        if not os.path.exists(ZIP_PATH):
            raise FileNotFoundError(
                f"Zip file not found: '{ZIP_PATH}'. "
                "Update ZIP_PATH at the top of the script."
            )
        extract_zip(ZIP_PATH, EXTRACT_DIR)
    else:
        # Resume run: EXTRACT_DIR is assumed to still hold the extracted PDFs.
        print(f"\n▶️ Resuming from document #{START_FROM} — skipping extraction.")

    pdf_paths = find_pdfs(EXTRACT_DIR)
    if not pdf_paths:
        print("❌ No PDFs found inside the zip. Check the folder structure.")
        return

    documents = build_documents(pdf_paths)
    if not documents:
        print("❌ No readable text extracted from PDFs.")
        return

    vector_store = build_vector_db(documents, start_from=START_FROM)

    # build_vector_db returns None when the user declines the overwrite prompt.
    if vector_store:
        verify_vector_db()

        # Optional cleanup of the extracted PDFs (interactive prompt).
        answer = input("\nDelete the extracted PDF folder? (y/n): ").strip().lower()
        if answer == "y":
            cleanup_extracted(EXTRACT_DIR)

        print("\n" + "=" * 70)
        print(" DONE!")
        print(f" DB location : {os.path.abspath(PERSIST_DIR)}")
        print(f" Judgements : {len(documents)}")
        print(f" Collection : {COLLECTION_NAME}")
        print(f" Local model : {Path(LOCAL_MODEL_DIR).resolve()}")
        print(f" Device used : {DEVICE}")
        print("=" * 70)
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
# Standard entry guard: run the pipeline only when executed as a script.
if __name__ == "__main__":
    main()
|