Amogh Gupta committed on
Commit
bc4e23f
·
1 Parent(s): a01e1da

feat(backend): add vector DB integration

Browse files
Files changed (1) hide show
  1. vectordb.py +294 -0
vectordb.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import zipfile
3
+ import shutil
4
+ from pathlib import Path
5
+ from langchain_core.documents import Document
6
+ from langchain_chroma import Chroma
7
+ from tqdm import tqdm
8
+ import torch
9
+ import time
10
+
11
# ── Fix: use langchain_huggingface instead of deprecated langchain_community ──
# Prefer the maintained langchain-huggingface package; fall back to the legacy
# langchain_community location when it is not installed, so either environment works.
try:
    from langchain_huggingface import HuggingFaceEmbeddings
except ImportError:
    from langchain_community.embeddings import HuggingFaceEmbeddings

# ── Configuration ────────────────────────────────────────────────────────────
ZIP_PATH = "Judgements.zip"            # input archive of judgement PDFs
EXTRACT_DIR = "judgements_extracted"   # temporary folder the zip is unpacked into
PERSIST_DIR = "legal_db"               # on-disk Chroma database location
COLLECTION_NAME = "LegalJudgements"    # Chroma collection name
LOCAL_MODEL_DIR = "./models/bge-large"  # ← local model, no download needed
BATCH_SIZE = 10  # reduced from 50 to avoid ChromaDB compaction errors
RETRY_ATTEMPTS = 3  # retry failed batches this many times
RETRY_DELAY = 5  # seconds to wait between retries
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # prefer GPU when present
# ─────────────────────────────────────────────────────────────────────────────
28
+
29
+
30
def get_embeddings():
    """Build the HuggingFace embedding wrapper from the local model copy.

    Reads the model from LOCAL_MODEL_DIR only — no network access.

    Raises:
        FileNotFoundError: if the model folder is missing or empty.
    """
    local_path = Path(LOCAL_MODEL_DIR)
    model_present = local_path.exists() and any(local_path.iterdir())
    if not model_present:
        raise FileNotFoundError(
            f"Local embedding model not found at '{LOCAL_MODEL_DIR}'.\n"
            f"Make sure the folder exists and contains the model files.\n"
            f"Expected path: {local_path.resolve()}"
        )
    print(f"✅ Loading embedding model from local disk: {local_path.resolve()}")
    # Normalized embeddings so cosine similarity == dot product in Chroma.
    embedder_config = {
        "model_name": str(local_path.resolve()),
        "model_kwargs": {"device": DEVICE},
        "encode_kwargs": {"normalize_embeddings": True},
    }
    return HuggingFaceEmbeddings(**embedder_config)
45
+
46
+
47
def extract_zip(zip_path: str, extract_to: str):
    """Unpack *zip_path* into a freshly created *extract_to* directory.

    Any pre-existing directory at *extract_to* is removed first so
    repeated runs always start from a clean extraction.
    """
    print(f"📦 Extracting {zip_path} to {extract_to}...")
    if os.path.exists(extract_to):
        shutil.rmtree(extract_to)
    os.makedirs(extract_to)
    # NOTE(review): extractall() does not sanitise member paths; fine for a
    # trusted local archive, revisit if zips ever come from untrusted users.
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(extract_to)
    print("✅ Extraction complete.")
56
+
57
+
58
def find_pdfs(root_dir: str) -> list[Path]:
    """Recursively collect every PDF under *root_dir*, no matter how deep.

    Matching is done on a lower-cased ``.pdf`` suffix, so mixed-case
    extensions (``.pdf``, ``.PDF``, ``.Pdf`` …) are all found — the old
    pair of ``rglob("*.pdf") + rglob("*.PDF")`` missed mixed-case names
    and walked the tree twice.  Also prints the extracted directory tree
    to help debug unexpected zip layouts.

    Returns:
        De-duplicated (by resolved path), sorted list of PDF paths.
    """
    root = Path(root_dir)

    # Single traversal reused for both the tree printout and PDF matching.
    entries = sorted(root.rglob("*"))

    print("\n📂 Directory tree after extraction:")
    for item in entries:
        indent = " " * (len(item.relative_to(root).parts) - 1)
        marker = "📄" if item.is_file() else "📁"
        print(f" {indent}{marker} {item.name}")

    pdfs = [p for p in entries if p.is_file() and p.suffix.lower() == ".pdf"]
    # De-duplicate by resolved path: case-insensitive filesystems can report
    # the same file under more than one spelling.
    pdfs = sorted({p.resolve(): p for p in pdfs}.values())

    print(f"\n📄 Found {len(pdfs)} PDF files.")
    for p in pdfs:
        print(f" → {p.relative_to(root)}")

    return pdfs
77
+
78
+
79
def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract the full text of one PDF using PyMuPDF.

    Returns the page texts joined by blank lines, or "" when the file
    cannot be imported, opened, or read — a warning is printed instead of
    raising, so one corrupt PDF does not abort the whole build.
    """
    try:
        import fitz  # PyMuPDF; imported lazily per original design
        # Context manager guarantees the document handle is closed even if
        # get_text() raises mid-way (the old explicit doc.close() leaked
        # the handle on error).
        with fitz.open(str(pdf_path)) as doc:
            pages_text = [page.get_text() for page in doc]
        return "\n\n".join(pages_text).strip()
    except Exception as e:
        print(f" ⚠️ Could not read {pdf_path.name}: {e}")
        return ""
91
+
92
+
93
def build_documents(pdf_paths: list[Path]) -> list[Document]:
    """Turn each PDF into exactly one Document (one judgement == one chunk)."""
    documents = []
    print("\n📚 Building documents from PDFs...")

    for pdf_path in tqdm(pdf_paths, desc="Reading PDFs"):
        text = extract_text_from_pdf(pdf_path)
        if not text:
            print(f" ⚠️ Skipping empty PDF: {pdf_path.name}")
            continue

        # First 4-digit path component is treated as the judgement year.
        year = next(
            (part for part in pdf_path.parts if part.isdigit() and len(part) == 4),
            "unknown",
        )

        metadata = {
            "source": str(pdf_path),
            "file_name": pdf_path.stem,
            "year": year,
            "full_path": str(pdf_path.resolve()),
        }
        documents.append(Document(page_content=text, metadata=metadata))

    print(f"✅ Created {len(documents)} document(s) — one per judgement.")
    return documents
122
+
123
+
124
def build_vector_db(documents: list[Document], start_from: int = 0) -> Chroma | None:
    """
    Embed documents and persist the Chroma vector store.

    start_from: resume from this document index if a previous run failed
    (skips wiping the existing DB and inserts documents[start_from:] only).
    Returns the Chroma store, or None if the user declines to overwrite an
    existing database.
    """
    print("\n🔨 Building vector database...")
    print(f" Embedding model : {LOCAL_MODEL_DIR}")
    print(f" Device : {DEVICE}")
    print(f" Persist directory: {PERSIST_DIR}")
    print(f" Batch size : {BATCH_SIZE}")
    if start_from > 0:
        print(f" Resuming from document #{start_from}")

    # Only wipe DB if starting fresh
    if start_from == 0 and os.path.exists(PERSIST_DIR):
        print(f"\n⚠️ Vector DB already exists at '{PERSIST_DIR}'.")
        answer = input(" Overwrite? (y/n): ").strip().lower()
        if answer != "y":
            print(" Aborted.")
            return None
        shutil.rmtree(PERSIST_DIR)
        print(" Removed existing DB.")

    embeddings = get_embeddings()

    # Chroma persists automatically when persist_directory is set.
    vector_store = Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=embeddings,
        persist_directory=PERSIST_DIR,
    )

    docs_to_insert = documents[start_from:]
    failed_batches = []  # absolute indices of the first document of each failed batch

    print(f"\n📥 Inserting {len(docs_to_insert)} documents in batches of {BATCH_SIZE}...")

    with tqdm(total=len(docs_to_insert), desc="Inserting") as pbar:
        for i in range(0, len(docs_to_insert), BATCH_SIZE):
            batch = docs_to_insert[i : i + BATCH_SIZE]
            batch_num = i // BATCH_SIZE + 1
            # NOTE(review): `success` is set but never read afterwards —
            # failure is tracked via failed_batches instead.
            success = False

            # Retry each batch up to RETRY_ATTEMPTS times before skipping it.
            for attempt in range(1, RETRY_ATTEMPTS + 1):
                try:
                    vector_store.add_documents(batch)
                    success = True
                    break
                except Exception as e:
                    print(f"\n ⚠️ Batch {batch_num} attempt {attempt} failed: {e}")
                    if attempt < RETRY_ATTEMPTS:
                        print(f" Retrying in {RETRY_DELAY}s…")
                        time.sleep(RETRY_DELAY)
                    else:
                        print(f" ❌ Batch {batch_num} failed after {RETRY_ATTEMPTS} attempts. Skipping.")
                        # Record the absolute document index so the user can
                        # resume with START_FROM pointed at this batch.
                        failed_batches.append(start_from + i)

            # Progress advances even for skipped batches so the bar completes.
            pbar.update(len(batch))

    if failed_batches:
        print(f"\n⚠️ {len(failed_batches)} batch(es) failed and were skipped:")
        for idx in failed_batches:
            print(f" → Starting at document #{idx} (resume with START_FROM={idx})")
    else:
        print("✅ Vector DB build complete — all batches inserted successfully!")

    return vector_store
190
+
191
+
192
def verify_vector_db():
    """Sanity-check the persisted DB with a few canned similarity queries."""
    print("\n🔍 Verifying vector database...")

    vector_store = Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=get_embeddings(),
        persist_directory=PERSIST_DIR,
    )

    # BGE-style models expect this instruction prefix on retrieval queries.
    prefix = "Represent this sentence for searching relevant passages: "

    test_queries = [
        "mortgage deed property",
        "cheating IPC section 420",
        "partition of land revenue",
    ]

    print("-" * 70)
    for query in test_queries:
        results = vector_store.similarity_search(prefix + query, k=2)
        print(f"\n🔎 Query : '{query}'")
        print(f" Hits : {len(results)}")
        if results:
            top = results[0]
            snippet = top.page_content[:200].replace("\n", " ")
            name = top.metadata.get("file_name", "?")
            year = top.metadata.get("year", "?")
            print(f" Best : [{year}] {name}")
            print(f" Preview: {snippet}...")
        print("-" * 70)
224
+
225
+
226
def cleanup_extracted(extract_dir: str):
    """Delete the temporary extraction folder; a no-op if it is absent."""
    if not os.path.exists(extract_dir):
        return
    shutil.rmtree(extract_dir)
    print(f"🗑️ Removed temporary folder '{extract_dir}'.")
231
+
232
+
233
def main():
    """End-to-end pipeline: extract zip → read PDFs → build & verify the DB."""
    print("=" * 70)
    print(" LEGAL JUDGEMENTS — VECTOR DB BUILDER")
    print(" (1 PDF = 1 chunk = 1 judgement)")
    print(f" Device : {DEVICE}")
    print(f" Local model : {Path(LOCAL_MODEL_DIR).resolve()}")
    print("=" * 70)

    # ── RESUME CONTROL ────────────────────────────────────────────────────
    # If the script crashed mid-way, set START_FROM to the failed document
    # index printed in the error output — it will skip re-extraction and
    # resume inserting from that point without wiping the existing DB.
    # Set to 0 for a fresh run.
    START_FROM = 0
    # ──────────────────────────────────────────────────────────────────────

    if START_FROM == 0:
        # NOTE(review): removing PERSIST_DIR here means the "Overwrite?"
        # prompt inside build_vector_db can never trigger on a fresh run.
        if os.path.exists(PERSIST_DIR):
            print(f"\n🗑️ Removing old vector DB at '{PERSIST_DIR}'...")
            shutil.rmtree(PERSIST_DIR)
            print(" Done.")

        if not os.path.exists(ZIP_PATH):
            raise FileNotFoundError(
                f"Zip file not found: '{ZIP_PATH}'. "
                "Update ZIP_PATH at the top of the script."
            )
        extract_zip(ZIP_PATH, EXTRACT_DIR)
    else:
        # Resumed run: the previous extraction is assumed to still exist.
        print(f"\n▶️ Resuming from document #{START_FROM} — skipping extraction.")

    pdf_paths = find_pdfs(EXTRACT_DIR)
    if not pdf_paths:
        print("❌ No PDFs found inside the zip. Check the folder structure.")
        return

    documents = build_documents(pdf_paths)
    if not documents:
        print("❌ No readable text extracted from PDFs.")
        return

    vector_store = build_vector_db(documents, start_from=START_FROM)

    # build_vector_db returns None when the user aborts the overwrite prompt.
    if vector_store:
        verify_vector_db()

    answer = input("\nDelete the extracted PDF folder? (y/n): ").strip().lower()
    if answer == "y":
        cleanup_extracted(EXTRACT_DIR)

    print("\n" + "=" * 70)
    print(" DONE!")
    print(f" DB location : {os.path.abspath(PERSIST_DIR)}")
    print(f" Judgements : {len(documents)}")
    print(f" Collection : {COLLECTION_NAME}")
    print(f" Local model : {Path(LOCAL_MODEL_DIR).resolve()}")
    print(f" Device used : {DEVICE}")
    print("=" * 70)
291
+
292
+
293
# Script entry point — run the full build pipeline when executed directly.
if __name__ == "__main__":
    main()