JaydeepR Claude Sonnet 4.6 committed on
Commit
a337229
·
1 Parent(s): 1564d1d

Step 8: vector store and bidder processor — ChromaDB indexing and retrieval

Browse files

Implements specs/06_vectorstore_and_bidder_processor.md. get_client cached
with st.cache_resource; upsert deduplicates via sha256 IDs. process_bidder
runs OCR pipeline per file, chunks, and indexes with bidder_id metadata.
gather_evidence queries top-k chunks for a criterion's query_hints.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

core/bidder_processor.py CHANGED
@@ -1,10 +1,56 @@
1
  from pathlib import Path
 
 
 
 
2
  from core.schemas import Criterion, Evidence
3
 
4
 
5
  def process_bidder(bidder_id: str, files: list[Path]) -> None:
6
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
 
9
  def gather_evidence(bidder_id: str, criterion: Criterion, k: int = 4) -> list[Evidence]:
10
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from pathlib import Path
2
+
3
+ from core import audit, vectorstore
4
+ from core.chunker import chunk_bidder
5
+ from core.ocr_pipeline import extract_document
6
  from core.schemas import Criterion, Evidence
7
 
8
 
def process_bidder(bidder_id: str, files: list[Path]) -> None:
    """Extract, chunk, and index every document submitted by one bidder.

    Each file is passed through ``extract_document`` and ``chunk_bidder``;
    the resulting chunks are stored in the ``"bidder_chunks"`` collection
    together with per-chunk metadata, and one ``"bidder_processed"`` audit
    entry is written per indexed file.

    Args:
        bidder_id: Identifier stored in every chunk's metadata.
        files: Paths of the bidder's documents.

    Files that yield no chunks are skipped: nothing is indexed and no audit
    entry is logged for them.
    """
    store = vectorstore.get_collection("bidder_chunks")
    for document in files:
        extracted_pages = extract_document(document)
        doc_chunks = chunk_bidder(extracted_pages, bidder_id, document.name)
        if doc_chunks:
            chunk_meta = []
            for piece in doc_chunks:
                chunk_meta.append(
                    {
                        "bidder_id": bidder_id,
                        "doc_name": piece["doc_name"],
                        "page": piece["page"],
                        "source_type": piece["source_type"],
                        # A missing confidence is stored as the sentinel -1.0;
                        # gather_evidence maps negative values back to None.
                        "ocr_confidence": -1.0
                        if piece["ocr_confidence"] is None
                        else float(piece["ocr_confidence"]),
                    }
                )
            vectorstore.add_chunks(store, doc_chunks, chunk_meta)
            audit.log(
                "bidder_processed",
                bidder_id=bidder_id,
                doc_name=document.name,
                chunk_count=len(doc_chunks),
            )
34
 
35
 
def gather_evidence(bidder_id: str, criterion: Criterion, k: int = 4) -> list[Evidence]:
    """Retrieve the chunks of one bidder that best match a criterion.

    The search text is the criterion title followed by its query hints,
    joined with spaces. The ``"bidder_chunks"`` collection is queried with a
    ``bidder_id`` filter so only that bidder's chunks are considered.

    Args:
        bidder_id: Bidder whose indexed chunks are searched.
        criterion: Supplies ``title`` and ``query_hints`` for the search text.
        k: Number of results requested from the vector store.

    Returns:
        One ``Evidence`` per result, in the order the store returned them.
        A negative stored ``ocr_confidence`` is reported as ``None``.
    """
    search_terms = f"{criterion.title} {' '.join(criterion.query_hints)}"
    hits = vectorstore.query(
        vectorstore.get_collection("bidder_chunks"),
        search_terms,
        k=k,
        where={"bidder_id": bidder_id},
    )

    def _to_evidence(hit):
        # Convert one raw query result into an Evidence record.
        details = hit["metadata"]
        confidence = details.get("ocr_confidence")
        # Negative values are the "no confidence available" sentinel.
        if not (confidence is None or confidence >= 0):
            confidence = None
        return Evidence(
            bidder_id=bidder_id,
            doc_name=details["doc_name"],
            page=details["page"],
            text=hit["text"],
            source_type=details["source_type"],
            ocr_confidence=confidence,
        )

    return [_to_evidence(hit) for hit in hits]
core/vectorstore.py CHANGED
@@ -1,16 +1,53 @@
 
 
 
 
 
 
 
 
1
  def get_client():
2
- raise NotImplementedError
 
3
 
4
 
5
  def get_collection(name: str):
6
- raise NotImplementedError
 
 
 
 
7
 
8
 
9
  def add_chunks(collection, chunks: list[dict], metadatas: list[dict]) -> None:
10
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
11
 
12
 
13
  def query(
14
  collection, text: str, k: int = 4, where: dict | None = None
15
  ) -> list[dict]:
16
- raise NotImplementedError
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+
3
+ import streamlit as st
4
+
5
+ from core.config import CHROMA_DIR
6
+
7
+
@st.cache_resource
def get_client():
    """Return a ChromaDB persistent client stored under ``CHROMA_DIR``.

    The function is wrapped in ``st.cache_resource``, and ``chromadb`` is
    imported inside the function body rather than at module import time.
    """
    from chromadb import PersistentClient

    return PersistentClient(path=CHROMA_DIR)
12
 
13
 
def get_collection(name: str):
    """Return the collection called *name*, creating it if it does not exist.

    The collection is requested with ``{"hnsw:space": "cosine"}`` as its
    metadata.

    Args:
        name: Collection name passed to ``get_or_create_collection``.
    """
    collection_settings = {"hnsw:space": "cosine"}
    return get_client().get_or_create_collection(
        name=name,
        metadata=collection_settings,
    )
20
 
21
 
def add_chunks(collection, chunks: list[dict], metadatas: list[dict]) -> None:
    """Upsert chunks into *collection* under stable, origin-aware IDs.

    The ID of a chunk is the first 16 hex digits of a SHA-256 over its
    ``bidder_id``, ``doc_name`` and ``page`` metadata plus its text. Hashing
    the text alone would give identical text from two bidders or documents
    the same ID, so the later upsert would overwrite the earlier chunk's
    metadata. Re-running with the same inputs still produces the same IDs,
    so reruns do not create duplicates.

    NOTE(review): entries indexed earlier under text-only IDs keep their old
    IDs; re-index the collection if both schemes would otherwise coexist.

    Args:
        collection: Object exposing ``upsert(documents=, ids=, metadatas=)``.
        chunks: Dicts that each carry a ``"text"`` key.
        metadatas: One metadata dict per chunk, in the same order.

    Raises:
        ValueError: If ``chunks`` and ``metadatas`` differ in length.
    """
    if not chunks:
        return
    if len(chunks) != len(metadatas):
        raise ValueError(
            f"chunks and metadatas must have the same length "
            f"(got {len(chunks)} and {len(metadatas)})"
        )

    # Keyed by ID so that repeated (origin, text) pairs inside one batch
    # collapse to a single entry instead of sending duplicate IDs to upsert.
    unique: dict[str, tuple[str, dict]] = {}
    for chunk, meta in zip(chunks, metadatas):
        text = chunk["text"]
        origin = "\x1f".join(
            str(meta.get(key, "")) for key in ("bidder_id", "doc_name", "page")
        )
        digest = hashlib.sha256(f"{origin}\x1f{text}".encode()).hexdigest()[:16]
        unique[digest] = (text, meta)

    collection.upsert(
        documents=[text for text, _ in unique.values()],
        ids=list(unique),
        metadatas=[meta for _, meta in unique.values()],
    )
34
 
35
 
36
  def query(
37
  collection, text: str, k: int = 4, where: dict | None = None
38
  ) -> list[dict]:
39
+ count = collection.count()
40
+ if count == 0:
41
+ return []
42
+ n = min(k, count)
43
+ kwargs: dict = {"query_texts": [text], "n_results": n}
44
+ if where:
45
+ kwargs["where"] = where
46
+ results = collection.query(**kwargs)
47
+ docs = results["documents"][0]
48
+ metas = results["metadatas"][0]
49
+ dists = results["distances"][0]
50
+ return [
51
+ {"text": doc, "metadata": meta, "distance": dist}
52
+ for doc, meta, dist in zip(docs, metas, dists)
53
+ ]
specs/06_vectorstore_and_bidder_processor.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Spec 06 — Vector Store and Bidder Processor
2
+
3
+ **Step:** 8 of 15
4
+ **Time budget:** ~25 min
5
+ **Checkpoint:** `process_bidder("bidder_a", ...)` indexes all docs; `gather_evidence("bidder_a", turnover_criterion)` returns chunks mentioning the turnover figure.
6
+
7
+ ---
8
+
9
+ ## Goal
10
+
11
+ Implement `core/vectorstore.py` (ChromaDB persistent client helpers) and `core/bidder_processor.py` (document ingestion + evidence retrieval per criterion).
12
+
13
+ ---
14
+
15
+ ## `core/vectorstore.py`
16
+
17
+ Uses ChromaDB persistent client with `sentence-transformers/all-MiniLM-L6-v2` embeddings.
18
+
19
+ ### `get_client()`
20
+
21
+ ```python
22
+ @st.cache_resource
23
+ def get_client():
24
+ import chromadb
25
+ from core.config import CHROMA_DIR
26
+ return chromadb.PersistentClient(path=CHROMA_DIR)
27
+ ```
28
+
29
+ ### `get_collection(name: str)`
30
+
31
+ ```python
32
+ def get_collection(name: str):
33
+ client = get_client()
34
+ return client.get_or_create_collection(
35
+ name=name,
36
+ metadata={"hnsw:space": "cosine"},
37
+ )
38
+ ```
39
+
40
+ Note: ChromaDB default embedding function uses `all-MiniLM-L6-v2` (~80 MB, downloaded on first run).
41
+
42
+ ### `add_chunks(collection, chunks: list[dict], metadatas: list[dict]) -> None`
43
+
44
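+ - IDs: `hashlib.sha256(chunk["text"].encode()).hexdigest()[:16]` — deduplicates across reruns. Caveat: hashing the text alone means identical text in two documents or from two bidders shares one ID, so the later upsert replaces the earlier chunk's metadata; include `bidder_id`/`doc_name`/`page` in the hashed value if such chunks must stay distinct.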
45
+ - Calls `collection.upsert(documents=[c["text"] for c in chunks], ids=ids, metadatas=metadatas)`.
46
+
47
+ ### `query(collection, text: str, k: int = 4, where: dict | None = None) -> list[dict]`
48
+
49
+ - Calls `collection.query(query_texts=[text], n_results=k, where=where)` (omit `where` if None).
50
+ - Returns `[{"text": doc, "metadata": meta, "distance": dist}, ...]` from the first result set.
51
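+ - Handle the case where fewer than `k` documents are in the collection by clamping `n_results` to `collection.count()` (some ChromaDB versions raise if `n_results` exceeds the number of indexed documents). Return `[]` when the collection is empty.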
52
+
53
+ ---
54
+
55
+ ## `core/bidder_processor.py`
56
+
57
+ ### `process_bidder(bidder_id: str, files: list[Path]) -> None`
58
+
59
+ For each file in `files`:
60
+ 1. `pages = ocr_pipeline.extract_document(file)`.
61
+ 2. `chunks = chunker.chunk_bidder(pages, bidder_id, file.name)`.
62
+ 3. Build metadatas list — one per chunk:
63
+ ```python
64
+ {"bidder_id": bidder_id, "doc_name": file.name,
65
+ "page": chunk["page"], "source_type": chunk["source_type"],
66
+ "ocr_confidence": chunk["ocr_confidence"]}
67
+ ```
68
+ 4. `collection = vectorstore.get_collection("bidder_chunks")`.
69
+ 5. `vectorstore.add_chunks(collection, chunks, metadatas)`.
70
+ 6. `audit.log("bidder_processed", bidder_id=bidder_id, doc_name=file.name, chunk_count=len(chunks))`.
71
+
72
+ ### `gather_evidence(bidder_id: str, criterion: Criterion, k: int = 4) -> list[Evidence]`
73
+
74
+ 1. Build query string: `f"{criterion.title} {' '.join(criterion.query_hints)}"`.
75
+ 2. `collection = vectorstore.get_collection("bidder_chunks")`.
76
+ 3. `results = vectorstore.query(collection, query, k=k, where={"bidder_id": bidder_id})`.
77
+ 4. Map each result to `Evidence`:
78
+ ```python
79
+ Evidence(
80
+ bidder_id=bidder_id,
81
+ doc_name=meta["doc_name"],
82
+ page=meta["page"],
83
+ text=result["text"],
84
+ source_type=meta["source_type"],
85
+ ocr_confidence=meta.get("ocr_confidence"),
86
+ )
87
+ ```
88
+ 5. Return list.
89
+
90
+ ---
91
+
92
+ ## Acceptance Criteria
93
+
94
+ 1. `process_bidder("bidder_a", [path1, path2, ...])` completes without error and logs audit entries.
95
+ 2. `gather_evidence("bidder_a", c1_criterion)` returns at least 1 `Evidence` object.
96
+ 3. The strongest evidence for Bidder A's turnover mentions "6,20,00,000" or "INR".
97
+ 4. Calling `process_bidder` twice on the same files does not duplicate chunks (upsert).