Step 8: vector store and bidder processor — ChromaDB indexing and retrieval
Implements specs/06_vectorstore_and_bidder_processor.md. get_client cached
with st.cache_resource; upsert deduplicates via sha256 IDs. process_bidder
runs OCR pipeline per file, chunks, and indexes with bidder_id metadata.
gather_evidence queries top-k chunks for a criterion's query_hints.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- core/bidder_processor.py +48 -2
- core/vectorstore.py +41 -4
- specs/06_vectorstore_and_bidder_processor.md +97 -0
core/bidder_processor.py
CHANGED
|
@@ -1,10 +1,56 @@
|
|
| 1 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from core.schemas import Criterion, Evidence
|
| 3 |
|
| 4 |
|
| 5 |
def process_bidder(bidder_id: str, files: list[Path]) -> None:
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def gather_evidence(bidder_id: str, criterion: Criterion, k: int = 4) -> list[Evidence]:
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from core import audit, vectorstore
|
| 4 |
+
from core.chunker import chunk_bidder
|
| 5 |
+
from core.ocr_pipeline import extract_document
|
| 6 |
from core.schemas import Criterion, Evidence
|
| 7 |
|
| 8 |
|
| 9 |
def process_bidder(bidder_id: str, files: list[Path]) -> None:
    """Extract, chunk, and index every document submitted by one bidder.

    Each file is passed through ``extract_document`` and ``chunk_bidder``.
    Any resulting chunks are written to the ``"bidder_chunks"`` collection
    together with per-chunk metadata, and exactly one ``"bidder_processed"``
    audit event is logged per file.

    Args:
        bidder_id: Identifier stored in every chunk's metadata and in the
            audit event.
        files: Paths of the bidder's documents; only ``file.name`` is
            recorded alongside the chunks.
    """
    collection = vectorstore.get_collection("bidder_chunks")
    for file in files:
        pages = extract_document(file)
        chunks = chunk_bidder(pages, bidder_id, file.name)
        if chunks:
            metadatas = [
                {
                    "bidder_id": bidder_id,
                    "doc_name": chunk["doc_name"],
                    "page": chunk["page"],
                    "source_type": chunk["source_type"],
                    # A missing confidence is stored as the sentinel -1.0
                    # (gather_evidence maps negative values back to None);
                    # presumably because the store rejects None metadata
                    # values -- TODO confirm.
                    "ocr_confidence": float(chunk["ocr_confidence"])
                    if chunk["ocr_confidence"] is not None else -1.0,
                }
                for chunk in chunks
            ]
            vectorstore.add_chunks(collection, chunks, metadatas)
        # Log unconditionally: a file that produced no chunks must still
        # appear in the audit trail (with chunk_count=0) instead of being
        # silently skipped.
        audit.log(
            "bidder_processed",
            bidder_id=bidder_id,
            doc_name=file.name,
            chunk_count=len(chunks) if chunks else 0,
        )
|
| 34 |
|
| 35 |
|
| 36 |
def gather_evidence(bidder_id: str, criterion: Criterion, k: int = 4) -> list[Evidence]:
    """Retrieve the top-``k`` indexed chunks of one bidder for a criterion.

    The search text is the criterion title followed by its space-joined
    query hints. The ``"bidder_chunks"`` collection is queried with a
    ``bidder_id`` filter and every hit is converted to an ``Evidence``.

    Args:
        bidder_id: Bidder whose chunks are searched.
        criterion: Supplies ``title`` and ``query_hints`` for the search text.
        k: Maximum number of hits requested from the vector store.

    Returns:
        One ``Evidence`` per hit, in the order returned by the store.
    """
    hint_text = " ".join(criterion.query_hints)
    query_text = f"{criterion.title} {hint_text}"
    hits = vectorstore.query(
        vectorstore.get_collection("bidder_chunks"),
        query_text,
        k=k,
        where={"bidder_id": bidder_id},
    )

    def _to_evidence(hit: dict) -> Evidence:
        # Negative confidences are the "unknown" sentinel written at
        # indexing time; expose them as None.
        meta = hit["metadata"]
        confidence = meta.get("ocr_confidence")
        if confidence is not None and confidence < 0:
            confidence = None
        return Evidence(
            bidder_id=bidder_id,
            doc_name=meta["doc_name"],
            page=meta["page"],
            text=hit["text"],
            source_type=meta["source_type"],
            ocr_confidence=confidence,
        )

    return [_to_evidence(hit) for hit in hits]
|
core/vectorstore.py
CHANGED
|
@@ -1,16 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
def get_client():
|
| 2 |
-
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
def get_collection(name: str):
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def add_chunks(collection, chunks: list[dict], metadatas: list[dict]) -> None:
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def query(
|
| 14 |
collection, text: str, k: int = 4, where: dict | None = None
|
| 15 |
) -> list[dict]:
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
|
| 2 |
+
|
| 3 |
+
import streamlit as st
|
| 4 |
+
|
| 5 |
+
from core.config import CHROMA_DIR
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@st.cache_resource
def get_client():
    """Return a ChromaDB persistent client rooted at ``CHROMA_DIR``.

    Decorated with Streamlit's ``cache_resource``. ``chromadb`` is imported
    inside the function rather than at module import time.
    """
    from chromadb import PersistentClient

    return PersistentClient(path=CHROMA_DIR)
|
| 12 |
|
| 13 |
|
| 14 |
def get_collection(name: str):
    """Fetch the collection called ``name``, creating it if it is missing.

    The collection is requested with ``hnsw:space`` set to ``"cosine"``.

    Args:
        name: Collection name passed to ``get_or_create_collection``.

    Returns:
        Whatever ``get_or_create_collection`` returns for that name.
    """
    collection_settings = {"hnsw:space": "cosine"}
    return get_client().get_or_create_collection(
        name=name, metadata=collection_settings
    )
|
| 20 |
|
| 21 |
|
| 22 |
def add_chunks(collection, chunks: list[dict], metadatas: list[dict]) -> None:
    """Upsert chunks into ``collection`` under deterministic IDs.

    Each ID is the first 16 hex characters of a SHA-256 digest over the
    chunk's ``bidder_id``, ``doc_name`` and ``page`` metadata plus its text.
    Hashing the text alone would give identical passages from different
    bidders, documents or pages the same ID, so a later upsert would
    overwrite the earlier record and its metadata. Re-running on the same
    input still yields the same IDs, so reruns do not create duplicates.

    Records that map to the same ID within one call are collapsed (the last
    one wins) so that a single ``upsert`` never receives duplicate IDs.

    Args:
        collection: Object exposing ``upsert(documents=, ids=, metadatas=)``.
        chunks: Dicts that each carry a ``"text"`` entry.
        metadatas: One metadata dict per chunk, in the same order.

    Raises:
        ValueError: If ``chunks`` and ``metadatas`` differ in length.
    """
    if not chunks:
        return
    if len(chunks) != len(metadatas):
        raise ValueError(
            f"chunks and metadatas must have the same length "
            f"({len(chunks)} != {len(metadatas)})"
        )

    # Keyed by ID; dict ordering keeps first-seen position while a later
    # duplicate replaces the stored value.
    records: dict[str, tuple[str, dict]] = {}
    for chunk, meta in zip(chunks, metadatas):
        text = chunk["text"]
        # Unit-separator join keeps the fields unambiguous inside the hash.
        identity = "\x1f".join(
            (
                str(meta.get("bidder_id", "")),
                str(meta.get("doc_name", "")),
                str(meta.get("page", "")),
                text,
            )
        )
        chunk_id = hashlib.sha256(identity.encode()).hexdigest()[:16]
        records[chunk_id] = (text, meta)

    collection.upsert(
        documents=[text for text, _ in records.values()],
        ids=list(records),
        metadatas=[meta for _, meta in records.values()],
    )
|
| 34 |
|
| 35 |
|
| 36 |
def query(
|
| 37 |
collection, text: str, k: int = 4, where: dict | None = None
|
| 38 |
) -> list[dict]:
|
| 39 |
+
count = collection.count()
|
| 40 |
+
if count == 0:
|
| 41 |
+
return []
|
| 42 |
+
n = min(k, count)
|
| 43 |
+
kwargs: dict = {"query_texts": [text], "n_results": n}
|
| 44 |
+
if where:
|
| 45 |
+
kwargs["where"] = where
|
| 46 |
+
results = collection.query(**kwargs)
|
| 47 |
+
docs = results["documents"][0]
|
| 48 |
+
metas = results["metadatas"][0]
|
| 49 |
+
dists = results["distances"][0]
|
| 50 |
+
return [
|
| 51 |
+
{"text": doc, "metadata": meta, "distance": dist}
|
| 52 |
+
for doc, meta, dist in zip(docs, metas, dists)
|
| 53 |
+
]
|
specs/06_vectorstore_and_bidder_processor.md
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Spec 06 — Vector Store and Bidder Processor
|
| 2 |
+
|
| 3 |
+
**Step:** 8 of 15
|
| 4 |
+
**Time budget:** ~25 min
|
| 5 |
+
**Checkpoint:** `process_bidder("bidder_a", ...)` indexes all docs; `gather_evidence("bidder_a", turnover_criterion)` returns chunks mentioning the turnover figure.
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Goal
|
| 10 |
+
|
| 11 |
+
Implement `core/vectorstore.py` (ChromaDB persistent client helpers) and `core/bidder_processor.py` (document ingestion + evidence retrieval per criterion).
|
| 12 |
+
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
## `core/vectorstore.py`
|
| 16 |
+
|
| 17 |
+
Uses ChromaDB persistent client with `sentence-transformers/all-MiniLM-L6-v2` embeddings.
|
| 18 |
+
|
| 19 |
+
### `get_client()`
|
| 20 |
+
|
| 21 |
+
```python
|
| 22 |
+
@st.cache_resource
|
| 23 |
+
def get_client():
|
| 24 |
+
import chromadb
|
| 25 |
+
from core.config import CHROMA_DIR
|
| 26 |
+
return chromadb.PersistentClient(path=CHROMA_DIR)
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
### `get_collection(name: str)`
|
| 30 |
+
|
| 31 |
+
```python
|
| 32 |
+
def get_collection(name: str):
|
| 33 |
+
client = get_client()
|
| 34 |
+
return client.get_or_create_collection(
|
| 35 |
+
name=name,
|
| 36 |
+
metadata={"hnsw:space": "cosine"},
|
| 37 |
+
)
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
Note: ChromaDB default embedding function uses `all-MiniLM-L6-v2` (~80 MB, downloaded on first run).
|
| 41 |
+
|
| 42 |
+
### `add_chunks(collection, chunks: list[dict], metadatas: list[dict]) -> None`
|
| 43 |
+
|
| 44 |
+
- IDs: `hashlib.sha256(chunk["text"].encode()).hexdigest()[:16]` — deduplicates across reruns.
|
| 45 |
+
- Calls `collection.upsert(documents=[c["text"] for c in chunks], ids=ids, metadatas=metadatas)`.
|
| 46 |
+
|
| 47 |
+
### `query(collection, text: str, k: int = 4, where: dict | None = None) -> list[dict]`
|
| 48 |
+
|
| 49 |
+
- Calls `collection.query(query_texts=[text], n_results=k, where=where)` (omit `where` if None).
|
| 50 |
+
- Returns `[{"text": doc, "metadata": meta, "distance": dist}, ...]` from the first result set.
|
| 51 |
+
- Handle the case where fewer than `k` documents are in the collection (ChromaDB raises if `n_results > len(collection)`).
|
| 52 |
+
|
| 53 |
+
---
|
| 54 |
+
|
| 55 |
+
## `core/bidder_processor.py`
|
| 56 |
+
|
| 57 |
+
### `process_bidder(bidder_id: str, files: list[Path]) -> None`
|
| 58 |
+
|
| 59 |
+
For each file in `files`:
|
| 60 |
+
1. `pages = ocr_pipeline.extract_document(file)`.
|
| 61 |
+
2. `chunks = chunker.chunk_bidder(pages, bidder_id, file.name)`.
|
| 62 |
+
3. Build metadatas list — one per chunk:
|
| 63 |
+
```python
|
| 64 |
+
{"bidder_id": bidder_id, "doc_name": file.name,
|
| 65 |
+
"page": chunk["page"], "source_type": chunk["source_type"],
|
| 66 |
+
"ocr_confidence": chunk["ocr_confidence"]}
|
| 67 |
+
```
|
| 68 |
+
4. `collection = vectorstore.get_collection("bidder_chunks")`.
|
| 69 |
+
5. `vectorstore.add_chunks(collection, chunks, metadatas)`.
|
| 70 |
+
6. `audit.log("bidder_processed", bidder_id=bidder_id, doc_name=file.name, chunk_count=len(chunks))`.
|
| 71 |
+
|
| 72 |
+
### `gather_evidence(bidder_id: str, criterion: Criterion, k: int = 4) -> list[Evidence]`
|
| 73 |
+
|
| 74 |
+
1. Build query string: `f"{criterion.title} {' '.join(criterion.query_hints)}"`.
|
| 75 |
+
2. `collection = vectorstore.get_collection("bidder_chunks")`.
|
| 76 |
+
3. `results = vectorstore.query(collection, query, k=k, where={"bidder_id": bidder_id})`.
|
| 77 |
+
4. Map each result to `Evidence`:
|
| 78 |
+
```python
|
| 79 |
+
Evidence(
|
| 80 |
+
bidder_id=bidder_id,
|
| 81 |
+
doc_name=meta["doc_name"],
|
| 82 |
+
page=meta["page"],
|
| 83 |
+
text=result["text"],
|
| 84 |
+
source_type=meta["source_type"],
|
| 85 |
+
ocr_confidence=meta.get("ocr_confidence"),
|
| 86 |
+
)
|
| 87 |
+
```
|
| 88 |
+
5. Return list.
|
| 89 |
+
|
| 90 |
+
---
|
| 91 |
+
|
| 92 |
+
## Acceptance Criteria
|
| 93 |
+
|
| 94 |
+
1. `process_bidder("bidder_a", [path1, path2, ...])` completes without error and logs audit entries.
|
| 95 |
+
2. `gather_evidence("bidder_a", c1_criterion)` returns at least 1 `Evidence` object.
|
| 96 |
+
3. The strongest evidence for Bidder A's turnover mentions "6,20,00,000" or "INR".
|
| 97 |
+
4. Calling `process_bidder` twice on the same files does not duplicate chunks (upsert).
|