Spaces:

JaydeepR
/

TenderIQ

Sleeping

JaydeepR Claude Sonnet 4.6 committed 15 days ago

Commit

a337229

1 Parent(s): 1564d1d

Step 8: vector store and bidder processor — ChromaDB indexing and retrieval

Implements specs/06_vectorstore_and_bidder_processor.md. get_client cached
with st.cache_resource; upsert deduplicates via sha256 IDs. process_bidder
runs OCR pipeline per file, chunks, and indexes with bidder_id metadata.
gather_evidence queries top-k chunks for a criterion's query_hints.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (3) hide show

core/bidder_processor.py +48 -2
core/vectorstore.py +41 -4
specs/06_vectorstore_and_bidder_processor.md +97 -0

core/bidder_processor.py CHANGED Viewed

@@ -1,10 +1,56 @@
 from pathlib import Path
 from core.schemas import Criterion, Evidence
 def process_bidder(bidder_id: str, files: list[Path]) -> None:
-    raise NotImplementedError
 def gather_evidence(bidder_id: str, criterion: Criterion, k: int = 4) -> list[Evidence]:
-    raise NotImplementedError

 from pathlib import Path
+from core import audit, vectorstore
+from core.chunker import chunk_bidder
+from core.ocr_pipeline import extract_document
 from core.schemas import Criterion, Evidence
 def process_bidder(bidder_id: str, files: list[Path]) -> None:
+    collection = vectorstore.get_collection("bidder_chunks")
+    for file in files:
+        pages = extract_document(file)
+        chunks = chunk_bidder(pages, bidder_id, file.name)
+        if not chunks:
+            continue
+        metadatas = [
+            {
+                "bidder_id": bidder_id,
+                "doc_name": chunk["doc_name"],
+                "page": chunk["page"],
+                "source_type": chunk["source_type"],
+                "ocr_confidence": float(chunk["ocr_confidence"])
+                if chunk["ocr_confidence"] is not None else -1.0,
+            }
+            for chunk in chunks
+        ]
+        vectorstore.add_chunks(collection, chunks, metadatas)
+        audit.log(
+            "bidder_processed",
+            bidder_id=bidder_id,
+            doc_name=file.name,
+            chunk_count=len(chunks),
+        )
 def gather_evidence(bidder_id: str, criterion: Criterion, k: int = 4) -> list[Evidence]:
+    query_text = f"{criterion.title} {' '.join(criterion.query_hints)}"
+    collection = vectorstore.get_collection("bidder_chunks")
+    results = vectorstore.query(
+        collection, query_text, k=k, where={"bidder_id": bidder_id}
+    )
+    evidence = []
+    for r in results:
+        meta = r["metadata"]
+        ocr_conf = meta.get("ocr_confidence")
+        if ocr_conf is not None and ocr_conf < 0:
+            ocr_conf = None
+        evidence.append(Evidence(
+            bidder_id=bidder_id,
+            doc_name=meta["doc_name"],
+            page=meta["page"],
+            text=r["text"],
+            source_type=meta["source_type"],
+            ocr_confidence=ocr_conf,
+        ))
+    return evidence

core/vectorstore.py CHANGED Viewed

@@ -1,16 +1,53 @@
 def get_client():
-    raise NotImplementedError
 def get_collection(name: str):
-    raise NotImplementedError
 def add_chunks(collection, chunks: list[dict], metadatas: list[dict]) -> None:
-    raise NotImplementedError
 def query(
     collection, text: str, k: int = 4, where: dict | None = None
 ) -> list[dict]:
-    raise NotImplementedError

+import hashlib
+import streamlit as st
+from core.config import CHROMA_DIR
+@st.cache_resource
 def get_client():
+    import chromadb
+    return chromadb.PersistentClient(path=CHROMA_DIR)
 def get_collection(name: str):
+    client = get_client()
+    return client.get_or_create_collection(
+        name=name,
+        metadata={"hnsw:space": "cosine"},
+    )
 def add_chunks(collection, chunks: list[dict], metadatas: list[dict]) -> None:
+    if not chunks:
+        return
+    ids = [
+        hashlib.sha256(c["text"].encode()).hexdigest()[:16]
+        for c in chunks
+    ]
+    collection.upsert(
+        documents=[c["text"] for c in chunks],
+        ids=ids,
+        metadatas=metadatas,
+    )
 def query(
     collection, text: str, k: int = 4, where: dict | None = None
 ) -> list[dict]:
+    count = collection.count()
+    if count == 0:
+        return []
+    n = min(k, count)
+    kwargs: dict = {"query_texts": [text], "n_results": n}
+    if where:
+        kwargs["where"] = where
+    results = collection.query(**kwargs)
+    docs = results["documents"][0]
+    metas = results["metadatas"][0]
+    dists = results["distances"][0]
+    return [
+        {"text": doc, "metadata": meta, "distance": dist}
+        for doc, meta, dist in zip(docs, metas, dists)
+    ]

specs/06_vectorstore_and_bidder_processor.md ADDED Viewed

	@@ -0,0 +1,97 @@

+# Spec 06 — Vector Store and Bidder Processor
+**Step:** 8 of 15
+**Time budget:** ~25 min
+**Checkpoint:** `process_bidder("bidder_a", ...)` indexes all docs; `gather_evidence("bidder_a", turnover_criterion)` returns chunks mentioning the turnover figure.
+---
+## Goal
+Implement `core/vectorstore.py` (ChromaDB persistent client helpers) and `core/bidder_processor.py` (document ingestion + evidence retrieval per criterion).
+---
+## `core/vectorstore.py`
+Uses ChromaDB persistent client with `sentence-transformers/all-MiniLM-L6-v2` embeddings.
+### `get_client()`
+```python
+@st.cache_resource
+def get_client():
+    import chromadb
+    from core.config import CHROMA_DIR
+    return chromadb.PersistentClient(path=CHROMA_DIR)
+```
+### `get_collection(name: str)`
+```python
+def get_collection(name: str):
+    client = get_client()
+    return client.get_or_create_collection(
+        name=name,
+        metadata={"hnsw:space": "cosine"},
+    )
+```
+Note: ChromaDB's default embedding function uses `all-MiniLM-L6-v2` (~80 MB, downloaded on the first run).
+### `add_chunks(collection, chunks: list[dict], metadatas: list[dict]) -> None`
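+- IDs: `hashlib.sha256(chunk["text"].encode()).hexdigest()[:16]` — deduplicates across reruns. Because the ID is derived from the chunk text alone, identical text appearing in another document or under another bidder maps to the same ID, so the later upsert replaces the earlier entry's metadata (including `bidder_id`).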
+- Calls `collection.upsert(documents=[c["text"] for c in chunks], ids=ids, metadatas=metadatas)`.
+### `query(collection, text: str, k: int = 4, where: dict | None = None) -> list[dict]`
+- Calls `collection.query(query_texts=[text], n_results=k, where=where)` (omit `where` if None).
+- Returns `[{"text": doc, "metadata": meta, "distance": dist}, ...]` from the first result set.
+- Handle the case where fewer than `k` documents are in the collection (ChromaDB raises if `n_results > len(collection)`).
+---
+## `core/bidder_processor.py`
+### `process_bidder(bidder_id: str, files: list[Path]) -> None`
+For each file in `files`:
+1. `pages = ocr_pipeline.extract_document(file)`.
+2. `chunks = chunker.chunk_bidder(pages, bidder_id, file.name)`.
+3. Build metadatas list — one per chunk:
+   ```python
+   {"bidder_id": bidder_id, "doc_name": file.name,
+    "page": chunk["page"], "source_type": chunk["source_type"],
+    "ocr_confidence": chunk["ocr_confidence"]}
+   ```
+4. `collection = vectorstore.get_collection("bidder_chunks")`.
+5. `vectorstore.add_chunks(collection, chunks, metadatas)`.
+6. `audit.log("bidder_processed", bidder_id=bidder_id, doc_name=file.name, chunk_count=len(chunks))`.
+### `gather_evidence(bidder_id: str, criterion: Criterion, k: int = 4) -> list[Evidence]`
+1. Build query string: `f"{criterion.title} {' '.join(criterion.query_hints)}"`.
+2. `collection = vectorstore.get_collection("bidder_chunks")`.
+3. `results = vectorstore.query(collection, query, k=k, where={"bidder_id": bidder_id})`.
+4. Map each result to `Evidence`:
+   ```python
+   Evidence(
+       bidder_id=bidder_id,
+       doc_name=meta["doc_name"],
+       page=meta["page"],
+       text=result["text"],
+       source_type=meta["source_type"],
+       ocr_confidence=meta.get("ocr_confidence"),
+   )
+   ```
+5. Return list.
+---
+## Acceptance Criteria
+1. `process_bidder("bidder_a", [path1, path2, ...])` completes without error and logs audit entries.
+2. `gather_evidence("bidder_a", c1_criterion)` returns at least 1 `Evidence` object.
+3. The strongest evidence for Bidder A's turnover mentions "6,20,00,000" or "INR".
+4. Calling `process_bidder` twice on the same files does not duplicate chunks (upsert).