Spaces:

mekosotto
/

hackathon

Running

App Files Files Community

mekosotto committed 4 days ago

Commit

6b2c154

1 Parent(s): ac78b6f

feat(rag): clinical TF-IDF index loader with __main__.Chunk routing

Browse files

Files changed (5) hide show

src/rag/clinical/__init__.py +0 -0
src/rag/clinical/loader.py +45 -0
src/rag/clinical/types.py +30 -0
tests/fixtures/build_tiny_clinical_index.py +49 -0
tests/rag/test_clinical_loader.py +30 -0

src/rag/clinical/__init__.py ADDED Viewed

File without changes

src/rag/clinical/loader.py ADDED Viewed

	@@ -0,0 +1,45 @@

+"""Load (or rebuild) the TF-IDF clinical RAG index.
+The user's rag.py builds the index from `__main__.Chunk` (a frozen
+dataclass). Picking up that pickle from a different module path requires
+a custom Unpickler that re-routes the class — see _ChunkRoutingUnpickler.
+"""
+from __future__ import annotations
+import pickle
+from pathlib import Path
+from typing import Any
+from src.core.logger import get_logger
+from src.rag.clinical.types import ClinicalChunk
+logger = get_logger(__name__)
+class _ChunkRoutingUnpickler(pickle.Unpickler):
+    """Unpickler that resolves the index builder's `Chunk` to `ClinicalChunk`.
+    The builder pickles its chunks under whatever module path it was run
+    from: `__main__` when executed as a script, or `rag` / `rag.rag` when
+    imported as a module. None of those paths is importable from here, so
+    `find_class` substitutes `ClinicalChunk`, which is declared to mirror
+    the builder's dataclass. Every other class is resolved normally.
+    """
+    # Module paths under which the builder's `Chunk` may have been pickled.
+    _BUILDER_MODULES = frozenset({"__main__", "rag", "rag.rag"})
+    def find_class(self, module: str, name: str):
+        """Return the class pickle should instantiate for `module.name`."""
+        is_builder_chunk = name == "Chunk" and module in self._BUILDER_MODULES
+        if not is_builder_chunk:
+            return super().find_class(module, name)
+        return ClinicalChunk
+def load_index(path: Path) -> dict[str, Any]:
+    """Unpickle a TF-IDF index produced by the user's rag.py.
+
+    Args:
+        path: Location of the pickled index file.
+
+    Returns:
+        The unpickled payload dict, containing at least the "chunks",
+        "vectorizer" and "matrix" keys.
+
+    Raises:
+        FileNotFoundError: If `path` does not exist.
+        ValueError: If the payload is not a dict or lacks a required key.
+    """
+    path = Path(path)
+    if not path.exists():
+        raise FileNotFoundError(f"clinical RAG index not found: {path}")
+    # SECURITY: unpickling can execute arbitrary code chosen by whoever wrote
+    # the file. Only load index files produced by a trusted builder.
+    with path.open("rb") as f:
+        payload = _ChunkRoutingUnpickler(f).load()
+    # A pickle can contain any object; reject non-dict payloads explicitly
+    # rather than failing later with a misleading key error or TypeError.
+    if not isinstance(payload, dict):
+        raise ValueError(
+            f"clinical RAG index must be a dict, got {type(payload).__name__}"
+        )
+    expected = {"chunks", "vectorizer", "matrix"}
+    if not expected <= set(payload):
+        # Keys are stringified so sorting cannot fail on mixed-type keys.
+        raise ValueError(
+            f"clinical RAG index missing expected keys: have {sorted(map(str, payload))}, need {sorted(expected)}"
+        )
+    logger.info("loaded clinical RAG index: %d chunks from %s", len(payload["chunks"]), path)
+    return payload

src/rag/clinical/types.py ADDED Viewed

	@@ -0,0 +1,30 @@

+"""Types shared across clinical-RAG modules."""
+from __future__ import annotations
+from dataclasses import dataclass
+from pydantic import BaseModel, Field
+@dataclass(frozen=True)
+class ClinicalChunk:
+    """Mirrors the Chunk dataclass produced by the user's rag.py builder.
+
+    Frozen, so instances cannot be mutated after construction. Keep the
+    field order stable: the test fixture constructs chunks positionally.
+    """
+    chunk_id: int  # numeric identifier of the chunk
+    source: str  # originating document; the test fixture uses PDF filenames
+    page_start: int  # presumably the first page the chunk covers — confirm against rag.py
+    page_end: int  # presumably the last page the chunk covers — confirm against rag.py
+    text: str  # chunk text; the test fixture fits its TF-IDF vectorizer on this field
+class ClinicalEvidence(BaseModel):
+    # One piece of retrieved evidence together with its provenance and score.
+    # NOTE(review): field meanings are inferred from their names and from the
+    # matching ClinicalChunk fields — confirm against the retriever.
+    sentence: str  # the evidence text itself
+    source: str  # presumably the originating document, as in ClinicalChunk.source
+    page_start: int  # presumably the first page of the originating chunk
+    page_end: int  # presumably the last page of the originating chunk
+    score: float = Field(..., ge=0.0)  # required; validation rejects negative values
+class ClinicalRetrievalResult(BaseModel):
+    # Result of one retrieval call: the query, its evidence, and a text summary.
+    query: str  # presumably the query string the retrieval was run for — confirm
+    evidence: list[ClinicalEvidence]  # evidence items; ordering is not specified here
+    summary_text: str = Field(..., description="Pre-formatted RAG feedback for the agent")  # required

tests/fixtures/build_tiny_clinical_index.py ADDED Viewed

	@@ -0,0 +1,49 @@

+"""Build a synthetic TF-IDF clinical-RAG index for tests.
+Avoids needing real PDFs. Constructs the same payload schema the user's
+rag.py produces so the loader can be tested independently of pypdf.
+"""
+from __future__ import annotations
+import pickle
+from datetime import datetime
+from pathlib import Path
+from sklearn.feature_extraction.text import TfidfVectorizer
+from src.rag.clinical.types import ClinicalChunk
+def build(path: Path) -> Path:
+    """Write a four-chunk synthetic TF-IDF index to `path` and return the path.
+    An index file that already exists at `path` is left untouched and its
+    path is returned immediately. Parent directories are created as needed.
+    """
+    target = Path(path)
+    if target.exists():
+        return target
+    target.parent.mkdir(parents=True, exist_ok=True)
+    # One (source, page, text) triple per chunk. Chunk ids follow list order
+    # and every synthetic chunk starts and ends on the same page.
+    specs = [
+        ("alzheimers_lifestyle.pdf", 1,
+         "Aerobic exercise and Mediterranean diet are associated with reduced cognitive decline in older adults at risk for Alzheimer's disease."),
+        ("parkinsons_motor.pdf", 1,
+         "Levodopa remains the most effective symptomatic treatment for motor symptoms of Parkinson's disease."),
+        ("alzheimers_mci.pdf", 2,
+         "Mild cognitive impairment may progress to dementia; MMSE and MoCA are standard screening tools."),
+        ("parkinsons_nutrition.pdf", 1,
+         "Dietary patterns rich in antioxidants and omega-3 fatty acids are linked to lower Parkinson's risk."),
+    ]
+    chunks = [
+        ClinicalChunk(chunk_id, source, page, page, text)
+        for chunk_id, (source, page, text) in enumerate(specs)
+    ]
+    vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), min_df=1, norm="l2")
+    texts = [chunk.text for chunk in chunks]
+    matrix = vectorizer.fit_transform(texts)
+    # Same top-level keys as the builder's payload, per this module's docstring.
+    payload = {
+        "created_at": datetime.now().isoformat(timespec="seconds"),
+        "source_dir": str(target.parent),
+        "chunk_words": 220,
+        "overlap_words": 45,
+        "chunks": chunks,
+        "vectorizer": vectorizer,
+        "matrix": matrix,
+    }
+    with target.open("wb") as out:
+        pickle.dump(payload, out)
+    return target

tests/rag/test_clinical_loader.py ADDED Viewed

	@@ -0,0 +1,30 @@

+"""Tests for src.rag.clinical.loader."""
+from __future__ import annotations
+from pathlib import Path
+import pytest
+from src.rag.clinical import loader
+from tests.fixtures.build_tiny_clinical_index import build as build_tiny
+class TestLoadIndex:
+    """Exercise `loader.load_index` against the tiny synthetic index."""
+    @staticmethod
+    def _load_tiny(tmp_path: Path):
+        """Build the tiny index under `tmp_path` and load it back."""
+        return loader.load_index(build_tiny(tmp_path / "tiny.pkl"))
+    def test_load_returns_payload_with_expected_keys(self, tmp_path: Path) -> None:
+        payload = self._load_tiny(tmp_path)
+        required = {"chunks", "vectorizer", "matrix"}
+        assert required.issubset(payload)
+        assert len(payload["chunks"]) == 4
+    def test_missing_index_raises(self, tmp_path: Path) -> None:
+        absent = tmp_path / "nope.pkl"
+        with pytest.raises(FileNotFoundError, match="clinical RAG index not found"):
+            loader.load_index(absent)
+    def test_unique_sources(self, tmp_path: Path) -> None:
+        payload = self._load_tiny(tmp_path)
+        expected_sources = {
+            "alzheimers_lifestyle.pdf", "parkinsons_motor.pdf",
+            "alzheimers_mci.pdf", "parkinsons_nutrition.pdf",
+        }
+        assert {chunk.source for chunk in payload["chunks"]} == expected_sources

feat(rag): clinical TF-IDF index loader with __main__.Chunk routing

feat(rag): clinical TF-IDF index loader with __main__.Chunk routing