hackathon / tests /fixtures /build_tiny_clinical_index.py
mekosotto's picture
feat(rag): clinical TF-IDF index loader with __main__.Chunk routing
6b2c154
"""Build a synthetic TF-IDF clinical-RAG index for tests.
Avoids needing real PDFs. Constructs the same payload schema the user's
rag.py produces so the loader can be tested independently of pypdf.
"""
from __future__ import annotations
import pickle
from datetime import datetime
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from src.rag.clinical.types import ClinicalChunk
def build(path: Path) -> Path:
    """Write a small synthetic clinical TF-IDF index pickle to ``path``.

    The index is fitted on four hard-coded sentences, so no PDF files are
    read. If something already exists at ``path`` it is left untouched and
    the call returns immediately; otherwise missing parent directories are
    created before the file is written.

    Args:
        path: Destination of the pickle file. The argument is re-wrapped in
            ``Path`` on entry, so anything ``Path()`` accepts works.

    Returns:
        The destination as a ``Path``.
    """
    target = Path(path)

    if not target.exists():
        target.parent.mkdir(parents=True, exist_ok=True)

        # Positional arguments for each ClinicalChunk. The two integers after
        # the file name are presumably start/end page numbers -- confirm
        # against the ClinicalChunk definition.
        rows = (
            (0, "alzheimers_lifestyle.pdf", 1, 1,
             "Aerobic exercise and Mediterranean diet are associated with reduced cognitive decline in older adults at risk for Alzheimer's disease."),
            (1, "parkinsons_motor.pdf", 1, 1,
             "Levodopa remains the most effective symptomatic treatment for motor symptoms of Parkinson's disease."),
            (2, "alzheimers_mci.pdf", 2, 2,
             "Mild cognitive impairment may progress to dementia; MMSE and MoCA are standard screening tools."),
            (3, "parkinsons_nutrition.pdf", 1, 1,
             "Dietary patterns rich in antioxidants and omega-3 fatty acids are linked to lower Parkinson's risk."),
        )
        corpus = [ClinicalChunk(*row) for row in rows]

        # Unigrams + bigrams with L2-normalised rows; min_df=1 keeps every
        # term, which matters for a corpus this small.
        tfidf = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), min_df=1, norm="l2")
        texts = [chunk.text for chunk in corpus]
        doc_term = tfidf.fit_transform(texts)

        stamp = datetime.now().isoformat(timespec="seconds")
        index = {
            "created_at": stamp,
            "source_dir": str(target.parent),
            "chunk_words": 220,
            "overlap_words": 45,
            "chunks": corpus,
            "vectorizer": tfidf,
            "matrix": doc_term,
        }

        with open(target, "wb") as sink:
            pickle.dump(index, sink)

    return target