"""Build a synthetic TF-IDF clinical-RAG index for tests. Avoids needing real PDFs. Constructs the same payload schema the user's rag.py produces so the loader can be tested independently of pypdf. """ from __future__ import annotations import pickle from datetime import datetime from pathlib import Path from sklearn.feature_extraction.text import TfidfVectorizer from src.rag.clinical.types import ClinicalChunk def build(path: Path) -> Path: """Save a tiny TF-IDF index at `path`.""" path = Path(path) if path.exists(): return path path.parent.mkdir(parents=True, exist_ok=True) chunks = [ ClinicalChunk(0, "alzheimers_lifestyle.pdf", 1, 1, "Aerobic exercise and Mediterranean diet are associated with reduced cognitive decline in older adults at risk for Alzheimer's disease."), ClinicalChunk(1, "parkinsons_motor.pdf", 1, 1, "Levodopa remains the most effective symptomatic treatment for motor symptoms of Parkinson's disease."), ClinicalChunk(2, "alzheimers_mci.pdf", 2, 2, "Mild cognitive impairment may progress to dementia; MMSE and MoCA are standard screening tools."), ClinicalChunk(3, "parkinsons_nutrition.pdf", 1, 1, "Dietary patterns rich in antioxidants and omega-3 fatty acids are linked to lower Parkinson's risk."), ] vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), min_df=1, norm="l2") matrix = vectorizer.fit_transform([c.text for c in chunks]) payload = { "created_at": datetime.now().isoformat(timespec="seconds"), "source_dir": str(path.parent), "chunk_words": 220, "overlap_words": 45, "chunks": chunks, "vectorizer": vectorizer, "matrix": matrix, } with path.open("wb") as f: pickle.dump(payload, f) return path