Spaces:

mekosotto
/

hackathon

Running

App Files Files Community

hackathon / tests /fixtures /build_tiny_clinical_index.py

mekosotto

feat(rag): clinical TF-IDF index loader with __main__.Chunk routing

6b2c154 22 days ago

raw

history blame contribute delete

1.91 kB

	"""Build a synthetic TF-IDF clinical-RAG index for tests.

	Avoids needing real PDFs. Constructs the same payload schema the user's
	rag.py produces so the loader can be tested independently of pypdf.
	"""
	from __future__ import annotations

	import pickle
	from datetime import datetime
	from pathlib import Path

	from sklearn.feature_extraction.text import TfidfVectorizer

	from src.rag.clinical.types import ClinicalChunk


	def build(path: Path) -> Path:
	    """Save a tiny TF-IDF index at `path` and return the path.

	    If `path` already exists it is returned untouched and nothing is rebuilt.
	    Otherwise the parent directory is created, a TF-IDF vectorizer is fitted
	    on four synthetic clinical chunks, and a payload dict is pickled to
	    `path`.

	    The pickle is written to a sibling temporary file first and then moved
	    into place, so an interrupted or concurrent build never leaves a
	    truncated file that the `path.exists()` shortcut would later accept as
	    a finished index.

	    Args:
	        path: Destination of the pickled index; coerced with `Path(path)`.

	    Returns:
	        The destination as a `Path`.
	    """
	    import os  # local import: only needed to derive a per-process temp name

	    path = Path(path)
	    if path.exists():
	        return path
	    path.parent.mkdir(parents=True, exist_ok=True)

	    chunks = [
	        ClinicalChunk(0, "alzheimers_lifestyle.pdf", 1, 1,
	                      "Aerobic exercise and Mediterranean diet are associated with reduced cognitive decline in older adults at risk for Alzheimer's disease."),
	        ClinicalChunk(1, "parkinsons_motor.pdf", 1, 1,
	                      "Levodopa remains the most effective symptomatic treatment for motor symptoms of Parkinson's disease."),
	        ClinicalChunk(2, "alzheimers_mci.pdf", 2, 2,
	                      "Mild cognitive impairment may progress to dementia; MMSE and MoCA are standard screening tools."),
	        ClinicalChunk(3, "parkinsons_nutrition.pdf", 1, 1,
	                      "Dietary patterns rich in antioxidants and omega-3 fatty acids are linked to lower Parkinson's risk."),
	    ]

	    vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), min_df=1, norm="l2")
	    matrix = vectorizer.fit_transform([c.text for c in chunks])

	    payload = {
	        "created_at": datetime.now().isoformat(timespec="seconds"),
	        "source_dir": str(path.parent),
	        "chunk_words": 220,
	        "overlap_words": 45,
	        "chunks": chunks,
	        "vectorizer": vectorizer,
	        "matrix": matrix,
	    }

	    # Write next to the destination (same filesystem) so the final rename
	    # replaces the target in one step; the pid keeps parallel workers from
	    # sharing a temp file.
	    tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
	    try:
	        with tmp_path.open("wb") as f:
	            pickle.dump(payload, f)
	        tmp_path.replace(path)
	    finally:
	        # No-op after a successful replace; removes the partial file on failure.
	        tmp_path.unlink(missing_ok=True)
	    return path