"""Build a synthetic TF-IDF clinical-RAG index for tests.

Avoids needing real PDFs. Constructs the same payload schema the user's
rag.py produces so the loader can be tested independently of pypdf.
"""
from __future__ import annotations

import pickle
from datetime import datetime
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer

from src.rag.clinical.types import ClinicalChunk


def build(path: Path) -> Path:
    """Save a tiny synthetic TF-IDF index at `path` and return the path.

    The index is a pickled dict with the keys ``created_at``, ``source_dir``,
    ``chunk_words``, ``overlap_words``, ``chunks``, ``vectorizer`` and
    ``matrix``. It is built from four hard-coded clinical text chunks, so no
    PDFs are needed.

    If `path` already exists it is returned untouched and nothing is rebuilt.
    Otherwise the pickle is written to a sibling temporary file and then
    renamed into place, so an interrupted or concurrent build never leaves a
    truncated file that a later call would mistake for a finished index.

    Args:
        path: Destination file for the pickled index. Anything accepted by
            ``Path(...)`` works; missing parent directories are created.

    Returns:
        `path` as a ``Path`` object.
    """
    import os  # local import: only needed to make the staging file name unique

    path = Path(path)
    if path.exists():
        # Existing file is treated as an already-built index.
        return path
    path.parent.mkdir(parents=True, exist_ok=True)

    chunks = [
        ClinicalChunk(0, "alzheimers_lifestyle.pdf", 1, 1,
                      "Aerobic exercise and Mediterranean diet are associated with reduced cognitive decline in older adults at risk for Alzheimer's disease."),
        ClinicalChunk(1, "parkinsons_motor.pdf", 1, 1,
                      "Levodopa remains the most effective symptomatic treatment for motor symptoms of Parkinson's disease."),
        ClinicalChunk(2, "alzheimers_mci.pdf", 2, 2,
                      "Mild cognitive impairment may progress to dementia; MMSE and MoCA are standard screening tools."),
        ClinicalChunk(3, "parkinsons_nutrition.pdf", 1, 1,
                      "Dietary patterns rich in antioxidants and omega-3 fatty acids are linked to lower Parkinson's risk."),
    ]

    # Fit on the chunk texts only; min_df=1 keeps every term of this tiny corpus.
    vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1, 2), min_df=1, norm="l2")
    matrix = vectorizer.fit_transform([c.text for c in chunks])

    payload = {
        "created_at": datetime.now().isoformat(timespec="seconds"),
        "source_dir": str(path.parent),
        "chunk_words": 220,
        "overlap_words": 45,
        "chunks": chunks,
        "vectorizer": vectorizer,
        "matrix": matrix,
    }

    # Stage the pickle next to the target (same filesystem) under a
    # per-process name, then publish it with a single atomic rename.
    tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
    try:
        with tmp_path.open("wb") as f:
            pickle.dump(payload, f)
        tmp_path.replace(path)
    except BaseException:
        # Do not leave a half-written staging file behind on failure.
        try:
            tmp_path.unlink()
        except FileNotFoundError:
            pass
        raise
    return path