feat(rag): ingest CLI (markdown/PDF → chunks → FAISS) + sample KB fixtures
Browse files
src/rag/ingest.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Walk a knowledge-base directory, chunk each file, embed, persist FAISS index.
|
| 2 |
+
|
| 3 |
+
CLI entry point: `python -m src.rag.ingest [<input_dir> [<output_dir>]]`.
|
| 4 |
+
Defaults: input=`data/knowledge_base/`, output=`data/processed/faiss_index/`.
|
| 5 |
+
|
| 6 |
+
Supported file types: `.md`, `.txt`, `.pdf`. Other extensions are ignored
|
| 7 |
+
with a logged WARNING.
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import sys
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
from src.core.logger import get_logger
|
| 15 |
+
from src.rag.chunker import chunk_text
|
| 16 |
+
from src.rag.embed import EMBEDDING_DIM, Embedder
|
| 17 |
+
from src.rag.store import FAISSStore
|
| 18 |
+
|
| 19 |
+
logger = get_logger(__name__)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Default locations, relative to the repository root (overridable via CLI args).
_DEFAULT_INPUT = Path("data/knowledge_base")
_DEFAULT_OUTPUT = Path("data/processed/faiss_index")
# Extensions the ingester will read; anything else is ignored.
_SUPPORTED = {".md", ".txt", ".pdf"}
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _read_pdf(path: Path) -> str:
    """Extract text from every page of the PDF at *path*, pages joined by blank lines.

    Pages with no extractable text contribute an empty string rather than None.
    """
    # Deferred import: pypdf is only required when a PDF is actually ingested.
    from pypdf import PdfReader

    pages = PdfReader(str(path)).pages
    return "\n\n".join(page.extract_text() or "" for page in pages)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _read_file(path: Path) -> str:
|
| 34 |
+
suffix = path.suffix.lower()
|
| 35 |
+
if suffix == ".pdf":
|
| 36 |
+
return _read_pdf(path)
|
| 37 |
+
return path.read_text(encoding="utf-8", errors="replace")
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def ingest_directory(input_dir: Path, output_dir: Path) -> int:
    """Ingest every supported file in `input_dir` into a FAISS index at `output_dir`.

    Walks the tree recursively, chunks each `.md`/`.txt`/`.pdf` file, embeds
    all chunks in one batch, and persists the index. Files with unsupported
    extensions are skipped with a logged WARNING (per the module contract).
    An empty or missing input directory still produces a saved (empty) index.

    Returns the total number of chunks indexed.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)

    files: list[Path] = []
    for p in sorted(input_dir.rglob("*")):
        if not p.is_file():
            # Directories can carry a misleading suffix (e.g. "notes.md/") — skip them.
            continue
        if p.suffix.lower() in _SUPPORTED:
            files.append(p)
        else:
            # The module docstring promises a logged WARNING for ignored types.
            logger.warning("Ignoring unsupported file type: %s", p)
    logger.info("Ingesting %d file(s) from %s", len(files), input_dir)

    all_chunks: list[dict] = []
    for path in files:
        try:
            text = _read_file(path)
        except Exception as e:
            # Best-effort: a single unreadable file must not abort the whole run.
            logger.warning("Skipping %s (read failed: %s)", path, e)
            continue
        for i, ch in enumerate(chunk_text(text)):
            all_chunks.append({
                "text": ch,
                # Path relative to the KB root, for citation in retrieval results.
                "source": str(path.relative_to(input_dir)),
                "chunk_index": i,
            })

    store = FAISSStore(dim=EMBEDDING_DIM)
    if all_chunks:
        # Encode every chunk text in a single call before adding to the store.
        embedder = Embedder()
        vectors = embedder.encode([c["text"] for c in all_chunks])
        store.add(vectors, all_chunks)

    store.save(output_dir)
    logger.info("Indexed %d chunk(s) → %s", len(all_chunks), output_dir)
    return len(all_chunks)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def main() -> None:
    """CLI entry point: `python -m src.rag.ingest [<input_dir> [<output_dir>]]`."""
    argv = sys.argv[1:]
    src = Path(argv[0]) if argv else _DEFAULT_INPUT
    dst = Path(argv[1]) if len(argv) > 1 else _DEFAULT_OUTPUT
    total = ingest_directory(src, dst)
    print(f"Indexed {total} chunks into {dst}")


if __name__ == "__main__":
    main()
|
tests/fixtures/kb_sample/combat_harmonization_primer.md
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ComBat Harmonization for Multi-Site Neuroimaging
|
| 2 |
+
|
| 3 |
+
ComBat (Johnson et al. 2007, adapted to MRI by Fortin et al. 2017, 2018)
|
| 4 |
+
is the de facto standard for removing scanner / acquisition-site bias
|
| 5 |
+
from multi-center neuroimaging studies.
|
| 6 |
+
|
| 7 |
+
## How it works
|
| 8 |
+
|
| 9 |
+
ComBat models per-site location (mean) and scale (variance) parameters
|
| 10 |
+
using an empirical-Bayes hierarchical framework. It estimates these
|
| 11 |
+
parameters jointly across all sites and shrinks them toward a global
|
| 12 |
+
prior — small-N sites are pulled toward the global mean, preventing
|
| 13 |
+
overfitting.
|
| 14 |
+
|
| 15 |
+
## Site-gap reduction
|
| 16 |
+
|
| 17 |
+
A typical demonstration: the per-site mean of a hippocampus volume
|
| 18 |
+
feature can vary by 5+ standard deviations across hospitals. ComBat
|
| 19 |
+
typically collapses this gap to <0.005 — a 1000x+ reduction — while
|
| 20 |
+
preserving within-site biological variance (age, sex, diagnosis).
|
| 21 |
+
|
| 22 |
+
## When it fails
|
| 23 |
+
|
| 24 |
+
ComBat requires at least 2 sites with overlapping covariate
|
| 25 |
+
distributions. Single-site data, or sites with completely disjoint
|
| 26 |
+
populations (e.g., one site enrolling only pediatric patients, another only elderly),
|
| 27 |
+
produce unreliable harmonization.
|
tests/fixtures/kb_sample/lipinski_rule_of_five.md
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Lipinski's Rule of Five — BBB Permeability Heuristic
|
| 2 |
+
|
| 3 |
+
Lipinski's Rule of Five (Lipinski 1997, 2001) is the foundational
|
| 4 |
+
medicinal-chemistry rule for predicting whether a small molecule will
|
| 5 |
+
cross the blood-brain barrier (BBB) by passive diffusion.
|
| 6 |
+
|
| 7 |
+
## The four criteria
|
| 8 |
+
|
| 9 |
+
A molecule is likely BBB-permeable if it satisfies all four:
|
| 10 |
+
|
| 11 |
+
1. Molecular weight (MW) <= 500 Daltons
|
| 12 |
+
2. Octanol-water partition coefficient (logP) <= 5
|
| 13 |
+
3. Hydrogen-bond donors <= 5
|
| 14 |
+
4. Hydrogen-bond acceptors <= 10
|
| 15 |
+
|
| 16 |
+
Molecules violating two or more criteria are typically poorly absorbed
|
| 17 |
+
or impermeant.
|
| 18 |
+
|
| 19 |
+
## Why ethanol crosses
|
| 20 |
+
|
| 21 |
+
Ethanol (CCO) has MW=46 Da, logP=-0.31, 1 H-bond donor, 1 H-bond
|
| 22 |
+
acceptor — well within all four thresholds. This explains its rapid
|
| 23 |
+
CNS penetration despite hydrophilicity.
|
| 24 |
+
|
| 25 |
+
## SHAP attribution interpretation
|
| 26 |
+
|
| 27 |
+
When a Random Forest BBB classifier flags Morgan fingerprint bits with
|
| 28 |
+
positive SHAP values toward a "permeable" label, the bit usually
|
| 29 |
+
corresponds to a small lipophilic substructure (CH3-, -OCH3-, aromatic
|
| 30 |
+
ring) consistent with Lipinski compliance.
|
tests/fixtures/kb_sample/mne_ica_basics.md
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MNE-Python ICA for EEG Artifact Removal
|
| 2 |
+
|
| 3 |
+
Independent Component Analysis (ICA, Hyvärinen 1999) decomposes a
|
| 4 |
+
multi-channel EEG recording into statistically independent source
|
| 5 |
+
components. It is the de facto method for removing eye-blink and
|
| 6 |
+
heartbeat artifacts before downstream analysis.
|
| 7 |
+
|
| 8 |
+
## Why ICA, not PCA
|
| 9 |
+
|
| 10 |
+
PCA decomposes signals into orthogonal components — but neural sources
|
| 11 |
+
are not orthogonal in scalp space, they are statistically independent.
|
| 12 |
+
ICA's independence assumption matches the physics: the eye, the heart,
|
| 13 |
+
and cortical sources fire on uncorrelated schedules.
|
| 14 |
+
|
| 15 |
+
## The standard workflow
|
| 16 |
+
|
| 17 |
+
1. Bandpass the raw recording at 0.5-40 Hz to remove DC drift and line
|
| 18 |
+
noise (50/60 Hz).
|
| 19 |
+
2. Fit ICA with N components (typically 15-30, less than channel count).
|
| 20 |
+
3. Identify artifact components by correlating each ICA source with the
|
| 21 |
+
EOG (eye) channel; reject components with |correlation| > 0.5.
|
| 22 |
+
4. Reconstruct the cleaned signal by zeroing out the rejected
|
| 23 |
+
components and inverse-transforming.
|
| 24 |
+
|
| 25 |
+
## Quality check
|
| 26 |
+
|
| 27 |
+
Post-ICA, the EOG channel should show minimal residual correlation
|
| 28 |
+
with frontal channels (Fp1/Fp2). If it doesn't, the ICA fit was likely
|
| 29 |
+
unstable — re-run with a different random seed or more components.
|
tests/rag/test_ingest.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for src.rag.ingest — walk a directory, chunk, embed, persist."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import shutil
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
from src.rag.ingest import ingest_directory
|
| 10 |
+
from src.rag.store import FAISSStore
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
_FIXTURE_KB = Path(__file__).parent.parent / "fixtures" / "kb_sample"
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class TestIngestDirectory:
    """End-to-end checks for `ingest_directory` against the sample KB fixtures."""

    def test_ingests_markdown_files(self, tmp_path: Path) -> None:
        index_dir = tmp_path / "idx"
        total = ingest_directory(_FIXTURE_KB, index_dir)
        # Every fixture file should contribute at least one chunk.
        assert total > 0
        for artifact in ("index.bin", "chunks.json"):
            assert (index_dir / artifact).exists()

    def test_loaded_store_is_searchable(self, tmp_path: Path) -> None:
        index_dir = tmp_path / "idx"
        ingest_directory(_FIXTURE_KB, index_dir)
        from src.rag.embed import EMBEDDING_DIM
        store = FAISSStore.load(index_dir, dim=EMBEDDING_DIM)
        assert len(store) > 0
        # Each persisted chunk carries its text and provenance metadata.
        for chunk in store._chunks:
            assert "source" in chunk
            assert "text" in chunk

    def test_empty_directory_creates_empty_index(self, tmp_path: Path) -> None:
        kb_dir = tmp_path / "empty_kb"
        kb_dir.mkdir()
        index_dir = tmp_path / "idx"
        assert ingest_directory(kb_dir, index_dir) == 0
        # An empty KB still persists a (zero-entry) index file.
        assert (index_dir / "index.bin").exists()
|