mekosotto commited on
Commit
cf5c011
·
1 Parent(s): 57801de

feat(rag): ingest CLI (markdown/PDF → chunks → FAISS) + sample KB fixtures

Browse files
src/rag/ingest.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Walk a knowledge-base directory, chunk each file, embed, persist FAISS index.
2
+
3
+ CLI entry point: `python -m src.rag.ingest [<input_dir> [<output_dir>]]`.
4
+ Defaults: input=`data/knowledge_base/`, output=`data/processed/faiss_index/`.
5
+
6
+ Supported file types: `.md`, `.txt`, `.pdf`. Other extensions are ignored
7
+ with a logged WARNING.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import sys
12
+ from pathlib import Path
13
+
14
+ from src.core.logger import get_logger
15
+ from src.rag.chunker import chunk_text
16
+ from src.rag.embed import EMBEDDING_DIM, Embedder
17
+ from src.rag.store import FAISSStore
18
+
19
+ logger = get_logger(__name__)
20
+
21
+
22
+ _DEFAULT_INPUT = Path("data/knowledge_base")
23
+ _DEFAULT_OUTPUT = Path("data/processed/faiss_index")
24
+ _SUPPORTED = {".md", ".txt", ".pdf"}
25
+
26
+
27
+ def _read_pdf(path: Path) -> str:
28
+ from pypdf import PdfReader
29
+ reader = PdfReader(str(path))
30
+ return "\n\n".join(page.extract_text() or "" for page in reader.pages)
31
+
32
+
33
+ def _read_file(path: Path) -> str:
34
+ suffix = path.suffix.lower()
35
+ if suffix == ".pdf":
36
+ return _read_pdf(path)
37
+ return path.read_text(encoding="utf-8", errors="replace")
38
+
39
+
40
+ def ingest_directory(input_dir: Path, output_dir: Path) -> int:
41
+ """Ingest every supported file in `input_dir` into a FAISS index at `output_dir`.
42
+
43
+ Returns the total number of chunks indexed.
44
+ """
45
+ input_dir = Path(input_dir)
46
+ output_dir = Path(output_dir)
47
+
48
+ files = sorted(p for p in input_dir.rglob("*") if p.suffix.lower() in _SUPPORTED)
49
+ logger.info("Ingesting %d file(s) from %s", len(files), input_dir)
50
+
51
+ all_chunks: list[dict] = []
52
+ for path in files:
53
+ try:
54
+ text = _read_file(path)
55
+ except Exception as e:
56
+ logger.warning("Skipping %s (read failed: %s)", path, e)
57
+ continue
58
+ for i, ch in enumerate(chunk_text(text)):
59
+ all_chunks.append({
60
+ "text": ch,
61
+ "source": str(path.relative_to(input_dir)),
62
+ "chunk_index": i,
63
+ })
64
+
65
+ store = FAISSStore(dim=EMBEDDING_DIM)
66
+ if all_chunks:
67
+ embedder = Embedder()
68
+ vectors = embedder.encode([c["text"] for c in all_chunks])
69
+ store.add(vectors, all_chunks)
70
+
71
+ store.save(output_dir)
72
+ logger.info("Indexed %d chunk(s) → %s", len(all_chunks), output_dir)
73
+ return len(all_chunks)
74
+
75
+
76
+ def main() -> None:
77
+ args = sys.argv[1:]
78
+ inp = Path(args[0]) if len(args) >= 1 else _DEFAULT_INPUT
79
+ out = Path(args[1]) if len(args) >= 2 else _DEFAULT_OUTPUT
80
+ n = ingest_directory(inp, out)
81
+ print(f"Indexed {n} chunks into {out}")
82
+
83
+
84
+ if __name__ == "__main__":
85
+ main()
tests/fixtures/kb_sample/combat_harmonization_primer.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ComBat Harmonization for Multi-Site Neuroimaging
2
+
3
+ ComBat (Johnson et al. 2007, adapted to MRI by Fortin et al. 2017, 2018)
4
+ is the de-facto standard for removing scanner / acquisition-site bias
5
+ from multi-center neuroimaging studies.
6
+
7
+ ## How it works
8
+
9
+ ComBat models per-site location (mean) and scale (variance) parameters
10
+ using an empirical-Bayes hierarchical framework. It estimates these
11
+ parameters jointly across all sites and shrinks them toward a global
12
+ prior — small-N sites are pulled toward the global mean, preventing
13
+ overfitting.
14
+
15
+ ## Site-gap reduction
16
+
17
+ A typical demonstration: the per-site mean of a hippocampus volume
18
+ feature can vary by 5+ standard deviations across hospitals. ComBat
19
+ typically collapses this gap to <0.005 — a 1000x+ reduction — while
20
+ preserving within-site biological variance (age, sex, diagnosis).
21
+
22
+ ## When it fails
23
+
24
+ ComBat requires at least 2 sites with overlapping covariate
25
+ distributions. Single-site data, or sites with completely disjoint
26
+ populations (e.g., one site only-pediatric, another only-elderly),
27
+ produce unreliable harmonization.
tests/fixtures/kb_sample/lipinski_rule_of_five.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Lipinski's Rule of Five — BBB Permeability Heuristic
2
+
3
+ Lipinski's Rule of Five (Lipinski 1997, 2001) is the foundational
4
+ medicinal-chemistry rule for predicting whether a small molecule will
5
+ cross the blood-brain barrier (BBB) by passive diffusion.
6
+
7
+ ## The four criteria
8
+
9
+ A molecule is likely BBB-permeable if it satisfies all four:
10
+
11
+ 1. Molecular weight (MW) <= 500 Daltons
12
+ 2. Octanol-water partition coefficient (logP) <= 5
13
+ 3. Hydrogen-bond donors <= 5
14
+ 4. Hydrogen-bond acceptors <= 10
15
+
16
+ Molecules violating two or more criteria are typically poorly absorbed
17
+ or impermeant.
18
+
19
+ ## Why ethanol crosses
20
+
21
+ Ethanol (CCO) has MW=46 Da, logP=-0.31, 1 H-bond donor, 1 H-bond
22
+ acceptor — well within all four thresholds. This explains its rapid
23
+ CNS penetration despite hydrophilicity.
24
+
25
+ ## SHAP attribution interpretation
26
+
27
+ When a Random Forest BBB classifier flags Morgan fingerprint bits with
28
+ positive SHAP values toward a "permeable" label, the bit usually
29
+ corresponds to a small lipophilic substructure (CH3-, -OCH3-, aromatic
30
+ ring) consistent with Lipinski compliance.
tests/fixtures/kb_sample/mne_ica_basics.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MNE-Python ICA for EEG Artifact Removal
2
+
3
+ Independent Component Analysis (ICA, Hyvärinen 1999) decomposes a
4
+ multi-channel EEG recording into statistically independent source
5
+ components. It is the de-facto method for removing eye-blink and
6
+ heartbeat artifacts before downstream analysis.
7
+
8
+ ## Why ICA, not PCA
9
+
10
+ PCA decomposes signals into orthogonal components — but neural sources
11
+ are not orthogonal in scalp space, they are statistically independent.
12
+ ICA's independence assumption matches the physics: the eye, the heart,
13
+ and cortical sources fire on uncorrelated schedules.
14
+
15
+ ## The standard workflow
16
+
17
+ 1. Bandpass the raw recording at 0.5-40 Hz to remove DC drift and line
18
+ noise (50/60 Hz).
19
+ 2. Fit ICA with N components (typically 15-30, less than channel count).
20
+ 3. Identify artifact components by correlating each ICA source with the
21
+ EOG (eye) channel; reject components with |correlation| > 0.5.
22
+ 4. Reconstruct the cleaned signal by zeroing out the rejected
23
+ components and inverse-transforming.
24
+
25
+ ## Quality check
26
+
27
+ Post-ICA, the EOG channel should show minimal residual correlation
28
+ with frontal channels (Fp1/Fp2). If it doesn't, the ICA fit was likely
29
+ unstable — re-run with a different random seed or more components.
tests/rag/test_ingest.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for src.rag.ingest — walk a directory, chunk, embed, persist."""
2
+ from __future__ import annotations
3
+
4
+ import shutil
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+ from src.rag.ingest import ingest_directory
10
+ from src.rag.store import FAISSStore
11
+
12
+
13
+ _FIXTURE_KB = Path(__file__).parent.parent / "fixtures" / "kb_sample"
14
+
15
+
16
+ class TestIngestDirectory:
17
+ def test_ingests_markdown_files(self, tmp_path: Path) -> None:
18
+ out_dir = tmp_path / "idx"
19
+ n = ingest_directory(_FIXTURE_KB, out_dir)
20
+ assert n > 0 # at least one chunk per fixture file
21
+ assert (out_dir / "index.bin").exists()
22
+ assert (out_dir / "chunks.json").exists()
23
+
24
+ def test_loaded_store_is_searchable(self, tmp_path: Path) -> None:
25
+ out_dir = tmp_path / "idx"
26
+ ingest_directory(_FIXTURE_KB, out_dir)
27
+ from src.rag.embed import EMBEDDING_DIM
28
+ store = FAISSStore.load(out_dir, dim=EMBEDDING_DIM)
29
+ assert len(store) > 0
30
+ # chunks have source metadata
31
+ assert all("source" in c for c in store._chunks)
32
+ assert all("text" in c for c in store._chunks)
33
+
34
+ def test_empty_directory_creates_empty_index(self, tmp_path: Path) -> None:
35
+ empty = tmp_path / "empty_kb"
36
+ empty.mkdir()
37
+ out_dir = tmp_path / "idx"
38
+ n = ingest_directory(empty, out_dir)
39
+ assert n == 0
40
+ assert (out_dir / "index.bin").exists()