| """Paragraph-aware recursive character splitter for RAG ingestion. | |
| Public entry: `chunk_text(text, max_chars, overlap)`. Splits on the first | |
| of [paragraph break, sentence end, newline, space] that fits inside the | |
| window. Empty / whitespace-only inputs return []. | |
| """ | |
| from __future__ import annotations | |
# Boundary candidates in priority order: paragraph break, sentence end,
# line break, word break. The first one found inside the window wins.
_SEPARATORS: tuple[str, ...] = ("\n\n", ". ", "\n", " ")


def chunk_text(text: str, max_chars: int = 600, overlap: int = 80) -> list[str]:
    """Split `text` into chunks of at most `max_chars`, with `overlap` carry-over.

    The text is stripped, then consumed window by window. Each window is cut
    at the last occurrence of the highest-priority separator in
    `_SEPARATORS` that lies in the *new* part of the window (past the end of
    the previous chunk); if none is found the window is cut hard at
    `max_chars`. The next window starts `overlap` characters before the cut.

    Args:
        text: Input text. Empty / whitespace-only input returns [].
        max_chars: Maximum length of a chunk, in characters. Must be >= 1.
        overlap: Number of trailing characters of a chunk repeated at the
            start of the next one. Must be >= 0. If the chunk just emitted
            is not longer than `overlap` (always the case when
            `overlap >= max_chars`), the overlap is dropped for that
            boundary so the splitter keeps moving in whole-chunk steps.

    Returns:
        The chunks in document order, each stripped of surrounding
        whitespace and never empty.

    Raises:
        ValueError: If `max_chars < 1` or `overlap < 0`.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    if overlap < 0:
        raise ValueError(f"overlap must be >= 0, got {overlap}")

    text = text.strip()
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    chunks: list[str] = []
    n = len(text)
    start = 0
    prev_end = 0  # end of the previous chunk; text before it is overlap only
    while start < n:
        end = min(start + max_chars, n)
        if end < n:
            # Look for a clean boundary only in the part of the window that
            # is new. Searching the overlap as well would re-find the
            # separator the previous chunk was cut at and emit shrinking
            # duplicates of that chunk's tail.
            floor = max(start, prev_end)
            for sep in _SEPARATORS:
                cut = text.rfind(sep, floor, end)
                if cut > floor:
                    end = cut + len(sep)
                    break
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= n:
            break
        prev_end = end
        # Step back by `overlap`, unless that would not move past the
        # current start; in that case continue right after the cut instead
        # of creeping forward one character at a time.
        next_start = end - overlap
        start = next_start if next_start > start else end
    return chunks