File size: 1,232 Bytes
"""Paragraph-aware recursive character splitter for RAG ingestion.

Public entry: `chunk_text(text, max_chars, overlap)`. Each chunk is cut at
the last occurrence, inside the current window, of the highest-priority
separator present: paragraph break (blank line), sentence end (period
followed by a space), newline, then space. If no separator is found, the
window is cut at `max_chars`. Text that already fits in `max_chars` is
returned as a single chunk. Empty / whitespace-only inputs return [].
"""
from __future__ import annotations
# Boundary candidates in priority order: paragraph break, sentence end,
# line break, word break. The first kind found in the window wins.
_SEPARATORS: tuple[str, ...] = ("\n\n", ". ", "\n", " ")


def chunk_text(text: str, max_chars: int = 600, overlap: int = 80) -> list[str]:
    """Split `text` into chunks of at most `max_chars`, with `overlap` carry-over.

    The text is stripped, then walked with a sliding window of `max_chars`
    characters. Inside each window the chunk is cut just after the last
    occurrence of the highest-priority separator in `_SEPARATORS`; when no
    separator qualifies, the window is cut at `max_chars`. The next window
    starts `overlap` characters before the previous cut.

    Only separators located past the carried-over overlap are considered.
    Otherwise the separator that ended the previous chunk would be found
    again and the splitter would emit a run of shrinking duplicates of the
    overlap region.

    Args:
        text: Input text. Leading/trailing whitespace is ignored.
        max_chars: Maximum length of each chunk; must be >= 1.
        overlap: Number of characters repeated from the end of one chunk at
            the start of the next; must be >= 0. If it is >= `max_chars`
            the window still advances, but only one character at a time.

    Returns:
        The stripped, non-empty chunks in document order. Empty or
        whitespace-only input yields []; input that fits in `max_chars`
        yields a single chunk.

    Raises:
        ValueError: If `max_chars` < 1 or `overlap` < 0.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    if overlap < 0:
        raise ValueError(f"overlap must be >= 0, got {overlap}")

    text = text.strip()
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    chunks: list[str] = []
    n = len(text)
    start = 0
    while start < n:
        end = min(start + max_chars, n)
        if end < n:
            # Try to land on a clean boundary, but only one that lies beyond
            # the overlap carried in from the previous chunk. A cut at or
            # after `floor` guarantees `end - overlap > start`, so the next
            # window makes real progress instead of re-finding the same
            # separator.
            floor = start + max(overlap, 1)
            for sep in _SEPARATORS:
                last = text.rfind(sep, floor, end)
                if last != -1:
                    end = last + len(sep)
                    break
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= n:
            break
        # `start + 1` is the safety net for hard cuts with overlap >= max_chars.
        start = max(start + 1, end - overlap)
    return chunks
|