File size: 1,232 Bytes
75fd700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""Paragraph-aware recursive character splitter for RAG ingestion.

Public entry: `chunk_text(text, max_chars, overlap)`. Splits on the first
of [paragraph break, sentence end, newline, space] that fits inside the
window. Empty / whitespace-only inputs return [].
"""
from __future__ import annotations


# Split-point candidates in priority order: paragraph break, sentence end,
# line break, then a plain space.
_SEPARATORS: tuple[str, ...] = ("\n\n", ". ", "\n", " ")


def _snap_to_boundary(text: str, start: int, covered: int, limit: int) -> int:
    """Return the index at which to cut the window ``text[start:limit]``.

    Separators are tried in `_SEPARATORS` order; for the first one that
    qualifies, the cut is placed just after its last occurrence in the window.
    A separator qualifies only if it lies after `start` and cutting there adds
    non-whitespace text beyond `covered` (the end of the previous chunk).
    Returns `limit` (a hard cut) when no separator qualifies.
    """
    for sep in _SEPARATORS:
        last = text.rfind(sep, start, limit)
        if last <= start:  # not found (-1) or would produce an empty chunk
            continue
        cut = last + len(sep)
        # Without this check the separator that ended the previous chunk is
        # found again inside the overlap, and the "new" chunk would only
        # repeat text that has already been emitted.
        if text[covered:cut].strip():
            return cut
    return limit


def chunk_text(text: str, max_chars: int = 600, overlap: int = 80) -> list[str]:
    """Split `text` into chunks of at most `max_chars`, with `overlap` carry-over.

    The text is stripped first. Each chunk is cut at the highest-priority
    separator (paragraph break, sentence end, newline, space) found inside the
    current window, or at the window edge when none is usable. The next chunk
    starts `overlap` characters before the previous cut, but always at least
    one character after the previous start, and always extends past the
    previous cut.

    Args:
        text: Input text; leading/trailing whitespace is ignored.
        max_chars: Maximum length of a chunk, must be >= 1.
        overlap: Number of trailing characters of a chunk repeated at the
            start of the next one, must be >= 0. Should be well below
            `max_chars`; values at or above the length of the chunk just
            emitted degrade to a one-character stride.

    Returns:
        The chunks in document order, each stripped of surrounding
        whitespace. Empty or whitespace-only input yields ``[]``; input that
        already fits in `max_chars` yields a single chunk.

    Raises:
        ValueError: If `max_chars` < 1 or `overlap` < 0.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    if overlap < 0:
        raise ValueError(f"overlap must be >= 0, got {overlap}")

    text = text.strip()
    if not text:
        return []
    if len(text) <= max_chars:
        return [text]

    chunks: list[str] = []
    n = len(text)
    start = 0
    covered = 0  # end index of the previously emitted chunk
    while start < n:
        end = min(start + max_chars, n)
        if end < n:
            # Prefer a clean boundary over a hard cut at the window edge.
            end = _snap_to_boundary(text, start, covered, end)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= n:
            break
        covered = end
        # Step back by `overlap`, but never to or before the current start.
        start = max(start + 1, end - overlap)
    return chunks