File size: 1,585 Bytes
75fd700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
"""Tests for src.rag.chunker — paragraph-aware character splitter."""
from __future__ import annotations

import pytest

from src.rag.chunker import chunk_text


class TestChunkText:
    """Behavioral tests for ``chunk_text(text, max_chars=..., overlap=...)``."""

    def test_short_text_returns_single_chunk(self) -> None:
        """Text shorter than ``max_chars`` comes back as one unchanged chunk."""
        out = chunk_text("hello world", max_chars=100, overlap=10)
        assert out == ["hello world"]

    def test_empty_text_returns_empty_list(self) -> None:
        """Empty and whitespace-only input both yield no chunks."""
        assert chunk_text("", max_chars=100, overlap=10) == []
        assert chunk_text("   \n\n  ", max_chars=100, overlap=10) == []

    def test_long_text_splits_into_multiple_chunks(self) -> None:
        """Text longer than ``max_chars`` is split and no chunk exceeds the cap."""
        text = "a" * 250
        out = chunk_text(text, max_chars=100, overlap=10)
        assert len(out) >= 3
        # every chunk respects max_chars
        for chunk in out:
            assert len(chunk) <= 100, f"chunk of {len(chunk)} chars exceeds 100"

    def test_overlap_between_chunks(self) -> None:
        """Neighbouring chunks share a run of at least 10 characters."""
        # 300 chars, no natural break. Zero-padded counters ("000001002...")
        # make every 10-char window unique, so the substring checks below can
        # only succeed when the chunks really overlap. A periodic input such
        # as "abcdefghij" * 30 would satisfy them even with zero overlap.
        text = "".join(f"{i:03d}" for i in range(100))
        out = chunk_text(text, max_chars=100, overlap=20)
        # Guard against a vacuous pass: with fewer than two chunks the loop
        # below would never execute.
        assert len(out) >= 2, f"expected several chunks, got {len(out)}"
        # consecutive chunks share at least some characters
        for prev, nxt in zip(out, out[1:]):
            assert prev[-10:] in nxt or nxt[:10] in prev, (
                f"no shared text between {prev[-10:]!r} and {nxt[:10]!r}"
            )

    def test_paragraph_boundary_preferred(self) -> None:
        """The first paragraph survives intact inside the first chunk."""
        # First paragraph fits, second doesn't — split at \n\n
        para_a = "First paragraph content."
        para_b = "Second paragraph content " * 10
        text = f"{para_a}\n\n{para_b}"
        out = chunk_text(text, max_chars=100, overlap=10)
        # The text is far longer than max_chars, so it must be split; this
        # also makes the out[0] access below fail with a clear message.
        assert len(out) >= 2, f"expected several chunks, got {len(out)}"
        # first chunk should end at the paragraph boundary, not mid-word
        # NOTE(review): this only verifies that para_a appears whole in the
        # first chunk; it does not prove the chunk *ends* at the boundary.
        # Consider tightening once chunk_text's packing behaviour is confirmed.
        assert para_a in out[0]