| """Tests for src.rag.chunker — paragraph-aware character splitter.""" |
| from __future__ import annotations |
|
|
| import pytest |
|
|
| from src.rag.chunker import chunk_text |
|
|
|
|
class TestChunkText:
    """Behavioural tests for ``chunk_text(text, max_chars=..., overlap=...)``.

    Each test calls the splitter with explicit ``max_chars`` / ``overlap``
    keyword arguments and checks the returned list of string chunks.
    """

    def test_short_text_returns_single_chunk(self) -> None:
        """Input shorter than ``max_chars`` comes back as one unchanged chunk."""
        out = chunk_text("hello world", max_chars=100, overlap=10)
        assert out == ["hello world"]

    def test_empty_text_returns_empty_list(self) -> None:
        """Empty and whitespace-only inputs both yield an empty list."""
        assert chunk_text("", max_chars=100, overlap=10) == []
        assert chunk_text(" \n\n ", max_chars=100, overlap=10) == []

    def test_long_text_splits_into_multiple_chunks(self) -> None:
        """A 250-character run yields at least 3 chunks, none above the limit."""
        max_chars = 100
        out = chunk_text("a" * 250, max_chars=max_chars, overlap=10)
        assert len(out) >= 3

        for chunk in out:
            assert len(chunk) <= max_chars

    def test_overlap_between_chunks(self) -> None:
        """Adjacent chunks share text across their common boundary."""
        text = "abcdefghij" * 30
        out = chunk_text(text, max_chars=100, overlap=20)

        # 300 characters cannot fit into a single 100-character chunk, so at
        # least two chunks are expected.  Without this guard the pairwise loop
        # below would run zero times and the test would pass vacuously.
        assert len(out) >= 2

        # NOTE(review): the input repeats every 10 characters, so if chunks are
        # contiguous slices of the text this 10-character containment check is
        # satisfied by almost any split.  Consider a non-periodic input once
        # the exact overlap contract of chunk_text is confirmed.
        for previous, following in zip(out, out[1:]):
            assert previous[-10:] in following or following[:10] in previous

    def test_paragraph_boundary_preferred(self) -> None:
        """The short first paragraph appears whole inside the first chunk."""
        para_a = "First paragraph content."
        para_b = "Second paragraph content " * 10
        text = f"{para_a}\n\n{para_b}"
        out = chunk_text(text, max_chars=100, overlap=10)

        # Fail with a readable assertion instead of an IndexError on out[0]
        # if the splitter unexpectedly returns nothing.
        assert out
        assert para_a in out[0]
|
|