# RAG-document-assistant/ingestion/chunker.py
"""
Text chunking utility for RAG ingestion.
Inputs: list of docs from load_docs.py or docling_loader.py
Output: list of chunks with metadata

Supports:
- Simple character-based chunking (legacy)
- Structure-aware chunking using Docling elements
"""

from typing import List, Dict, Any

def chunk_text(
    text: str,
    max_tokens: int = 300,
    overlap: int = 50
) -> List[str]:
    """
    Simple character-based chunking.
    Assumes ~1 token ≈ 4 chars (rough approximation).
    
    Args:
        text: Text to chunk
        max_tokens: Maximum tokens per chunk
        overlap: Number of tokens to overlap between chunks
        
    Returns:
        List of text chunks
        
    Raises:
        ValueError: If max_tokens is not positive, overlap is negative,
            or overlap >= max_tokens
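
    Example (a minimal sanity check; exact chunk boundaries depend on the
    4-chars-per-token heuristic):
        >>> chunks = chunk_text("word " * 100, max_tokens=50, overlap=10)
        >>> all(len(c) <= 50 * 4 for c in chunks)
        True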
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")
    if overlap >= max_tokens:
        raise ValueError(f"overlap ({overlap}) must be less than max_tokens ({max_tokens})")
        
    approx_chars = max_tokens * 4
    approx_overlap = overlap * 4

    chunks = []
    start = 0
    text_len = len(text)

    while start < text_len:
        end = start + approx_chars
        chunk = text[start:end]

        if chunk.strip():
            chunks.append(chunk.strip())

        # advance the window; the stride is always positive because
        # validation guarantees overlap < max_tokens
        start += approx_chars - approx_overlap

    return chunks


def chunk_documents(docs: List[Dict], max_tokens: int = 300, overlap: int = 50) -> List[Dict]:
    """
    Chunk a list of documents into smaller pieces for embedding.
    
    Args:
        docs: List of document dictionaries with 'filename' and 'text' keys
        max_tokens: Maximum tokens per chunk
        overlap: Number of tokens to overlap between chunks
        
    Returns:
        List of chunk dictionaries with filename, chunk_id, text, and chars keys
        
    Raises:
        TypeError: If docs is not a list or contains non-dict elements
        KeyError: If required keys are missing from document dictionaries
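
    Example (uses the doc-dict shape this function expects):
        >>> docs = [{"status": "OK", "filename": "a.md", "text": "hi " * 200}]
        >>> chunk_documents(docs, max_tokens=50, overlap=10)[0]["filename"]
        'a.md'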
    """
    if not isinstance(docs, list):
        raise TypeError("docs must be a list")
        
    all_chunks = []
    for d in docs:
        if not isinstance(d, dict):
            raise TypeError("Each document must be a dictionary")
            
        if d.get("status") != "OK":
            continue

        filename = d["filename"]
        text = d["text"]
        raw_chunks = chunk_text(text, max_tokens=max_tokens, overlap=overlap)

        for i, ch in enumerate(raw_chunks):
            all_chunks.append({
                "filename": filename,
                "chunk_id": i,
                "text": ch,
                "chars": len(ch)
            })
    return all_chunks


def chunk_by_structure(
    elements: List[Any],
    max_tokens: int = 300,
    overlap: int = 50,
    keep_tables_intact: bool = True,
    include_heading_context: bool = True
) -> List[Dict]:
    """
    Structure-aware chunking using Docling document elements.

    Groups content by semantic boundaries (headings, tables) rather than
    arbitrary character counts. Falls back to character-based splitting
    for oversized elements.

    Args:
        elements: List of DocumentElement objects from docling_loader
        max_tokens: Maximum tokens per chunk (approx 4 chars/token)
        overlap: Token overlap for split elements
        keep_tables_intact: Keep tables as single chunks even if large
        include_heading_context: Prepend parent heading to chunks

    Returns:
        List of chunk dicts with element_type and section metadata
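
    Example (a hedged sketch; a namedtuple stands in for the
    docling_loader DocumentElement, exposing the two attributes this
    function reads, element_type and text):
        >>> from collections import namedtuple
        >>> El = namedtuple("El", ["element_type", "text"])
        >>> out = chunk_by_structure([El("heading", "Intro"), El("paragraph", "Body.")])
        >>> out[0]["section_heading"]
        'Intro'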
    """
    if not elements:
        return []

    max_chars = max_tokens * 4
    chunks = []
    current_heading = ""
    current_section = []
    current_chars = 0

    def flush_section():
        """Flush accumulated section content as a chunk."""
        nonlocal current_section, current_chars
        if not current_section:
            return

        combined_text = "\n\n".join(el.text for el in current_section)
        if combined_text.strip():
            # Prepend heading context if available
            if include_heading_context and current_heading:
                combined_text = f"## {current_heading}\n\n{combined_text}"

            # strip once so the stored text and the chars count agree
            combined_text = combined_text.strip()
            chunks.append({
                "text": combined_text,
                "chars": len(combined_text),
                "element_type": "section",
                "section_heading": current_heading,
                "element_count": len(current_section)
            })

        current_section = []
        current_chars = 0

    for element in elements:
        el_type = getattr(element, "element_type", "paragraph")
        el_text = getattr(element, "text", str(element))
        el_chars = len(el_text)

        # Handle headings - start new section
        if el_type == "heading":
            flush_section()
            current_heading = el_text
            continue

        # Handle tables - keep intact if configured
        if el_type == "table" and keep_tables_intact:
            flush_section()
            table_text = el_text
            if include_heading_context and current_heading:
                table_text = f"## {current_heading}\n\n{el_text}"

            # strip once so the stored text and the chars count agree
            table_text = table_text.strip()
            chunks.append({
                "text": table_text,
                "chars": len(table_text),
                "element_type": "table",
                "section_heading": current_heading,
                "element_count": 1
            })
            continue

        # Check if adding this element exceeds limit
        if current_chars + el_chars > max_chars and current_section:
            flush_section()

        # Handle oversized single elements
        if el_chars > max_chars:
            flush_section()
            # Split large element using character-based chunking
            sub_chunks = chunk_text(el_text, max_tokens=max_tokens, overlap=overlap)
            for i, sub_text in enumerate(sub_chunks):
                prefix = ""
                if include_heading_context and current_heading:
                    prefix = f"## {current_heading}\n\n"
                full_text = f"{prefix}{sub_text}".strip()
                chunks.append({
                    "text": full_text,
                    "chars": len(full_text),
                    "element_type": f"{el_type}_split",
                    "section_heading": current_heading,
                    "split_index": i,
                    "element_count": 1
                })
            continue

        # Accumulate element in current section
        current_section.append(element)
        current_chars += el_chars

    # Flush remaining content
    flush_section()

    return chunks


def chunk_documents_with_structure(
    docs: List[Dict],
    max_tokens: int = 300,
    overlap: int = 50,
    keep_tables_intact: bool = True,
    use_structure: bool = True
) -> List[Dict]:
    """
    Chunk documents using structure-aware or legacy chunking.

    Args:
        docs: List of document dicts (from docling_loader or load_docs)
        max_tokens: Maximum tokens per chunk
        overlap: Token overlap between chunks
        keep_tables_intact: Keep tables as single chunks
        use_structure: Use structure-aware chunking if elements available

    Returns:
        List of chunk dicts with metadata
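
    Example (legacy fallback path, taken when no structured elements exist):
        >>> docs = [{"status": "OK", "filename": "b.md", "text": "hi " * 200}]
        >>> chunk_documents_with_structure(docs)[0]["element_type"]
        'text'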
    """
    if not isinstance(docs, list):
        raise TypeError("docs must be a list")

    all_chunks = []

    for d in docs:
        if not isinstance(d, dict):
            raise TypeError("Each document must be a dictionary")

        status = d.get("status", "")
        if status != "OK":
            continue

        filename = d.get("filename", "unknown")
        elements = d.get("elements", [])

        # Use structure-aware chunking if elements available
        if use_structure and elements:
            raw_chunks = chunk_by_structure(
                elements,
                max_tokens=max_tokens,
                overlap=overlap,
                keep_tables_intact=keep_tables_intact
            )
            for i, ch in enumerate(raw_chunks):
                all_chunks.append({
                    "filename": filename,
                    "chunk_id": i,
                    "text": ch["text"],
                    "chars": ch["chars"],
                    "element_type": ch.get("element_type", "section"),
                    "section_heading": ch.get("section_heading", ""),
                    "format": d.get("format", ""),
                    "page_count": d.get("page_count", 0)
                })
        else:
            # Fallback to legacy text-based chunking
            text = d.get("text", "")
            if not text:
                continue

            raw_chunks = chunk_text(text, max_tokens=max_tokens, overlap=overlap)
            for i, ch in enumerate(raw_chunks):
                all_chunks.append({
                    "filename": filename,
                    "chunk_id": i,
                    "text": ch,
                    "chars": len(ch),
                    "element_type": "text",
                    "section_heading": "",
                    "format": d.get("format", ".md"),
                    "page_count": 0
                })

    return all_chunks


if __name__ == "__main__":
    # Minimal test
    sample = "This is a test text " * 200
    chunks = chunk_text(sample, max_tokens=50, overlap=10)
    print(f"Generated {len(chunks)} chunks")
    print(chunks[0])
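
    # Hedged demo of the structure-aware path: DocumentElement comes from
    # docling_loader, so a namedtuple with the attributes this module
    # actually reads (element_type, text) stands in for it here.
    from collections import namedtuple
    El = namedtuple("El", ["element_type", "text"])
    doc = {
        "status": "OK",
        "filename": "sample.md",
        "format": ".md",
        "page_count": 1,
        "elements": [
            El("heading", "Overview"),
            El("paragraph", "A short paragraph under the heading."),
            El("table", "| col_a | col_b |\n| 1 | 2 |"),
        ],
    }
    struct_chunks = chunk_documents_with_structure([doc], max_tokens=50, overlap=10)
    for c in struct_chunks:
        print(c["chunk_id"], c["element_type"], repr(c["section_heading"]))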