# RAG-document-assistant/ingestion/chunker.py
"""
Text chunking utility for RAG ingestion.

Inputs: list of docs from load_docs.py or docling_loader.py
Output: list of chunks with metadata

Supports:
- Simple character-based chunking (legacy)
- Structure-aware chunking using Docling elements
"""

from typing import List, Dict, Optional, Any


def chunk_text(
    text: str,
    max_tokens: int = 300,
    overlap: int = 50
) -> List[str]:
    """
    Split text into overlapping fixed-size character windows.

    Token counts are converted to characters with the rough approximation
    1 token ≈ 4 chars. Each window is stripped of surrounding whitespace and
    windows that are empty after stripping are dropped.

    Args:
        text: Text to chunk
        max_tokens: Maximum tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of text chunks

    Raises:
        ValueError: If max_tokens is not positive, overlap is negative,
            or overlap is not smaller than max_tokens
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")
    if overlap >= max_tokens:
        raise ValueError(f"overlap ({overlap}) must be less than max_tokens ({max_tokens})")

    approx_chars = max_tokens * 4
    approx_overlap = overlap * 4
    text_len = len(text)

    chunks = []
    start = 0
    while start < text_len:
        end = start + approx_chars
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        # Once a window reaches the end of the text we are done. Stepping
        # back by the overlap here would only emit a tail that is already
        # fully contained in the chunk just produced.
        if end >= text_len:
            break
        # The stride is always positive because overlap < max_tokens.
        start = end - approx_overlap

    return chunks


def chunk_documents(docs: List[Dict], max_tokens: int = 300, overlap: int = 50) -> List[Dict]:
    """
    Chunk a list of documents into smaller pieces for embedding.

    Documents whose 'status' is not "OK" are skipped.

    Args:
        docs: List of document dictionaries with 'filename' and 'text' keys
        max_tokens: Maximum tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of chunk dictionaries with filename, chunk_id, text, and chars keys.
        chunk_id restarts at 0 for every document.

    Raises:
        TypeError: If docs is not a list or contains non-dict elements
        KeyError: If required keys are missing from document dictionaries
        ValueError: If max_tokens/overlap are rejected by chunk_text
    """
    if not isinstance(docs, list):
        raise TypeError("docs must be a list")

    all_chunks = []
    for d in docs:
        if not isinstance(d, dict):
            raise TypeError("Each document must be a dictionary")

        if d.get("status") != "OK":
            continue

        filename = d["filename"]
        text = d["text"]

        raw_chunks = chunk_text(text, max_tokens=max_tokens, overlap=overlap)
        for i, ch in enumerate(raw_chunks):
            all_chunks.append({
                "filename": filename,
                "chunk_id": i,
                "text": ch,
                "chars": len(ch)
            })

    return all_chunks


def chunk_by_structure(
    elements: List[Any],
    max_tokens: int = 300,
    overlap: int = 50,
    keep_tables_intact: bool = True,
    include_heading_context: bool = True
) -> List[Dict]:
    """
    Structure-aware chunking using Docling document elements.

    Groups content by semantic boundaries (headings, tables) rather than
    arbitrary character counts. Falls back to character-based splitting
    for oversized elements.

    Each element is read through its ``element_type`` attribute (defaulting
    to "paragraph") and its ``text`` attribute (defaulting to ``str(element)``),
    so objects lacking those attributes are still accepted.

    Args:
        elements: List of DocumentElement objects from docling_loader
        max_tokens: Maximum tokens per chunk (approx 4 chars/token)
        overlap: Token overlap for split elements
        keep_tables_intact: Keep tables as single chunks even if large
        include_heading_context: Prepend parent heading to chunks

    Returns:
        List of chunk dicts with text, chars (always ``len(text)``),
        element_type, section_heading and element_count keys; chunks produced
        by splitting an oversized element also carry split_index.

    Raises:
        ValueError: If an oversized element has to be split and
            max_tokens/overlap are rejected by chunk_text
    """
    if not elements:
        return []

    max_chars = max_tokens * 4
    chunks: List[Dict] = []
    current_heading = ""
    # Buffer the extracted text rather than the element objects so flushing
    # never has to touch ``element.text`` again (the attribute may not exist).
    section_texts: List[str] = []
    section_chars = 0

    def with_heading(body: str) -> str:
        """Prefix body with the current heading when heading context is enabled."""
        if include_heading_context and current_heading:
            return f"## {current_heading}\n\n{body}"
        return body

    def flush_section() -> None:
        """Flush accumulated section content as a chunk."""
        nonlocal section_texts, section_chars
        if not section_texts:
            return

        combined = "\n\n".join(section_texts)
        # Whitespace-only sections are discarded rather than emitted.
        if combined.strip():
            chunk_body = with_heading(combined).strip()
            chunks.append({
                "text": chunk_body,
                "chars": len(chunk_body),
                "element_type": "section",
                "section_heading": current_heading,
                "element_count": len(section_texts)
            })

        section_texts = []
        section_chars = 0

    for element in elements:
        el_type = getattr(element, "element_type", "paragraph")
        el_text = getattr(element, "text", str(element))
        el_chars = len(el_text)

        # Handle headings - start new section
        if el_type == "heading":
            flush_section()
            current_heading = el_text
            continue

        # Handle tables - keep intact if configured
        if el_type == "table" and keep_tables_intact:
            flush_section()
            table_body = with_heading(el_text).strip()
            chunks.append({
                "text": table_body,
                "chars": len(table_body),
                "element_type": "table",
                "section_heading": current_heading,
                "element_count": 1
            })
            continue

        # Handle oversized single elements: close the running section, then
        # split the element using character-based chunking.
        if el_chars > max_chars:
            flush_section()
            sub_chunks = chunk_text(el_text, max_tokens=max_tokens, overlap=overlap)
            for i, sub_text in enumerate(sub_chunks):
                split_body = with_heading(sub_text).strip()
                chunks.append({
                    "text": split_body,
                    "chars": len(split_body),
                    "element_type": f"{el_type}_split",
                    "section_heading": current_heading,
                    "split_index": i,
                    "element_count": 1
                })
            continue

        # Close the running section if this element would push it over the
        # limit. Only raw element lengths are counted (not the "\n\n"
        # separators or the heading prefix), so the limit is approximate.
        if section_texts and section_chars + el_chars > max_chars:
            flush_section()

        # Accumulate element in current section
        section_texts.append(el_text)
        section_chars += el_chars

    # Flush remaining content
    flush_section()

    return chunks


def chunk_documents_with_structure(
    docs: List[Dict],
    max_tokens: int = 300,
    overlap: int = 50,
    keep_tables_intact: bool = True,
    use_structure: bool = True
) -> List[Dict]:
    """
    Chunk documents using structure-aware or legacy chunking.

    Documents whose 'status' is not "OK" are skipped. A document is chunked
    by structure when use_structure is true and it has a non-empty 'elements'
    entry; otherwise its 'text' entry is chunked by character windows
    (documents with empty or missing text produce no chunks).

    Args:
        docs: List of document dicts (from docling_loader or load_docs)
        max_tokens: Maximum tokens per chunk
        overlap: Token overlap between chunks
        keep_tables_intact: Keep tables as single chunks
        use_structure: Use structure-aware chunking if elements available

    Returns:
        List of chunk dicts with filename, chunk_id, text, chars,
        element_type, section_heading, format and page_count keys.
        chunk_id restarts at 0 for every document.

    Raises:
        TypeError: If docs is not a list or contains non-dict elements
        ValueError: If max_tokens/overlap are rejected by chunk_text
    """
    if not isinstance(docs, list):
        raise TypeError("docs must be a list")

    all_chunks = []

    for d in docs:
        if not isinstance(d, dict):
            raise TypeError("Each document must be a dictionary")

        status = d.get("status", "")
        if status != "OK":
            continue

        filename = d.get("filename", "unknown")
        elements = d.get("elements", [])

        # Use structure-aware chunking if elements available
        if use_structure and elements:
            raw_chunks = chunk_by_structure(
                elements,
                max_tokens=max_tokens,
                overlap=overlap,
                keep_tables_intact=keep_tables_intact
            )

            for i, ch in enumerate(raw_chunks):
                all_chunks.append({
                    "filename": filename,
                    "chunk_id": i,
                    "text": ch["text"],
                    "chars": ch["chars"],
                    "element_type": ch.get("element_type", "section"),
                    "section_heading": ch.get("section_heading", ""),
                    "format": d.get("format", ""),
                    "page_count": d.get("page_count", 0)
                })
        else:
            # Fallback to legacy text-based chunking
            text = d.get("text", "")
            if not text:
                continue

            raw_chunks = chunk_text(text, max_tokens=max_tokens, overlap=overlap)
            for i, ch in enumerate(raw_chunks):
                all_chunks.append({
                    "filename": filename,
                    "chunk_id": i,
                    "text": ch,
                    "chars": len(ch),
                    "element_type": "text",
                    "section_heading": "",
                    "format": d.get("format", ".md"),
                    "page_count": 0
                })

    return all_chunks


if __name__ == "__main__":
    # Minimal test
    sample = "This is a test text " * 200
    chunks = chunk_text(sample, max_tokens=50, overlap=10)
    print(f"Generated {len(chunks)} chunks")
    print(chunks[0])