# Uploaded by vn6295337
# Commit 7e07738: Add Docling integration for multi-format document processing
# RAG-document-assistant/ingestion/chunker.py
"""
Text chunking utility for RAG ingestion.
Inputs: list of docs from load_docs.py or docling_loader.py
Output: list of chunks with metadata
Supports:
- Simple character-based chunking (legacy)
- Structure-aware chunking using Docling elements
"""
from typing import List, Dict, Optional, Any
def chunk_text(
    text: str,
    max_tokens: int = 300,
    overlap: int = 50
) -> List[str]:
    """
    Split text into overlapping, fixed-size character windows.

    Token counts are converted to character counts using the rough
    approximation of 4 characters per token. Each window is stripped of
    surrounding whitespace, and windows that are blank after stripping
    are dropped.

    Args:
        text: Text to chunk
        max_tokens: Maximum tokens per chunk
        overlap: Number of tokens to overlap between chunks (may be 0)

    Returns:
        List of text chunks; empty for empty or whitespace-only text

    Raises:
        ValueError: If max_tokens is not positive, overlap is negative,
            or overlap is not smaller than max_tokens
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")
    if overlap >= max_tokens:
        raise ValueError(f"overlap ({overlap}) must be less than max_tokens ({max_tokens})")

    approx_chars = max_tokens * 4
    # overlap < max_tokens was validated above, so the step is always
    # positive and the window can never stall or move backwards.
    step = approx_chars - overlap * 4
    text_len = len(text)

    chunks = []
    for start in range(0, text_len, step):
        end = start + approx_chars
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= text_len:
            # This window already reached the end of the text; a further
            # window would contain nothing but the overlap tail again.
            break
    return chunks
def chunk_documents(docs: List[Dict], max_tokens: int = 300, overlap: int = 50):
    """
    Split every successfully loaded document into chunks for embedding.

    Documents whose "status" value is anything other than "OK" are
    skipped. Chunk ids restart at 0 for each document.

    Args:
        docs: List of document dictionaries with 'filename' and 'text' keys
        max_tokens: Maximum tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of chunk dictionaries with filename, chunk_id, text, and chars keys

    Raises:
        TypeError: If docs is not a list or contains non-dict elements
        KeyError: If an "OK" document lacks the 'filename' or 'text' key
    """
    if not isinstance(docs, list):
        raise TypeError("docs must be a list")

    collected = []
    for doc in docs:
        if not isinstance(doc, dict):
            raise TypeError("Each document must be a dictionary")
        if doc.get("status") == "OK":
            # Read 'filename' before 'text' so a missing key surfaces in
            # the same order as before.
            source_name = doc["filename"]
            pieces = chunk_text(doc["text"], max_tokens=max_tokens, overlap=overlap)
            collected.extend(
                {
                    "filename": source_name,
                    "chunk_id": index,
                    "text": piece,
                    "chars": len(piece)
                }
                for index, piece in enumerate(pieces)
            )
    return collected
def chunk_by_structure(
    elements: List[Any],
    max_tokens: int = 300,
    overlap: int = 50,
    keep_tables_intact: bool = True,
    include_heading_context: bool = True
) -> List[Dict]:
    """
    Structure-aware chunking using Docling document elements.

    Groups content by semantic boundaries (headings, tables) rather than
    arbitrary character counts. Falls back to character-based splitting
    for oversized elements.

    Each element is read via its ``element_type`` attribute (default
    "paragraph") and its ``text`` attribute (default ``str(element)``),
    so objects without those attributes are still accepted.

    Args:
        elements: List of DocumentElement objects from docling_loader
        max_tokens: Maximum tokens per chunk (approx 4 chars/token)
        overlap: Token overlap for split elements
        keep_tables_intact: Keep tables as single chunks even if large
        include_heading_context: Prepend parent heading to chunks

    Returns:
        List of chunk dicts with text, chars, element_type, section_heading
        and element_count keys (plus split_index for split elements).
        "chars" is always the length of the stored "text".
    """
    if not elements:
        return []

    max_chars = max_tokens * 4
    chunks = []
    current_heading = ""
    # Holds the already-extracted text of each pending element (not the
    # element objects), so flushing never depends on a ``.text`` attribute.
    current_section = []
    current_chars = 0

    def with_heading(body):
        """Prefix body with the current heading when heading context is on."""
        if include_heading_context and current_heading:
            return f"## {current_heading}\n\n{body}"
        return body

    def flush_section():
        """Flush accumulated section content as a chunk."""
        nonlocal current_section, current_chars
        if not current_section:
            return
        combined_text = "\n\n".join(current_section)
        # Blank sections are dropped before the heading is added, so a
        # heading on its own never becomes a chunk.
        if combined_text.strip():
            final_text = with_heading(combined_text).strip()
            chunks.append({
                "text": final_text,
                "chars": len(final_text),
                "element_type": "section",
                "section_heading": current_heading,
                "element_count": len(current_section)
            })
        current_section = []
        current_chars = 0

    for element in elements:
        el_type = getattr(element, "element_type", "paragraph")
        el_text = getattr(element, "text", str(element))
        el_chars = len(el_text)

        # Handle headings - start new section
        if el_type == "heading":
            flush_section()
            current_heading = el_text
            continue

        # Handle tables - keep intact if configured
        if el_type == "table" and keep_tables_intact:
            flush_section()
            # Skip whitespace-only tables, mirroring how blank sections
            # are skipped; they would otherwise yield empty chunks.
            if el_text.strip():
                table_text = with_heading(el_text).strip()
                chunks.append({
                    "text": table_text,
                    "chars": len(table_text),
                    "element_type": "table",
                    "section_heading": current_heading,
                    "element_count": 1
                })
            continue

        # Handle oversized single elements
        if el_chars > max_chars:
            flush_section()
            # Split large element using character-based chunking
            sub_chunks = chunk_text(el_text, max_tokens=max_tokens, overlap=overlap)
            for i, sub_text in enumerate(sub_chunks):
                split_text = with_heading(sub_text).strip()
                chunks.append({
                    "text": split_text,
                    "chars": len(split_text),
                    "element_type": f"{el_type}_split",
                    "section_heading": current_heading,
                    "split_index": i,
                    "element_count": 1
                })
            continue

        # Start a new chunk if adding this element would exceed the limit
        if current_section and current_chars + el_chars > max_chars:
            flush_section()

        # Accumulate element text in current section
        current_section.append(el_text)
        current_chars += el_chars

    # Flush remaining content
    flush_section()
    return chunks
def chunk_documents_with_structure(
    docs: List[Dict],
    max_tokens: int = 300,
    overlap: int = 50,
    keep_tables_intact: bool = True,
    use_structure: bool = True
) -> List[Dict]:
    """
    Chunk documents using structure-aware or legacy chunking.

    Documents whose "status" is not "OK" are skipped. A document is
    chunked by structure when use_structure is true and it carries a
    non-empty "elements" list; otherwise its "text" is chunked by
    character windows (documents with no text are skipped).

    Args:
        docs: List of document dicts (from docling_loader or load_docs)
        max_tokens: Maximum tokens per chunk
        overlap: Token overlap between chunks
        keep_tables_intact: Keep tables as single chunks
        use_structure: Use structure-aware chunking if elements available

    Returns:
        List of chunk dicts with filename, chunk_id, text, chars,
        element_type, section_heading, format and page_count keys

    Raises:
        TypeError: If docs is not a list or contains non-dict elements
    """
    if not isinstance(docs, list):
        raise TypeError("docs must be a list")

    all_chunks = []
    for d in docs:
        if not isinstance(d, dict):
            raise TypeError("Each document must be a dictionary")
        if d.get("status", "") != "OK":
            continue

        filename = d.get("filename", "unknown")
        elements = d.get("elements", [])
        # Carried into both branches so the document's real page count is
        # kept even when the legacy text path is taken.
        page_count = d.get("page_count", 0)

        # Use structure-aware chunking if elements available
        if use_structure and elements:
            raw_chunks = chunk_by_structure(
                elements,
                max_tokens=max_tokens,
                overlap=overlap,
                keep_tables_intact=keep_tables_intact
            )
            for i, ch in enumerate(raw_chunks):
                all_chunks.append({
                    "filename": filename,
                    "chunk_id": i,
                    "text": ch["text"],
                    "chars": ch["chars"],
                    "element_type": ch.get("element_type", "section"),
                    "section_heading": ch.get("section_heading", ""),
                    "format": d.get("format", ""),
                    "page_count": page_count
                })
        else:
            # Fallback to legacy text-based chunking
            text = d.get("text", "")
            if not text:
                continue
            raw_chunks = chunk_text(text, max_tokens=max_tokens, overlap=overlap)
            for i, ch in enumerate(raw_chunks):
                all_chunks.append({
                    "filename": filename,
                    "chunk_id": i,
                    "text": ch,
                    "chars": len(ch),
                    "element_type": "text",
                    "section_heading": "",
                    # Legacy loader documents carry no format; they are
                    # labelled ".md" by default.
                    "format": d.get("format", ".md"),
                    "page_count": page_count
                })
    return all_chunks
if __name__ == "__main__":
    # Smoke test: chunk a repetitive 4000-character sample, then report
    # how many chunks were produced and show the first one.
    demo_text = "This is a test text " * 200
    demo_chunks = chunk_text(demo_text, max_tokens=50, overlap=10)
    print(f"Generated {len(demo_chunks)} chunks")
    print(demo_chunks[0])