Spaces:

vn6295337
/

RAG-document-assistant

Sleeping

App Files Files Community

RAG-document-assistant / src /ingestion /chunker.py

vn6295337

Add Docling integration for multi-format document processing

7e07738 4 months ago

raw

history blame contribute delete

9.66 kB

	# RAG-document-assistant/src/ingestion/chunker.py
	"""
	Text chunking utility for RAG ingestion.
	Inputs: list of docs from load_docs.py or docling_loader.py
	Output: list of chunks with metadata

	Supports:
	- Simple character-based chunking (legacy)
	- Structure-aware chunking using Docling elements
	"""

	from typing import List, Dict, Optional, Any

	def chunk_text(
	    text: str,
	    max_tokens: int = 300,
	    overlap: int = 50
	) -> List[str]:
	    """
	    Split text into overlapping, fixed-size character windows.

	    Token counts are converted to characters with a rough ~4 chars per
	    token approximation; no tokenizer is involved, so windows may cut
	    through words. Each window is stripped of surrounding whitespace and
	    windows that are empty after stripping are dropped.

	    Args:
	        text: Text to chunk
	        max_tokens: Maximum tokens per chunk
	        overlap: Number of tokens shared by consecutive chunks

	    Returns:
	        List of text chunks (empty for empty or whitespace-only text)

	    Raises:
	        ValueError: If max_tokens is not positive, overlap is negative,
	            or overlap is not smaller than max_tokens
	    """
	    if max_tokens <= 0:
	        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
	    if overlap < 0:
	        raise ValueError(f"overlap must be non-negative, got {overlap}")
	    if overlap >= max_tokens:
	        raise ValueError(f"overlap ({overlap}) must be less than max_tokens ({max_tokens})")

	    window = max_tokens * 4
	    # The validation above guarantees overlap < max_tokens, so the step is
	    # always positive and the loop is guaranteed to advance.
	    step = window - overlap * 4
	    text_len = len(text)

	    chunks = []
	    for start in range(0, text_len, step):
	        end = start + window
	        piece = text[start:end].strip()
	        if piece:
	            chunks.append(piece)

	        # Once a window reaches the end of the text, any further window
	        # would start inside the overlap region and only repeat the tail
	        # of this chunk, so stop here.
	        if end >= text_len:
	            break

	    return chunks


	def chunk_documents(docs: List[Dict], max_tokens: int = 300, overlap: int = 50):
	    """
	    Split every successfully loaded document into character-based chunks.

	    Documents whose 'status' is anything other than "OK" are skipped.
	    Chunk ids restart at 0 for each document.

	    Args:
	        docs: List of document dictionaries with 'filename' and 'text' keys
	        max_tokens: Maximum tokens per chunk
	        overlap: Number of tokens to overlap between chunks

	    Returns:
	        List of chunk dictionaries with filename, chunk_id, text, and chars keys

	    Raises:
	        TypeError: If docs is not a list or contains non-dict elements
	        KeyError: If an "OK" document lacks the 'filename' or 'text' key
	    """
	    if not isinstance(docs, list):
	        raise TypeError("docs must be a list")

	    collected = []
	    for doc in docs:
	        if not isinstance(doc, dict):
	            raise TypeError("Each document must be a dictionary")

	        # Only documents the loader marked as successfully read are chunked.
	        if doc.get("status") == "OK":
	            source_name = doc["filename"]
	            pieces = chunk_text(doc["text"], max_tokens=max_tokens, overlap=overlap)
	            collected.extend(
	                {
	                    "filename": source_name,
	                    "chunk_id": index,
	                    "text": piece,
	                    "chars": len(piece)
	                }
	                for index, piece in enumerate(pieces)
	            )

	    return collected


	def _element_text(element: Any) -> str:
	    """Return an element's text, or ``str(element)`` when it has no ``text`` attribute."""
	    if not hasattr(element, "text"):
	        return str(element)
	    value = element.text
	    # A present-but-None text is treated as empty instead of crashing on len().
	    return "" if value is None else str(value)


	def chunk_by_structure(
	    elements: List[Any],
	    max_tokens: int = 300,
	    overlap: int = 50,
	    keep_tables_intact: bool = True,
	    include_heading_context: bool = True
	) -> List[Dict]:
	    """
	    Structure-aware chunking of a sequence of document elements.

	    Elements are read through their ``element_type`` attribute (default
	    "paragraph") and ``text`` attribute (default ``str(element)``).
	    Consecutive non-heading, non-table elements are accumulated into a
	    "section" chunk until the character budget (max_tokens * 4) would be
	    exceeded. A heading starts a new section and becomes the heading
	    context of the chunks that follow it. A single element larger than the
	    budget is split with chunk_text.

	    The budget counts element text only; the joining blank lines and the
	    optional heading prefix are not included, so a chunk can be slightly
	    longer than max_tokens * 4 characters.

	    Args:
	        elements: List of element objects (e.g. from docling_loader)
	        max_tokens: Maximum tokens per chunk (approx 4 chars/token)
	        overlap: Token overlap used when an oversized element is split
	        keep_tables_intact: Emit each table as one chunk regardless of size
	        include_heading_context: Prepend "## <heading>" to each chunk

	    Returns:
	        List of chunk dicts with text, chars, element_type, section_heading
	        and element_count keys (plus split_index for split elements).
	        "chars" is always the length of the emitted "text".

	    Raises:
	        ValueError: From chunk_text, when an oversized element has to be
	            split and overlap is not smaller than max_tokens
	    """
	    if not elements:
	        return []

	    max_chars = max_tokens * 4
	    chunks: List[Dict] = []
	    current_heading = ""
	    section_texts: List[str] = []
	    section_chars = 0

	    def add_chunk(body, element_type, element_count, split_index=None):
	        """Append one chunk, applying the heading prefix and final strip."""
	        prefix = ""
	        if include_heading_context and current_heading:
	            prefix = f"## {current_heading}\n\n"
	        text = f"{prefix}{body}".strip()
	        chunk = {
	            "text": text,
	            # Measured on the stripped text so it always matches "text".
	            "chars": len(text),
	            "element_type": element_type,
	            "section_heading": current_heading,
	        }
	        if split_index is not None:
	            chunk["split_index"] = split_index
	        chunk["element_count"] = element_count
	        chunks.append(chunk)

	    def flush_section():
	        """Emit the accumulated section (if it has content) and reset it."""
	        nonlocal section_texts, section_chars
	        if section_texts:
	            body = "\n\n".join(section_texts)
	            if body.strip():
	                add_chunk(body, "section", len(section_texts))
	        section_texts = []
	        section_chars = 0

	    for element in elements:
	        el_type = getattr(element, "element_type", "paragraph")
	        el_text = _element_text(element)
	        el_chars = len(el_text)

	        # A heading closes the running section and becomes the new context.
	        if el_type == "heading":
	            flush_section()
	            current_heading = el_text
	            continue

	        # Tables are emitted whole when configured; blank tables are
	        # skipped so no empty or heading-only chunk is produced.
	        if el_type == "table" and keep_tables_intact:
	            flush_section()
	            if el_text.strip():
	                add_chunk(el_text, "table", 1)
	            continue

	        # Close the running section if this element would overflow it.
	        if section_texts and section_chars + el_chars > max_chars:
	            flush_section()

	        # An element that alone exceeds the budget is split by characters.
	        # Any pending section was already flushed by the check above.
	        if el_chars > max_chars:
	            pieces = chunk_text(el_text, max_tokens=max_tokens, overlap=overlap)
	            for index, piece in enumerate(pieces):
	                add_chunk(piece, f"{el_type}_split", 1, split_index=index)
	            continue

	        # Store the extracted text, not the element, so elements without
	        # a ``text`` attribute are handled the same way everywhere.
	        section_texts.append(el_text)
	        section_chars += el_chars

	    # Flush remaining content
	    flush_section()

	    return chunks


	def chunk_documents_with_structure(
	    docs: List[Dict],
	    max_tokens: int = 300,
	    overlap: int = 50,
	    keep_tables_intact: bool = True,
	    use_structure: bool = True
	) -> List[Dict]:
	    """
	    Chunk documents with structure-aware chunking, or plain text chunking.

	    A document is chunked by structure when use_structure is true and its
	    'elements' entry is non-empty; otherwise its 'text' entry is chunked
	    by characters. Documents whose 'status' is not "OK", and fallback
	    documents with empty text, contribute no chunks. Chunk ids restart at
	    0 for each document.

	    NOTE(review): the text fallback always reports page_count 0 and
	    defaults 'format' to ".md", even if the document carries a
	    'page_count' value - confirm this is intended.

	    Args:
	        docs: List of document dicts (from docling_loader or load_docs)
	        max_tokens: Maximum tokens per chunk
	        overlap: Token overlap between chunks
	        keep_tables_intact: Keep tables as single chunks
	        use_structure: Use structure-aware chunking if elements available

	    Returns:
	        List of chunk dicts with filename, chunk_id, text, chars,
	        element_type, section_heading, format and page_count keys

	    Raises:
	        TypeError: If docs is not a list or contains non-dict elements
	    """
	    if not isinstance(docs, list):
	        raise TypeError("docs must be a list")

	    results = []

	    for doc in docs:
	        if not isinstance(doc, dict):
	            raise TypeError("Each document must be a dictionary")

	        if doc.get("status", "") != "OK":
	            continue

	        source_name = doc.get("filename", "unknown")
	        doc_elements = doc.get("elements", [])

	        if not (use_structure and doc_elements):
	            # Legacy path: plain character-based chunking of the raw text.
	            body = doc.get("text", "")
	            if body:
	                pieces = chunk_text(body, max_tokens=max_tokens, overlap=overlap)
	                for index, piece in enumerate(pieces):
	                    results.append({
	                        "filename": source_name,
	                        "chunk_id": index,
	                        "text": piece,
	                        "chars": len(piece),
	                        "element_type": "text",
	                        "section_heading": "",
	                        "format": doc.get("format", ".md"),
	                        "page_count": 0
	                    })
	            continue

	        # Structure-aware path: chunk along element boundaries.
	        structured = chunk_by_structure(
	            doc_elements,
	            max_tokens=max_tokens,
	            overlap=overlap,
	            keep_tables_intact=keep_tables_intact
	        )
	        for index, piece in enumerate(structured):
	            results.append({
	                "filename": source_name,
	                "chunk_id": index,
	                "text": piece["text"],
	                "chars": piece["chars"],
	                "element_type": piece.get("element_type", "section"),
	                "section_heading": piece.get("section_heading", ""),
	                "format": doc.get("format", ""),
	                "page_count": doc.get("page_count", 0)
	            })

	    return results


	if __name__ == "__main__":
	    # Smoke test: chunk a repeated sentence and show the first chunk.
	    sample_text = "This is a test text " * 200
	    pieces = chunk_text(sample_text, max_tokens=50, overlap=10)
	    print(f"Generated {len(pieces)} chunks")
	    print(pieces[0])