# RAG-document-assistant/ingestion/chunker.py
"""
Text chunking utility for RAG ingestion.
Inputs: list of docs from load_docs.py or docling_loader.py
Output: list of chunks with metadata
Supports:
- Simple character-based chunking (legacy)
- Structure-aware chunking using Docling elements
"""
from typing import List, Dict, Optional, Any
def chunk_text(
    text: str,
    max_tokens: int = 300,
    overlap: int = 50
) -> List[str]:
    """
    Simple character-window chunking.

    Assumes ~1 token ≈ 4 chars (rough approximation): each window is
    ``max_tokens * 4`` characters wide and consecutive windows share
    ``overlap * 4`` characters.

    Args:
        text: Text to chunk
        max_tokens: Maximum tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of stripped, non-empty text chunks

    Raises:
        ValueError: If max_tokens is not positive, overlap is negative,
            or overlap >= max_tokens
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")
    if overlap >= max_tokens:
        raise ValueError(f"overlap ({overlap}) must be less than max_tokens ({max_tokens})")
    approx_chars = max_tokens * 4
    # overlap < max_tokens is enforced above, so the step is always
    # positive and the scan always advances (the original code carried an
    # unreachable "start <= 0" guard for this; it has been removed).
    step = approx_chars - overlap * 4
    chunks = []
    for start in range(0, len(text), step):
        chunk = text[start:start + approx_chars].strip()
        if chunk:
            chunks.append(chunk)
    return chunks
def chunk_documents(docs: List[Dict], max_tokens: int = 300, overlap: int = 50):
    """
    Chunk a list of documents into smaller pieces for embedding.

    Documents whose "status" is not "OK" are skipped silently.

    Args:
        docs: List of document dictionaries with 'filename' and 'text' keys
        max_tokens: Maximum tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of chunk dictionaries with filename, chunk_id, text, and chars keys

    Raises:
        TypeError: If docs is not a list or contains non-dict elements
        KeyError: If required keys are missing from document dictionaries
    """
    if not isinstance(docs, list):
        raise TypeError("docs must be a list")
    results = []
    for doc in docs:
        if not isinstance(doc, dict):
            raise TypeError("Each document must be a dictionary")
        if doc.get("status") != "OK":
            continue
        name = doc["filename"]
        pieces = chunk_text(doc["text"], max_tokens=max_tokens, overlap=overlap)
        results.extend(
            {"filename": name, "chunk_id": idx, "text": piece, "chars": len(piece)}
            for idx, piece in enumerate(pieces)
        )
    return results
def chunk_by_structure(
    elements: List[Any],
    max_tokens: int = 300,
    overlap: int = 50,
    keep_tables_intact: bool = True,
    include_heading_context: bool = True
) -> List[Dict]:
    """
    Structure-aware chunking using Docling document elements.

    Groups content by semantic boundaries (headings, tables) rather than
    arbitrary character counts. Falls back to character-based splitting
    for oversized elements.

    Args:
        elements: List of DocumentElement objects from docling_loader.
            Elements are read via ``element_type`` / ``text`` attributes;
            missing attributes fall back to "paragraph" / str(element).
        max_tokens: Maximum tokens per chunk (approx 4 chars/token)
        overlap: Token overlap for split elements
        keep_tables_intact: Keep tables as single chunks even if large
        include_heading_context: Prepend parent heading to chunks

    Returns:
        List of chunk dicts with element_type and section metadata.
        The "chars" field always equals len(chunk["text"]).
    """
    if not elements:
        return []
    max_chars = max_tokens * 4
    chunks: List[Dict] = []
    current_heading = ""
    # Accumulate extracted text (not raw elements) so the flush never
    # depends on elements having a .text attribute -- the main loop
    # already tolerates attribute-less elements via getattr.
    section_texts: List[str] = []
    current_chars = 0

    def flush_section():
        """Emit the accumulated section texts as one chunk and reset."""
        nonlocal section_texts, current_chars
        if not section_texts:
            return
        combined = "\n\n".join(section_texts)
        if combined.strip():
            # Prepend heading context if available
            if include_heading_context and current_heading:
                combined = f"## {current_heading}\n\n{combined}"
            body = combined.strip()
            chunks.append({
                "text": body,
                "chars": len(body),  # matches the stored (stripped) text
                "element_type": "section",
                "section_heading": current_heading,
                "element_count": len(section_texts)
            })
        section_texts = []
        current_chars = 0

    for element in elements:
        el_type = getattr(element, "element_type", "paragraph")
        el_text = getattr(element, "text", str(element))
        el_chars = len(el_text)

        # Headings never emit content themselves - they start a new section
        if el_type == "heading":
            flush_section()
            current_heading = el_text
            continue

        # Tables are kept whole (even above max_chars) when configured
        if el_type == "table" and keep_tables_intact:
            flush_section()
            table_text = el_text
            if include_heading_context and current_heading:
                table_text = f"## {current_heading}\n\n{el_text}"
            table_text = table_text.strip()
            chunks.append({
                "text": table_text,
                "chars": len(table_text),
                "element_type": "table",
                "section_heading": current_heading,
                "element_count": 1
            })
            continue

        # Close the open section before it would overflow the window
        if current_chars + el_chars > max_chars and section_texts:
            flush_section()

        # A single element larger than the window is split character-wise
        if el_chars > max_chars:
            flush_section()
            sub_chunks = chunk_text(el_text, max_tokens=max_tokens, overlap=overlap)
            prefix = ""
            if include_heading_context and current_heading:
                prefix = f"## {current_heading}\n\n"
            for i, sub_text in enumerate(sub_chunks):
                piece = f"{prefix}{sub_text}".strip()
                chunks.append({
                    "text": piece,
                    "chars": len(piece),
                    "element_type": f"{el_type}_split",
                    "section_heading": current_heading,
                    "split_index": i,
                    "element_count": 1
                })
            continue

        # Accumulate element text in the current section
        section_texts.append(el_text)
        current_chars += el_chars

    # Flush remaining content
    flush_section()
    return chunks
def chunk_documents_with_structure(
    docs: List[Dict],
    max_tokens: int = 300,
    overlap: int = 50,
    keep_tables_intact: bool = True,
    use_structure: bool = True
) -> List[Dict]:
    """
    Chunk documents using structure-aware or legacy chunking.

    Documents whose "status" is not "OK" are skipped. A document with
    Docling elements uses structure-aware chunking; otherwise its raw
    text is chunked by character windows.

    Args:
        docs: List of document dicts (from docling_loader or load_docs)
        max_tokens: Maximum tokens per chunk
        overlap: Token overlap between chunks
        keep_tables_intact: Keep tables as single chunks
        use_structure: Use structure-aware chunking if elements available

    Returns:
        List of chunk dicts with metadata
    """
    if not isinstance(docs, list):
        raise TypeError("docs must be a list")

    all_chunks: List[Dict] = []
    for doc in docs:
        if not isinstance(doc, dict):
            raise TypeError("Each document must be a dictionary")
        if doc.get("status", "") != "OK":
            continue

        fname = doc.get("filename", "unknown")
        elements = doc.get("elements", [])

        if use_structure and elements:
            # Structure-aware path: delegate to chunk_by_structure
            structured = chunk_by_structure(
                elements,
                max_tokens=max_tokens,
                overlap=overlap,
                keep_tables_intact=keep_tables_intact
            )
            for idx, piece in enumerate(structured):
                all_chunks.append({
                    "filename": fname,
                    "chunk_id": idx,
                    "text": piece["text"],
                    "chars": piece["chars"],
                    "element_type": piece.get("element_type", "section"),
                    "section_heading": piece.get("section_heading", ""),
                    "format": doc.get("format", ""),
                    "page_count": doc.get("page_count", 0)
                })
            continue

        # Legacy fallback: plain character-window chunking of raw text
        body = doc.get("text", "")
        if not body:
            continue
        for idx, piece in enumerate(chunk_text(body, max_tokens=max_tokens, overlap=overlap)):
            all_chunks.append({
                "filename": fname,
                "chunk_id": idx,
                "text": piece,
                "chars": len(piece),
                "element_type": "text",
                "section_heading": "",
                "format": doc.get("format", ".md"),
                "page_count": 0
            })
    return all_chunks
if __name__ == "__main__":
    # Quick smoke test when the module is run directly.
    demo_text = "This is a test text " * 200
    demo_chunks = chunk_text(demo_text, max_tokens=50, overlap=10)
    print(f"Generated {len(demo_chunks)} chunks")
    print(demo_chunks[0])