| """ |
| Text chunking module |
| Intelligently splits legal documents into meaningful chunks |
| """ |
|
|
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from .config import (
    CHUNK_SIZE_MIN_WORDS,
    CHUNK_SIZE_MAX_WORDS,
    CHUNK_SIZE_TARGET_WORDS,
    CHUNK_OVERLAP_WORDS,
    COMPILED_SECTION_PATTERNS
)
from .models import DocumentChunk, ChunkMetadata
|
|
# Module-level logger, named after this module (standard logging convention).
logger = logging.getLogger(__name__)
|
|
|
|
class LegalDocumentChunker:
    """Chunks legal documents with section/article awareness.

    The document is first split into sections using the patterns in
    ``COMPILED_SECTION_PATTERNS``; any section longer than ``max_words`` is
    then split into overlapping word windows that prefer to break on
    sentence boundaries.
    """

    def __init__(
        self,
        min_words: int = CHUNK_SIZE_MIN_WORDS,
        max_words: int = CHUNK_SIZE_MAX_WORDS,
        target_words: int = CHUNK_SIZE_TARGET_WORDS,
        overlap_words: int = CHUNK_OVERLAP_WORDS
    ):
        """
        Initialize chunker.

        Args:
            min_words: Minimum words per chunk.
                NOTE(review): stored but not enforced anywhere in this class.
            max_words: Maximum words per chunk; sections at or below this
                size become a single chunk.
            target_words: Target words per chunk when splitting a long section.
            overlap_words: Words to overlap between consecutive chunks.
        """
        self.min_words = min_words
        self.max_words = max_words
        self.target_words = target_words
        self.overlap_words = overlap_words

    def chunk_document(
        self,
        text: str,
        source_file: str,
        pages_data: Optional[List[Dict[str, Any]]] = None
    ) -> List[DocumentChunk]:
        """
        Chunk a document into meaningful pieces.

        Args:
            text: Full document text.
            source_file: Source filename (its stem is embedded in chunk IDs).
            pages_data: Optional page data for page number tracking.
                NOTE(review): accepted but currently unused by this
                implementation — confirm whether page tracking is still planned.

        Returns:
            List of DocumentChunk objects, numbered sequentially across
            all sections of the document.
        """
        logger.info("Chunking document: %s", source_file)

        sections = self._split_by_sections(text)

        all_chunks: List[DocumentChunk] = []
        chunk_counter = 0

        for section_title, section_text in sections:
            section_chunks = self._chunk_section(
                section_text,
                section_title,
                source_file,
                chunk_counter
            )
            all_chunks.extend(section_chunks)
            # Keep chunk IDs globally sequential across sections.
            chunk_counter += len(section_chunks)

        logger.info("Created %d chunks from %s", len(all_chunks), source_file)

        return all_chunks

    def _split_by_sections(self, text: str) -> List[Tuple[Optional[str], str]]:
        """
        Split text by sections/articles.

        A line matching a section pattern starts a new section and is kept
        as the first line of that section's text. Text before the first
        detected section is emitted with a ``None`` title.

        Returns:
            List of (section_title, section_text) tuples. If no section
            markers are found, a single (None, full_text) entry is returned.
        """
        sections: List[Tuple[Optional[str], str]] = []
        current_section: Optional[str] = None
        current_text: List[str] = []

        for line in text.split('\n'):
            section_match = self._detect_section(line)

            if section_match:
                # Flush the accumulated text of the previous section.
                if current_text:
                    sections.append((current_section, '\n'.join(current_text)))

                current_section = section_match
                # The heading line belongs to the new section.
                current_text = [line]
            else:
                current_text.append(line)

        # Flush the trailing section.
        if current_text:
            sections.append((current_section, '\n'.join(current_text)))

        # No markers at all: treat the entire document as one untitled section.
        if not sections:
            sections.append((None, text))

        logger.info("Detected %d sections in document", len(sections))

        return sections

    def _detect_section(self, line: str) -> Optional[str]:
        """
        Detect if a line contains a section/article marker.

        Returns:
            Section title if detected, None otherwise.
        """
        for pattern in COMPILED_SECTION_PATTERNS:
            match = pattern.search(line)
            if match:
                groups = match.groups()
                # Combine "number" + "title" only when both groups actually
                # matched; match.groups() includes None for declared-but-
                # unmatched groups, which previously produced titles like
                # "Article 5. None".
                if len(groups) >= 2 and groups[0] and groups[1]:
                    return f"{groups[0]}. {groups[1]}"
                # Fall back to the full matched text.
                return match.group(0)

        return None

    def _chunk_section(
        self,
        section_text: str,
        section_title: Optional[str],
        source_file: str,
        start_counter: int
    ) -> List[DocumentChunk]:
        """
        Chunk a single section into appropriate sizes.

        Sections at or below ``max_words`` become one chunk. Longer sections
        are windowed at ``target_words`` with ``overlap_words`` of overlap,
        preferring to end each chunk at a sentence boundary when one falls
        in the second half of the window.

        Args:
            section_text: Text of the section.
            section_title: Title/identifier of the section.
            source_file: Source filename.
            start_counter: Starting chunk number for ID generation.

        Returns:
            List of chunks for this section.
        """
        words = section_text.split()
        word_count = len(words)

        # Small enough: emit the section as a single chunk.
        if word_count <= self.max_words:
            chunk = self._create_chunk(
                text=section_text,
                chunk_id=f"{Path(source_file).stem}_chunk_{start_counter:04d}",
                source_file=source_file,
                article_section=section_title
            )
            return [chunk]

        chunks: List[DocumentChunk] = []
        start_idx = 0
        chunk_num = start_counter
        # Safety valve: start_idx advances by >= 1 per iteration, so the
        # loop can never legitimately run more than word_count times.
        max_iterations = word_count
        iteration_count = 0

        while start_idx < word_count and iteration_count < max_iterations:
            iteration_count += 1

            end_idx = min(start_idx + self.target_words, word_count)

            # Defensive: should be unreachable given the loop condition.
            if end_idx <= start_idx:
                logger.warning(
                    "Chunking issue: end_idx (%d) <= start_idx (%d), breaking",
                    end_idx, start_idx
                )
                break

            # Try to end mid-document chunks on a sentence boundary.
            if end_idx < word_count:
                chunk_words = words[start_idx:end_idx]
                chunk_text = ' '.join(chunk_words)

                last_period = max(
                    chunk_text.rfind('. '),
                    chunk_text.rfind('! '),
                    chunk_text.rfind('? ')
                )

                # Only shrink to the boundary if it keeps at least half the
                # window, so chunks don't degenerate.
                if last_period > len(chunk_text) * 0.5:
                    words_before_period = chunk_text[:last_period + 1].split()
                    new_end_idx = start_idx + len(words_before_period)

                    if new_end_idx > start_idx:
                        end_idx = new_end_idx

            chunk_words = words[start_idx:end_idx]
            chunk_text = ' '.join(chunk_words)

            chunk = self._create_chunk(
                text=chunk_text,
                chunk_id=f"{Path(source_file).stem}_chunk_{chunk_num:04d}",
                source_file=source_file,
                article_section=section_title
            )
            chunks.append(chunk)

            # Overlap is capped so the next window always starts inside the
            # current one, guaranteeing forward progress.
            overlap = min(self.overlap_words, end_idx - start_idx - 1)
            next_start_idx = end_idx - overlap

            if next_start_idx <= start_idx:
                next_start_idx = start_idx + 1

            start_idx = next_start_idx
            chunk_num += 1

        if iteration_count >= max_iterations:
            logger.warning(
                "Hit max iterations (%d) while chunking section", max_iterations
            )

        return chunks

    def _create_chunk(
        self,
        text: str,
        chunk_id: str,
        source_file: str,
        article_section: Optional[str] = None
    ) -> DocumentChunk:
        """Create a DocumentChunk object with word/char count metadata."""
        words = text.split()

        metadata = ChunkMetadata(
            source_file=source_file,
            article_section=article_section,
            word_count=len(words),
            char_count=len(text)
        )

        return DocumentChunk(
            chunk_id=chunk_id,
            text=text,
            metadata=metadata
        )
|
|