"""
Storage module for saving and loading processed document chunks.
"""

import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional

from .models import DocumentChunk, ProcessingStats

logger = logging.getLogger(__name__)


class ChunkStorage:
    """Save document chunks to a JSON file and load them back.

    Saving produces two files: the JSON payload at ``output_file`` and a
    human-readable summary named ``SUMMARY_FILENAME`` in the same directory.
    The summary name does not depend on ``output_file``, so storages that
    share a directory also share (and overwrite) one summary file.
    """

    # Written to metadata['version'] of every saved payload.
    FORMAT_VERSION = '1.0'
    # Name of the text summary created next to the JSON file.
    SUMMARY_FILENAME = "chunks_summary.txt"
    # Number of leading chunks shown in the summary.
    SAMPLE_CHUNK_COUNT = 5
    # Number of characters of each sampled chunk's text shown as a preview.
    PREVIEW_CHARS = 200

    def __init__(self, output_file: Path):
        """
        Initialize storage and make sure the output directory exists.

        Args:
            output_file: Path to output JSON file. A plain string path is
                accepted too; it is converted to ``Path``.
        """
        # Coerce first: the rest of the class relies on Path attributes
        # (.parent, .name, .replace), which a str does not have.
        self.output_file = Path(output_file)
        self.output_file.parent.mkdir(parents=True, exist_ok=True)

    def save_chunks(
        self,
        chunks: List["DocumentChunk"],
        stats: Optional["ProcessingStats"] = None,
    ) -> None:
        """
        Save chunks to the JSON file, then write the text summary.

        The payload has a 'metadata' object ('total_chunks', 'version' and,
        when stats are given, 'processing_stats') and a 'chunks' list built
        from each chunk's ``to_dict()``.

        Args:
            chunks: List of DocumentChunk objects
            stats: Optional processing statistics
        """
        logger.info("Saving %s chunks to %s", len(chunks), self.output_file)

        chunks_data = [chunk.to_dict() for chunk in chunks]

        metadata: Dict[str, Any] = {
            'total_chunks': len(chunks),
            'version': self.FORMAT_VERSION,
        }
        # Test for presence, not truthiness, so a stats object that happens
        # to evaluate as false is still recorded.
        if stats is not None:
            metadata['processing_stats'] = stats.to_dict()

        self._write_json_atomic({'metadata': metadata, 'chunks': chunks_data})
        logger.info("Successfully saved chunks to %s", self.output_file)

        self._save_summary(chunks, stats)

    def _write_json_atomic(self, payload: Dict[str, Any]) -> None:
        """Write *payload* as JSON to ``output_file`` via a temporary sibling.

        Writing directly into ``output_file`` would truncate an existing file
        before serialization finishes, so any failure (for example a value
        that is not JSON serializable) would destroy previously saved data.
        """
        tmp_file = self.output_file.with_name(self.output_file.name + ".tmp")
        try:
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(payload, f, indent=2, ensure_ascii=False)
            # Swap the finished file into place in a single rename.
            tmp_file.replace(self.output_file)
        except Exception:
            # Remove the partial temp file; output_file itself is untouched.
            try:
                tmp_file.unlink()
            except FileNotFoundError:
                pass
            raise

    def _save_summary(
        self,
        chunks: List["DocumentChunk"],
        stats: Optional["ProcessingStats"] = None,
    ) -> None:
        """Save a human-readable summary next to the JSON file.

        The summary lists the overall statistics (only when *stats* is given)
        followed by the first ``SAMPLE_CHUNK_COUNT`` chunks, each with a
        preview of at most ``PREVIEW_CHARS`` characters of its text.
        """
        summary_file = self.output_file.parent / self.SUMMARY_FILENAME
        heavy_rule = "=" * 80
        light_rule = "-" * 80

        # Build everything first so the file is only opened once all
        # attribute lookups and formatting have succeeded.
        lines = [
            f"{heavy_rule}\n",
            "DOCUMENT CHUNKS SUMMARY\n",
            f"{heavy_rule}\n\n",
        ]

        if stats is not None:
            lines += [
                f"Total Documents Processed: {stats.total_documents}\n",
                f"Total Chunks Created: {stats.total_chunks}\n",
                f"Total Words: {stats.total_words}\n",
                f"Average Chunk Size: {stats.avg_chunk_size:.1f} words\n",
                f"Processing Time: {stats.processing_time_seconds:.2f} seconds\n",
                "\nDocuments:\n",
            ]
            lines += [f" - {doc}\n" for doc in stats.documents_processed]
            lines.append("\n")

        lines += [
            f"{light_rule}\n",
            f"SAMPLE CHUNKS (First {self.SAMPLE_CHUNK_COUNT})\n",
            f"{light_rule}\n\n",
        ]

        for i, chunk in enumerate(chunks[:self.SAMPLE_CHUNK_COUNT], 1):
            lines += [
                f"Chunk {i}: {chunk.chunk_id}\n",
                f"Source: {chunk.metadata.source_file}\n",
                f"Section: {chunk.metadata.article_section or 'N/A'}\n",
                f"Words: {chunk.metadata.word_count}\n",
                f"Preview: {chunk.text[:self.PREVIEW_CHARS]}...\n",
                f"\n{light_rule}\n\n",
            ]

        with open(summary_file, 'w', encoding='utf-8') as f:
            f.writelines(lines)

        logger.info("Summary saved to %s", summary_file)

    def load_chunks(self) -> List["DocumentChunk"]:
        """
        Load chunks from the JSON file.

        Returns:
            List of DocumentChunk objects, one per entry of the payload's
            'chunks' list, built with ``DocumentChunk.from_dict``.

        Raises:
            FileNotFoundError: If the JSON file does not exist.
            KeyError: If the payload has no 'chunks' entry.
        """
        logger.info("Loading chunks from %s", self.output_file)

        # Open directly instead of checking exists() first: the file could
        # disappear between the check and the open.
        try:
            with open(self.output_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            raise FileNotFoundError(
                f"Chunks file not found: {self.output_file}"
            ) from None

        chunks = [DocumentChunk.from_dict(chunk_data) for chunk_data in data['chunks']]

        logger.info("Loaded %s chunks", len(chunks))

        return chunks

    def validate_chunks(self, chunks: List["DocumentChunk"]) -> bool:
        """
        Validate chunks before saving.

        ``save_chunks`` does not call this itself; callers that want the
        check must invoke it explicitly.

        Args:
            chunks: List of chunks to validate

        Returns:
            True if valid, raises exception otherwise

        Raises:
            ValueError: If the list is empty, or at the first chunk with
                empty/whitespace-only text, no ID, or a zero word count.
        """
        if not chunks:
            raise ValueError("No chunks to save")

        for i, chunk in enumerate(chunks):
            if not chunk.text or not chunk.text.strip():
                raise ValueError(f"Chunk {i} has empty text")

            if not chunk.chunk_id:
                raise ValueError(f"Chunk {i} has no ID")

            if chunk.metadata.word_count == 0:
                raise ValueError(f"Chunk {i} has zero word count")

        logger.info("Validated %s chunks successfully", len(chunks))
        return True