researchpilot-api / run_chunking.py
Subhadip007's picture
feat: document chunking pipeline complete
511a4f9
"""
Run the chunking pipeline on all processed documents.
OPTIMIZATION: Checks existing chunks before loading model,
so if everything is already chunked, we exit immediately
without loading 110MB embedding model.
"""
import json
from pathlib import Path
from src.utils.logger import setup_logger, get_logger
from src.processing.chunker import ChunkingPipeline
from config.settings import PROCESSED_DIR, CHUNKS_DIR
setup_logger()
logger = get_logger(__name__)
def count_remaining(strategy: str) -> int:
"""Count how many papers still need chunking."""
processed = list(PROCESSED_DIR.glob("*.json"))
remaining = 0
for f in processed:
paper_id = f.stem
output_path = CHUNKS_DIR / f"{paper_id}_{strategy}.json"
if not output_path.exists():
remaining += 1
return remaining
def main():
strategy = 'semantic'
remaining = count_remaining(strategy)
logger.info(f"Papers remaining to chunk: {remaining}")
if remaining == 0:
logger.info("All papers already chunked. Nothing to do.")
# Print summary of existing chunks
chunk_files = list(CHUNKS_DIR.glob(f"*_{strategy}.json"))
total = 0
for cf in chunk_files:
with open(cf) as f:
chunks = json.load()
total += len(chunks)
logger.info(f"Existing chunks: {total} across {len(chunk_files)} papers")
logger.info(f"Starting chunking pipeline for {remaining} papers...")
pipeline = ChunkingPipeline(strategy = strategy)
stats = pipeline.run(PROCESSED_DIR)
logger.info(f"Done: {stats}")
if __name__ == "__main__":
main()