Spaces:
Running
Running
"""
Verify chunk quality across the full dataset.
Run this before embedding to catch any data issues early.
"""
| import json | |
| from pathlib import Path | |
| from config.settings import CHUNKS_DIR | |
| from src.utils.logger import setup_logger, get_logger | |
# Initialise logging through the project helper. This runs at import time,
# so it happens before main() emits its first log record.
# NOTE(review): presumably setup_logger() installs handlers/formatters;
# confirm against src.utils.logger.
setup_logger()
# Module-level logger that main() writes its progress message to.
logger = get_logger(__name__)
def main():
    """Print a chunk-quality report followed by pass/fail quality gates.

    Loads every ``*_semantic.json`` file found in ``CHUNKS_DIR``. Each file
    is iterated as a JSON list of chunk dicts. Every chunk must provide
    ``word_count`` and ``text``; the (up to three) chunks shown as samples
    must also provide ``paper_id`` and ``chunking_strategy``.

    The report covers the total chunk count, the average words per chunk,
    the share of tiny (< 50 words) and giant (> 600 words) chunks, and the
    share of chunks whose text ends in ``.``, ``!`` or ``?``.

    An empty dataset (no files, or files with no chunks) is reported with
    all shares at 0 rather than raising ``ZeroDivisionError``; the
    "Total chunks" gate still fails in that case, so the run is flagged.

    Returns:
        None. All results are written to stdout.

    Raises:
        KeyError: If a chunk dict lacks one of the keys listed above.
    """
    # glob() yields paths in arbitrary order; sort so the sample chunks
    # shown in the report are the same on every run.
    chunk_files = sorted(CHUNKS_DIR.glob("*_semantic.json"))
    logger.info(f"Checking {len(chunk_files)} chunk files...")

    total_chunks = 0
    total_words = 0
    tiny_chunks = 0    # < 50 words
    giant_chunks = 0   # > 600 words
    clean_endings = 0  # text ends with sentence punctuation
    sample_chunks = []  # first few chunks, kept for display

    for cf in chunk_files:
        with open(cf, encoding="utf-8") as f:
            chunks = json.load(f)
        for c in chunks:
            total_chunks += 1
            wc = c["word_count"]
            total_words += wc
            if wc < 50:
                tiny_chunks += 1
            if wc > 600:
                giant_chunks += 1
            if c["text"].rstrip().endswith((".", "!", "?")):
                clean_endings += 1
            if len(sample_chunks) < 3:
                sample_chunks.append(c)

    def share(count, scale=1):
        # Fraction (scale=1) or percentage (scale=100) of all chunks.
        # Returns 0.0 for an empty dataset instead of dividing by zero.
        return scale * count / total_chunks if total_chunks else 0.0

    avg_words = total_words / total_chunks if total_chunks else 0

    print(f"\n{'='*55}")
    print(" CHUNK QUALITY REPORT")
    print(f"{'='*55}")
    print(f" Total chunk files: {len(chunk_files)}")
    print(f" Total chunks: {total_chunks:,}")
    print(f" Avg words per chunk: {avg_words:.0f}")
    print(f" Tiny chunks (<50w): {tiny_chunks} ({share(tiny_chunks, 100):.1f}%)")
    print(f" Giant chunks (>600w): {giant_chunks} ({share(giant_chunks, 100):.1f}%)")
    print(f" Clean endings: {clean_endings} ({share(clean_endings, 100):.1f}%)")
    print()
    print(" SAMPLE CHUNKS:")
    print(f" {'-'*50}")
    for i, c in enumerate(sample_chunks):
        print(f" [{i+1}] Paper: {c['paper_id']}")
        print(f" Words: {c['word_count']} | Strategy: {c['chunking_strategy']}")
        # chr(10) is "\n": flatten newlines so the preview stays on one line.
        print(f" Text: {c['text'][:120].replace(chr(10), ' ')}...")
        print()

    # Quality gates - these thresholds indicate healthy chunking
    print(f"{'='*55}")
    print(" QUALITY GATES")
    print(f"{'='*55}")
    gates = [
        ("Total chunks > 10,000", total_chunks > 10_000),
        ("Avg words 100-400", 100 <= avg_words <= 400),
        ("Tiny chunks < 15%", share(tiny_chunks) < 0.15),
        ("Clean endings > 70%", share(clean_endings) > 0.70),
    ]
    for name, passed in gates:
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f" {status} {name}")
    all_pass = all(passed for _, passed in gates)

    print()
    if all_pass:
        print(" ✅ All quality gates passed. Ready for Phase 6.")
    else:
        print(" ⚠️ Some gates failed. Review before proceeding.")
# Run the report only when this file is executed as a script, not when it
# is imported by another module.
if __name__ == "__main__":
    main()