# test_fetch.py
"""
Smart test script that handles existing data correctly.

Tests three things:
1. Can we load existing papers from disk?
2. Can we fetch NEW papers (beyond what we have)?
3. Is our data schema correct?
"""

import json
from pathlib import Path

from src.utils.logger import setup_logger, get_logger
from src.ingestion.arxiv_fetcher import ArXivFetcher
from config.settings import RAW_DIR

setup_logger()
logger = get_logger(__name__)


def test_existing_data():
    """Check what we already have on disk.

    Scans ``RAW_DIR`` for ``*.json`` files (skipping ``paper_index.json``),
    logs how many were found, and loads up to the first three of them to
    log a short summary of each.

    Returns:
        A list with the parsed JSON content of the papers that were shown
        (at most three). Empty when no paper files exist or none of the
        sampled files could be read.
    """
    paper_files = [
        f for f in RAW_DIR.glob("*.json")
        if f.name != "paper_index.json"
    ]

    logger.info(f"Papers already on disk: {len(paper_files)}")

    if not paper_files:
        logger.warning("No papers found on disk. Run fetch first.")
        return []

    papers = []
    for pf in paper_files[:3]:  # Show first 3
        try:
            # Explicit UTF-8: JSON text is UTF-8 by specification, and the
            # platform default encoding may not be able to decode it.
            with open(pf, encoding="utf-8") as f:
                data = json.load(f)
            paper_id = data["paper_id"]
            short_title = data["title"][:60]
            category = data["primary_categories"]
            published = data["published_date"]
        except (OSError, ValueError, KeyError, TypeError) as e:
            # One unreadable, corrupt, or schema-mismatched file should be
            # reported, not abort the whole check. ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError.
            logger.error(f" -> Could not read {pf.name}: {e!r}")
            continue

        papers.append(data)
        logger.info(f" -> {paper_id}: {short_title}...")
        logger.info(f" Category: {category} | Date: {published}")

    return papers


def test_schema_validation():
    """Verify our Pydantic schema works correctly.

    Builds a ``PaperMetadata`` from a raw, versioned arXiv ID and a title
    padded with whitespace, then checks that the model's validators
    normalised both fields.

    Returns:
        True when the model could be constructed and both cleaned values
        match the expectations, False otherwise (the reason is logged).
    """
    from src.ingestion.arxiv_fetcher import PaperMetadata

    logger.info("Testing schema validation...")

    # Test with valid data
    try:
        paper = PaperMetadata(
            paper_id="http://arxiv.org/abs/2301.07041v2",  # Raw ID with version
            title=" Test Paper With Extra Spaces ",
            abstract="This is a test abstract.",
            authors=["Author One", "Author Two"],
            categories=["cs.LG", "cs.AI"],
            primary_categories="cs.LG",
            published_date="2023-01-17",
            updated_date="2023-03-15",
            arxiv_url="https://arxiv.org/abs/2301.07041",
            pdf_url="https://arxiv.org/pdf/2301.07041",
        )
    except Exception as e:
        # Broad on purpose: any construction failure counts as a failed
        # schema test and is reported rather than propagated.
        logger.error(f" -> Schema validation FAILED: {e}")
        return False

    # Verify our validators ran. Explicit comparisons instead of `assert`
    # so the checks still run when Python is started with -O.
    if paper.paper_id != "2301.07041":
        logger.error(
            f" -> Schema validation FAILED: ID cleanup failed: {paper.paper_id}"
        )
        return False
    if paper.title != "Test Paper With Extra Spaces":
        logger.error(
            f" -> Schema validation FAILED: Whitespace cleanup failed: {paper.title}"
        )
        return False

    logger.info(" -> Schema validation: PASSED")
    logger.info(f" paper_id cleaned: '{paper.paper_id}'")
    logger.info(f" title cleaned: '{paper.title}'")
    return True


def test_fresh_fetch(n: int = 3):
    """
    Fetch papers, but temporarily ignore existing index
    to force fresh results for testing.

    Args:
        n: Maximum number of papers to request from the fetcher.

    Returns:
        Whatever ``ArXivFetcher.fetch_papers`` returns for ``max_papers=n``.
    """
    logger.info(f"Fetching {n} fresh papers from ArXiv...")

    fetcher = ArXivFetcher()

    # TEMPORARY: clear existing IDs in memory only (not on disk)
    # This lets us test the fetch logic without deleting real data
    original_ids = fetcher.existing_ids.copy()
    fetcher.existing_ids = set()  # Pretend we have nothing

    try:
        papers = fetcher.fetch_papers(max_papers=n)
    finally:
        # Restore original IDs even if the fetch raised, so the fetcher is
        # never left in the "pretend empty" state.
        fetcher.existing_ids = original_ids

    if papers:
        logger.info(f" -> Fresh fetch: PASSED. Got {len(papers)} papers")
        for p in papers:
            logger.info(f" {p.paper_id}: {p.title[:55]}...")
    else:
        logger.warning(" -> Fresh fetch returned 0 papers. Check network connection.")

    return papers


def main():
    """Run the three ingestion checks in order and log a summary."""
    logger.info("=" * 55)
    logger.info("RESEARCHPILOT — INGESTION TEST SUITE")
    logger.info("=" * 55)

    # Test 1: Existing data
    logger.info("\n[TEST 1] Checking existing data on disk...")
    existing = test_existing_data()

    # Test 2: Schema validation
    logger.info("\n[TEST 2] Schema validation...")
    schema_ok = test_schema_validation()

    # Test 3: Fresh fetch
    logger.info("\n[TEST 3] Fresh fetch from ArXiv...")
    fresh = test_fresh_fetch(n=3)

    logger.info("\n" + "=" * 55)
    logger.info("TEST SUITE COMPLETE")
    logger.info(f"Existing papers: {len(existing)} shown (may have more)")
    # Surface the schema result so a failure is visible in the summary
    # instead of being buried in the earlier log output.
    logger.info(f"Schema validation: {'PASSED' if schema_ok else 'FAILED'}")
    logger.info(f"Fresh papers fetched: {len(fresh)}")
    logger.info("=" * 55)


if __name__ == "__main__":
    main()