Subhadip007 commited on
Commit
233102d
Β·
0 Parent(s):

feat: data ingestion and processing pipeline complete

Browse files

- ArXiv API fetcher with deduplication and rate limiting
- PDF downloader with exponential backoff retry
- PDF text extractor using PyMuPDF
- Text cleaner: hyphenation fix, artifact removal, reference stripping
- Pydantic schema validation for all paper metadata
- Idempotent pipeline: safe to re-run without reprocessing

Papers in system: 303 fetched, 270+ processed

.env.example ADDED
@@ -0,0 +1 @@
 
 
1
+ GROQ_API_KEY=your_groq_api_key_here
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ venv/
2
+ .env
3
+ __pycache__/
4
+ *.pyc
5
+ data/raw/
6
+ data/processed/
7
+ data/embeddings/
8
+ *.log
9
+ .DS_Store
.vscode/settings.json ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {
2
+ }
config/__init__.py ADDED
File without changes
config/settings.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Central configuration for ResearchPilot
3
+
4
+ RULE: No hardcoded values anywhere else in this codebase.
5
+ Every constant lives here. This make the system to
6
+ tune without hunting through multiple files.
7
+ """
8
+
9
+ import os
10
+ from pathlib import Path
11
+ from dotenv import load_dotenv
12
+
13
+ # Load environment variables from .env file
14
+ # This must happen before anything else reads os.environ
15
+ load_dotenv()
16
+
17
+ # ------------------------------------------
18
+ # PROJECT PATHS
19
+ # ------------------------------------------
20
+ # Path(__file__) = config/setting.py
21
+ # .parent = = config/
22
+ # .parent.parent = researchpilot/ <- project root
23
+ ROOT_DIR = Path(__file__).parent.parent
24
+
25
+ DATA_DIR = ROOT_DIR / "data"
26
+ RAW_DIR = DATA_DIR / "raw"
27
+ PROCESSED_DIR = DATA_DIR / "processed"
28
+ CHUNKS_DIR = DATA_DIR / "chunks"
29
+ EMBEDDINGS_DIR = DATA_DIR / "embeddings"
30
+ LOGS_DIR = ROOT_DIR / "logs"
31
+
32
+
33
+ # Create directories if they don't exist
34
+ # This ensures the app works on any machine without manual setup
35
+ for directory in [RAW_DIR, PROCESSED_DIR, CHUNKS_DIR, EMBEDDINGS_DIR, LOGS_DIR]:
36
+ directory.mkdir(
37
+ parents = True,
38
+ exist_ok = True
39
+ )
40
+
41
+ # ------------------------------------------
42
+ # DATA INGESTION SETTINGS
43
+ # ------------------------------------------
44
+ ARXIV_CATEGORIES = ["cs.LG", "cs.AI"] # Machine Learning + AI
45
+ MAX_PAPERS_PER_FETCH = 100 # Papers per API call
46
+ TOTAL_PAPERS_TARGET = 100 # Total papers to collect
47
+ ARXIV_API_DELAY_SECONDS = 3.0 # ArXiv rate limit: be respectful
48
+ PDF_DOWNLOAD_TIMEOUT = 30 # Seconds before giving up on a PDF
49
+ MAX_DOWNLOAD_RETRIES = 3 # Retry failed downloads N times
50
+
51
+ # ------------------------------------------
52
+ # DOCUMENT PROCESSING SETTINGS
53
+ # ------------------------------------------
54
+ MIN_TEXT_LENGTH = 500 # Skip papers with less that 500 chars
55
+ MAX_TEXT_LENGTH = 500_000 # Skip papers larger than 100k chars (corrupted)
56
+
57
+ # ------------------------------------------
58
+ # CHUNKING SETTINGS
59
+ # ------------------------------------------
60
+ CHUNK_SIZE = 512 # Charaters per chunk
61
+ CHUNK_OVERLAP = 50 # Overlap between consecutive chunks
62
+ MIN_CHUNK_SIZE = 100 # Discard chunks smaller than this
63
+
64
+ # ------------------------------------------
65
+ # EMBEDDING SETTINGS
66
+ # ------------------------------------------
67
+ EMBEDDING_MODEL_NAME = "BAAI/bge-base-en-v1.5"
68
+ EMBEDDING_BATCH_SIZE = 32 # Process N chunks at once
69
+ EMBEDDING_DIMENSION = 768 # BGE-base output dimension
70
+
71
+ # ------------------------------------------
72
+ # VECTOR STORE SETTINGS
73
+ # ------------------------------------------
74
+ QDRANT_COLLECTION_NAME = 'research_papers'
75
+ QDRANT_PATH = str(ROOT_DIR / 'data' / 'qdrant_db') # Local Storage path
76
+ TOP_K_RETRIEVAL = 20 # Retieve top 20 candidates
77
+ TOP_K_RERANK = 5 # Keep top 5 after reranking
78
+
79
+ # ------------------------------------------
80
+ # LLM SETTINGS
81
+ # ------------------------------------------
82
+ GROQ_API_KEY = os.getenv('GROQ_API_KEY') # Loaded from .env
83
+ LLM_MODEL_NAME = 'llama3-8b-8192' # Groq model ID
84
+ LLM_TEMPERATURE = 0.1 # Low = More factual/consistent
85
+ LLM_MAX_TOKENS = 1024 # Max response tokens
86
+
87
+ # ------------------------------------------
88
+ # API SETTINGS
89
+ # ------------------------------------------
90
+ API_HOST = "0.0.0.0"
91
+ API_PORT = 8000
92
+ API_RELOAD = True # Auto-reload on code change (dev-only)
93
+
94
+ # ------------------------------------------
95
+ # LOGGING SETTINGS
96
+ # ------------------------------------------
97
+ LOG_LEVEL = "INFO"
98
+ LOG_FILE = LOGS_DIR / "researchpilot.log"
99
+ LOG_ROTATION = "10 MB" # Create new log file after 10MB
100
+ LOG_RETENTION = "7 days" # Keep logs for 7 days
output.txt ADDED
Binary file (10.4 kB). View file
 
run_ingestion.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Master script to run the data ingestion pipeline.
3
+
4
+ Run this from the project root:
5
+ python run_ingestion.py
6
+
7
+ This script orchestrates:
8
+ 1. Fetch paper metadata from ArXiv
9
+ 2. Download PDFs for fetched papers
10
+ """
11
+
12
+ import json
13
+ from pathlib import Path
14
+
15
+ from src.utils.logger import get_logger, setup_logger
16
+ from src.ingestion.arxiv_fetcher import ArXivFetcher
17
+ from src.ingestion.pdf_downloader import PDFDownloader
18
+ from src.processing.pdf_extractor import PDFExtractor
19
+ from config.settings import RAW_DIR, PROCESSED_DIR, TOTAL_PAPERS_TARGET
20
+
21
+
22
+ setup_logger()
23
+ logger = get_logger(__name__)
24
+
25
+
26
+ def load_all_raw_papers() -> list[dict]:
27
+ papers = []
28
+
29
+ for f in RAW_DIR.glob("*.json"):
30
+ if f.name == "paper_index.json":
31
+ continue
32
+ with open(f, encoding = 'utf-8') as fp:
33
+ papers.append(json.load(fp))
34
+ return papers
35
+
36
+
37
+
38
+ def print_section(title: str):
39
+ logger.info("=" * 60)
40
+ logger.info(title)
41
+ logger.info("=" * 60)
42
+
43
+
44
+
45
+ def main():
46
+ print_section("RESEARCHPILOT β€” FULL PIPELINE")
47
+
48
+ # -------- PHASE 1: Fetch Metadata --------
49
+ print_section("PHASE 1: Fetching ArXiv Metadata")
50
+ fetcher = ArXivFetcher()
51
+ new_papers = fetcher.fetch_papers(max_papers = TOTAL_PAPERS_TARGET)
52
+ logger.info(f"New papers fetched: {len(new_papers)}")
53
+
54
+ # -------- PHASE 2: Download PDFs --------
55
+ print_section("PHASE 2: Downloading PDFs")
56
+ all_papers = load_all_raw_papers()
57
+ downloader = PDFDownloader()
58
+ dl_stats = downloader.download_all(all_papers)
59
+ logger.info(f"Download stats: {dl_stats}")
60
+
61
+ # -------- PHASE 3: Extract Text --------
62
+ print_section("PHASE 3: Extracting and Cleaning Text")
63
+ extractor = PDFExtractor()
64
+ proc_stats = extractor.process_all()
65
+ logger.info(f"Processing stats: {proc_stats}")
66
+
67
+ # -------- SUMMARY --------
68
+ processed_files = list(PROCESSED_DIR.glob("*.json"))
69
+ print_section("PIPELINE COMPLETE")
70
+ logger.info(f"Papers in processed/: {len(processed_files)}")
71
+ logger.info("Ready for Phase 5: Chunking")
72
+
73
+
74
+
75
+ if __name__ == "__main__":
76
+ main()
src/__init__.py ADDED
File without changes
src/ingestion/__init__.py ADDED
File without changes
src/ingestion/arxiv_fetcher.py ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ArXiv API client for fetching ML paper metadata.
3
+
4
+ RESPONSIBILITY: This module has ONE job - fetch paper metadata from ArXiv
5
+ and return validated, structured data. It does NOT download PDFs (that's
6
+ the pdf extractor's job). Single Responsibility Principle.
7
+
8
+ Why ArXiv LIBRARY:
9
+ The arxiv Python library wraps the raw XML API response into clean
10
+ Python objects. We could parse XML ourselves with BeautifulSoup,
11
+ but using the official library means we benefit from their bug fixes
12
+ and API changes without rewriting our code.
13
+ """
14
+
15
+ import time
16
+ import json
17
+ from pathlib import Path
18
+ from datetime import datetime
19
+ from typing import Optional
20
+
21
+
22
+ import arxiv
23
+ from pydantic import BaseModel, field_validator
24
+
25
+ from src.utils.logger import get_logger
26
+ from config.settings import (
27
+ RAW_DIR,
28
+ ARXIV_CATEGORIES,
29
+ MAX_PAPERS_PER_FETCH,
30
+ TOTAL_PAPERS_TARGET,
31
+ ARXIV_API_DELAY_SECONDS
32
+ )
33
+
34
+
35
+ # Get a named logger for this module
36
+ # Every module gets its own named logger - makes debugging trivial
37
+ logger = get_logger(__name__)
38
+
39
+
40
+ # -------------------------------------------
41
+ # DATA MODEL
42
+ # -------------------------------------------
43
+
44
+ class PaperMetadata(BaseModel):
45
+ """
46
+ Pydantic model defining the exact schema for a paper's metadata.
47
+
48
+ WHY PYDANTIC:
49
+ Pydantic enforces data types at runtime. If ArXiv returns a date
50
+ in an unexpected format, Pydantic raises a clear error immediately
51
+ instead of silently storing bad data that breaks things 3 steps later.
52
+
53
+ This is called "fail fast" - catch bad data as early as possible.
54
+ """
55
+ paper_id: str
56
+ title: str
57
+ abstract: str
58
+ authors: list[str]
59
+ categories: list[str]
60
+ primary_categories: str
61
+ published_date: str # ISO Format: "2023-01-17"
62
+ updated_date: str
63
+ arxiv_url: str
64
+ pdf_url: str
65
+
66
+ # Pipeline stage flags - track what processing has been done
67
+ pdf_downloaded: bool = False
68
+ text_extracted: bool = False
69
+ chunked: bool = False
70
+ embedded: bool = False
71
+
72
+
73
+ @field_validator("title", "abstract")
74
+ @classmethod
75
+ def clean_whitespace(cls, value: str) -> str:
76
+ """
77
+ Strip excess whitespace from text fields
78
+ ArXiv abstracts often contain \n and multiple spaces
79
+ """
80
+ return " ".join(value.split())
81
+
82
+
83
+ @field_validator("paper_id")
84
+ @classmethod
85
+ def extract_short_id(cls, value: str) -> str:
86
+ """
87
+ ArXiv returns IDs like 'http://arxiv.org/abs/2301.07041v1'
88
+ We want just '2301.07041'
89
+ """
90
+
91
+ # Split on "/" and take the last part, then remove version suffix
92
+ short_id = value.split("/")[-1]
93
+
94
+ if "v" in short_id:
95
+ short_id = short_id.split("v")[0]
96
+
97
+ return short_id
98
+
99
+
100
+
101
+
102
+ # -------------------------------------------
103
+ # FETCHER CLASS
104
+ # -------------------------------------------
105
+
106
+ class ArXivFetcher:
107
+ """
108
+ Fetches and persists paper metadata from the ArXiv API.
109
+
110
+ DESIGN PATTERN: This class is stateless β€” it doesn't store any
111
+ papers in memory. It fetches, validates, and immediately saves
112
+ to disk. This means if the process crashes at paper #347,
113
+ papers 1-346 are already saved and we can resume.
114
+ """
115
+
116
+ def __init__(self):
117
+ # arxiv.Client lets us configure rate limiting behavior
118
+ self.client = arxiv.Client(
119
+ page_size = MAX_PAPERS_PER_FETCH,
120
+ # Delay between API page requests (ArXiv policy: >= 3 Seconds)
121
+ delay_seconds = ARXIV_API_DELAY_SECONDS,
122
+ num_retries = 3 # Retry failed requests automatically
123
+ )
124
+
125
+
126
+ # File to track which paper IDs we've already downloaded
127
+ # This enables idempotent runs - safe to run pipeline multiple times
128
+ self.index_file = RAW_DIR / "paper_index.json"
129
+ self.existing_ids = self._load_existing_ids()
130
+
131
+
132
+ logger.info(
133
+ f"ArXivFetcher initialized. "
134
+ f"Already have {len(self.existing_ids)} papers indexed."
135
+ )
136
+
137
+
138
+ def _load_existing_ids(self) -> set[str]:
139
+ """
140
+ Load set of already-fetched paper IDs from disk
141
+
142
+ WHY A SET: Checking 'if paper_id in existing_ids' is O(1) with a set
143
+ versus O(n) with a list. At 10,000 papers, this matters.
144
+ """
145
+ if self.index_file.exists():
146
+ with open(self.index_file, "r") as f:
147
+ data = json.load(f)
148
+ return set(data.get("paper_ids", []))
149
+
150
+ return set()
151
+
152
+
153
+
154
+ def _save_paper_metadata(self, paper: PaperMetadata) -> Path:
155
+ """
156
+ Save a single paper's metadata as JSON to disk.
157
+
158
+ Each paper gets its own JSON file named by its ID.
159
+ WHY NOT A DATABASE: For a pipeline this size, flat JSON files
160
+ are simpler, portable, and Git-friendly. We add a database
161
+ later when we need querying capabilities.
162
+ """
163
+
164
+ # e.g., data/raw/2301.07041.json
165
+ file_path = RAW_DIR / f"{paper.paper_id}.json"
166
+
167
+ with open(file_path, "w", encoding = 'utf-8') as f:
168
+ # model.dump() converts Pydantic model to dict
169
+ # indent = 2 makes the JSON human-readable
170
+ json.dump(paper.model_dump(), f, indent = 2, ensure_ascii = False)
171
+
172
+
173
+ return file_path
174
+
175
+
176
+ def _update_index(self, paper_id: str):
177
+ """
178
+ Add paper_id to our index file and memory set.
179
+ Called after every successful save
180
+ """
181
+ self.existing_ids.add(paper_id)
182
+
183
+ with open(self.index_file, "w") as f:
184
+ json.dump(
185
+ {
186
+ "paper_ids": list(self.existing_ids),
187
+ "last_updated": datetime.now().isoformat(),
188
+ "total_count": len(self.existing_ids)
189
+ },
190
+ f, indent = 2
191
+ )
192
+
193
+
194
+
195
+ def _parse_arxiv_result(self, result: arxiv.Result) -> Optional[PaperMetadata]:
196
+ """
197
+ Convert a raw arxiv.Result object into our PaperMetadata model.
198
+
199
+ WHY THIS WRAPPER EXISTS:
200
+ The arxiv library's Result object has its own structure that
201
+ may change across library versions. By converting to our own
202
+ PaperMetadata model here, the rest of our codebase never
203
+ depends on the arxiv library directly. If arxiv changes its
204
+ API tomorrow, we only fix this one function.
205
+
206
+ This is called the ADAPTER PATTERN.
207
+ """
208
+ try:
209
+ metadata = PaperMetadata(
210
+ paper_id = result.entry_id,
211
+ title = result.title,
212
+ abstract = result.summary,
213
+ authors = [str(a) for a in result.authors],
214
+ categories = result.categories,
215
+ primary_categories = result.primary_category,
216
+ published_date = result.published.strftime("%Y-%m-%d"),
217
+ updated_date = result.updated.strftime("%Y-%m-%d"),
218
+ arxiv_url = result.entry_id,
219
+ pdf_url = result.pdf_url,
220
+ )
221
+
222
+ return metadata
223
+
224
+ except Exception as e:
225
+ # Log warning but don't crash - one bad paper shouldn't
226
+ # stop the entire pipeline
227
+ logger.warning(f"Failed to parse paper: {result.entry_id}: {e}")
228
+ return None
229
+
230
+
231
+
232
+ def fetch_papers(
233
+ self,
234
+ categories: list[str] = None,
235
+ max_papers: int = None,
236
+ date_filter_year: Optional[int] = None
237
+ ) -> list[PaperMetadata]:
238
+ """
239
+ Main method: fetch papers from ArXiv for given categories.
240
+
241
+ Args:
242
+ categories: ArXiv category codes e.g. ["cs.LG", "cs.AI"]
243
+ max_papers: Maximum papers to fetch
244
+ date_filter_year: Only fetch papers from this year onwards
245
+
246
+ Returns:
247
+ List of validated PaperMetaData objects
248
+
249
+ HOW THE QUERY WORKS:
250
+ ArXiv search syntax uses boolean operators.
251
+ 'cat:cs.LG' OR 'cat:cs.AI' means "Papers in cs.LG OR cs.AI category"
252
+ We sort by submission date (newest first) to get fresh papers.
253
+ """
254
+
255
+ if categories is None:
256
+ categories = ARXIV_CATEGORIES
257
+ if max_papers is None:
258
+ max_papers = TOTAL_PAPERS_TARGET
259
+
260
+ # Build search query: "cat:cs.LG OR cat:cs.AI"
261
+ category_query = " OR ".join([f"cat:{cat}" for cat in categories])
262
+ logger.info(f"Search query: '{category_query}'")
263
+ logger.info(f"Target: '{max_papers} papers'")
264
+
265
+
266
+ # Configure ArXiv search
267
+ search = arxiv.Search(
268
+ query = category_query,
269
+ max_results = max_papers * 2, # Fetch extra to account for skips
270
+ sort_by = arxiv.SortCriterion.SubmittedDate,
271
+ sort_order = arxiv.SortOrder.Descending,
272
+ )
273
+
274
+ fetched_papers = []
275
+ skipped_duplicate = 0
276
+ skipped_invalid = 0
277
+
278
+ logger.info("Starting ArXiv fetch...")
279
+
280
+ # self.client.results() is a GENERATOR
281
+ # WHY GENERATOR: It fetches pages lazily - doesn't load all 500
282
+ # papers into memory at once. Memory efficient.
283
+ for result in self.client.results(search):
284
+
285
+ # Stop if we've reached our target
286
+ if len(fetched_papers) >= max_papers:
287
+ break
288
+
289
+ # Skip papers we already have
290
+ raw_id = result.entry_id.split("/")[-1].split("v")[0]
291
+ if raw_id in self.existing_ids:
292
+ skipped_duplicate += 1
293
+ continue
294
+
295
+ # Apply year filter if specified
296
+ if date_filter_year and result.published.year < date_filter_year:
297
+ continue
298
+
299
+ # Parse and validate
300
+ paper = self._parse_arxiv_result(result)
301
+ if paper is None:
302
+ skipped_invalid += 1
303
+ continue
304
+
305
+ # Save to disk immediately
306
+ self._save_paper_metadata(paper)
307
+ self._update_index(paper.paper_id)
308
+ fetched_papers.append(paper)
309
+
310
+ # Progress logging every 10 papers
311
+ if len(fetched_papers) % 10 == 0:
312
+ logger.info(
313
+ f"Progress: {len(fetched_papers)}/{max_papers} papers fetched"
314
+ )
315
+
316
+
317
+ logger.info(
318
+ f"Fetch complete."
319
+ f"Fetched: {len(fetched_papers)} | "
320
+ f"Skipped (duplicate): {skipped_duplicate} | "
321
+ f"Skipped (invalid): {skipped_invalid}"
322
+ )
323
+
324
+ return fetched_papers
src/ingestion/pdf_downloader.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Downloads PDF files for papers that have been fetched from ArXiv.
3
+
4
+ SEPARATION OF CONCERNS:
5
+ arxiv_fetcher.py β†’ Gets metadata (fast, no large files)
6
+ pdf_downloader.py β†’ Downloads PDFs (slow, large files)
7
+
8
+ This separation means if PDF download fails, metadata is safe.
9
+ We can retry ONLY the failed PDFs without re-fetching metadata.
10
+ """
11
+
12
+ import time
13
+ import json
14
+ import requests
15
+ from pathlib import Path
16
+
17
+ from tqdm import tqdm # Progress bar
18
+
19
+ from src.utils.logger import get_logger
20
+ from config.settings import (
21
+ RAW_DIR,
22
+ PDF_DOWNLOAD_TIMEOUT,
23
+ MAX_DOWNLOAD_RETRIES,
24
+ ARXIV_API_DELAY_SECONDS
25
+ )
26
+
27
+ logger = get_logger(__name__)
28
+
29
+
30
+
31
+ class PDFDownloader:
32
+ """
33
+ Download PDFs from ArXiv with retry logic and progress tracking
34
+ """
35
+
36
+ def __init__(self):
37
+ # Configure requests session
38
+ # WHY SESSION: Reuses TCP Connection across requests
39
+ # (faster than creating new connection per download)
40
+ self.session = requests.Session()
41
+ self.session.headers.update(
42
+ {
43
+ # Identify ourselves to ArXiv - polite and avoids blocks
44
+ "User-Agent": "ResearchPilot/1.0 (educational research project)"
45
+ }
46
+ )
47
+
48
+
49
+ # Directory for downloaded PDFs
50
+ self.pdf_dir = RAW_DIR / "pdfs"
51
+ self.pdf_dir.mkdir(exist_ok = True)
52
+
53
+
54
+ def download_pdf(self, paper_id: str, pdf_url: str) -> bool:
55
+ """
56
+ Download a single PDF with retry logic.
57
+
58
+ Args:
59
+ paper_id: ArXiv paper ID (used for filename)
60
+ pdf_url: Direct URL to the PDF
61
+
62
+ Returns:
63
+ True if downloaded successfully, False otherwise
64
+
65
+ RETRY PATTERN (Exponential Backoff):
66
+ Attempt 1: fail β†’ wait 2 seconds
67
+ Attempt 2: fail β†’ wait 4 seconds
68
+ Attempt 3: fail β†’ wait 8 seconds
69
+ β†’ give up, log error, continue to next paper
70
+
71
+ WHY EXPONENTIAL BACKOFF:
72
+ If a server is overloaded, hammering it with immediate retries
73
+ makes things worse. Waiting longer between retries gives the
74
+ server time to recover. This is standard practice for all
75
+ production systems that call external services.
76
+ """
77
+ output_path = self.pdf_dir / f"{paper_id}.pdf"
78
+
79
+
80
+ # Skip if already downloaded (idempotent)
81
+ if output_path.exists() and output_path.stat().st_size > 1000:
82
+ logger.debug(f"PDF already exists: {paper_id}")
83
+ return True
84
+
85
+ for attempt in range(1, MAX_DOWNLOAD_RETRIES + 1):
86
+ try:
87
+ logger.debug(f"Downloading {paper_id} (attempt {attempt})")
88
+
89
+ # stream = True means we download in chunks, not all at once
90
+ # This prevents running out of memory on large PDFs
91
+ response = self.session.get(
92
+ pdf_url,
93
+ timeout = PDF_DOWNLOAD_TIMEOUT,
94
+ stream = True
95
+ )
96
+
97
+ # Raise exception for 4xx or 5xx status codes
98
+ response.raise_for_status()
99
+
100
+ # Write PDF to disk in chunks of 8KB
101
+ with open(output_path, "wb") as f:
102
+ for chunk in response.iter_content(chunk_size = 8192):
103
+ if chunk: # Filter out keep-alive empty chunks
104
+ f.write(chunk)
105
+
106
+ # Verify file is not empty or suspiciously small
107
+ file_size = output_path.stat().st_size
108
+ if file_size < 1000:
109
+ logger.warning(f"Suspiciously small PDF: {paper_id} ({file_size} bytes)")
110
+ output_path.unlink() # Delete Bad File
111
+ return False
112
+
113
+ logger.debug(f"Downloaded {paper_id}: {file_size / 1024:.1f} KB")
114
+ return True
115
+
116
+
117
+ except requests.exceptions.RequestException as e:
118
+ logger.warning(f"Download attempt {attempt} failed for {paper_id}: {e}")
119
+
120
+ if attempt < MAX_DOWNLOAD_RETRIES:
121
+ # Exponential backoff: 2^attempt seconds
122
+ wait_time = 2 ** attempt
123
+ logger.debug(f"Waiting {wait_time}s before retry...")
124
+ time.sleep(wait_time)
125
+ else:
126
+ logger.error(f"All {MAX_DOWNLOAD_RETRIES} attemps failed for {paper_id}")
127
+ return False
128
+
129
+
130
+ return False
131
+
132
+
133
+
134
+ def download_all(self, papers: list[str]) -> dict:
135
+ """
136
+ Download PDFs for a list of papers with progress tracking.
137
+
138
+ Args:
139
+ papers: List of paper metadata dicts (loaded from JSON files)
140
+
141
+ Returns:
142
+ Summary statistics dict
143
+ """
144
+
145
+ successful = 0
146
+ failed = 0
147
+ skipped = 0
148
+
149
+
150
+ # tqdm wraps our list to show a progress bar
151
+ # desc= sets the label on the progress bar
152
+ for paper in tqdm(papers, desc = "Downloading PDFs"):
153
+ paper_id = paper['paper_id']
154
+ pdf_url = paper['pdf_url']
155
+
156
+
157
+ # Skip already downloaded papers
158
+ if paper.get("pdf_downloaded"):
159
+ skipped += 1
160
+ continue
161
+
162
+ # Download with delay to respect rate limits
163
+ success = self.download_pdf(paper_id, pdf_url)
164
+
165
+
166
+ if success:
167
+ successful += 1
168
+ # Update the paper's JSON file to mark pdf_downloaded = True
169
+ self._mark_downloaded(paper_id)
170
+ time.sleep(ARXIV_API_DELAY_SECONDS)
171
+ else:
172
+ failed += 1
173
+
174
+
175
+
176
+ summary = {
177
+ "successful": successful,
178
+ "failed": failed,
179
+ "skipped": skipped,
180
+ "total": len(papers)
181
+ }
182
+
183
+ logger.info(f"PDF download complete: {summary}")
184
+ return summary
185
+
186
+
187
+ def _mark_downloaded(self, paper_id: str):
188
+ """
189
+ Update the paper's JSON metadata to mark pdf_downloaded = True.
190
+ This updates our pipeline state flag.
191
+ """
192
+ json_path = RAW_DIR / f"{paper_id}.json"
193
+
194
+ if not json_path.exists():
195
+ return
196
+
197
+ with open(json_path, 'r', encoding = 'utf-8') as f:
198
+ data = json.load(f)
199
+
200
+
201
+ data["pdf_downloaded"] = True
202
+
203
+ with open(json_path, "w") as f:
204
+ json.dump(data, f, indent = 2)
src/processing/__init__.py ADDED
File without changes
src/processing/pdf_extractor.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Extracts and cleans text from downloaded PDF files.
3
+
4
+ WHY PYMUPDF (fitz) over alternatives:
5
+ Library | Speed | Quality | Handles columns?
6
+ ---------------|--------|------------|-----------------
7
+ PyMuPDF | Fast | β˜…β˜…β˜…β˜…β˜… | Yes (sort=True)
8
+ pdfplumber | Medium | β˜…β˜…β˜…β˜…β˜† | Partial
9
+ pypdf2 | Medium | β˜…β˜…β˜…β˜†β˜† | No
10
+ pdfminer | Slow | β˜…β˜…β˜…β˜…β˜† | Partial
11
+
12
+ PyMuPDF's sort=True parameter reads text in natural reading order
13
+ (top-to-bottom, left-to-right) which is critical for multi-column
14
+ academic papers.
15
+ """
16
+
17
+ import json
18
+ from pathlib import Path
19
+
20
+ import fitz # PyMuPDF - imported as 'fitz' (legacy name from founder)
21
+ from tqdm import tqdm
22
+
23
+ from src.processing.text_cleaner import clean_text
24
+ from src.utils.logger import get_logger
25
+ from config.settings import (
26
+ RAW_DIR,
27
+ PROCESSED_DIR,
28
+ MIN_TEXT_LENGTH,
29
+ MAX_TEXT_LENGTH
30
+ )
31
+
32
+ logger = get_logger(__name__)
33
+
34
+
35
+
36
+ class PDFExtractor:
37
+ """
38
+ Extracts clean text from PDF files and saves to processed directory.
39
+
40
+ Output structure for each paper:
41
+ data/processed/2301.07041.json ← cleaned text + original metadata
42
+ """
43
+
44
+ def __init__(self):
45
+ self.pdf_dir = RAW_DIR / 'pdfs'
46
+
47
+
48
+ def extract_text_from_pdf(self, pdf_path: Path) -> str:
49
+ """
50
+ Extract raw text from a PDF using PyMuPDF.
51
+
52
+ Args:
53
+ pdf_path: Path to the PDF file
54
+
55
+ Returns:
56
+ Raw extracted text string (not yet cleaned)
57
+
58
+ HOW PYMUPDF READS PDFS:
59
+ PDF is a page-based format. We iterate each page,
60
+ extract text with sort=True (reading order), then
61
+ join all pages. The 'text' flag tells PyMuPDF to
62
+ extract plain text (vs HTML or dict formats).
63
+ """
64
+ try:
65
+ # Open PDF - fitz.open() handles file reading
66
+ doc = fitz.open(str(pdf_path))
67
+
68
+
69
+ pages_text = []
70
+
71
+ for page_num, page in enumerate(doc):
72
+ # get_text("text", sort = True)
73
+ # "text" -> plain text extraction mode
74
+ # sort = True -> respect reading order (critical for columns)
75
+ page_text = page.get_text("text", sort = True)
76
+
77
+ if page_text.strip():
78
+ pages_text.append(page_text)
79
+
80
+ # Close the document to free memory
81
+ doc.close()
82
+
83
+
84
+ # Join all pages with double newline (paragraph separator)
85
+ full_text = '\n\n'.join(pages_text)
86
+ return full_text
87
+
88
+
89
+ except Exception as e:
90
+ logger.error(f"Failed to extract text from {pdf_path.name}: {e}")
91
+ return ""
92
+
93
+
94
+
95
+ def validate_extracted_text(self, text: str, paper_id: str) -> tuple[bool, str]:
96
+ """
97
+ Validate that extracted text is usable.
98
+
99
+ Returns:
100
+ (is_valid: bool, reason: str)
101
+
102
+ VALIDATION RULES:
103
+ 1. Not empty
104
+ 2. Long enough to be a real paper (not a 1-page erratum)
105
+ 3. Not too long (might indicate extraction corruption)
106
+ 4. Contains alphabetic characters (not just symbols/numbers)
107
+ 5. Is primarily English (our embedding model is English-optimized)
108
+ """
109
+ if not text:
110
+ return False, "Empty text"
111
+
112
+ if len(text) < MIN_TEXT_LENGTH:
113
+ return False, f"Too short: {len(text)} chars < {MIN_TEXT_LENGTH}"
114
+
115
+ if len(text) > MAX_TEXT_LENGTH:
116
+ return False, f"Too long: {len(text)} chars > {MAX_TEXT_LENGTH}"
117
+
118
+
119
+ # Check that text contains substantial alphabetic content
120
+ # (not just numbers, equations, or garbled encoding)
121
+ alpha_chars = sum(1 for c in text if c.isalpha())
122
+ alpha_ratio = alpha_chars / len(text)
123
+
124
+
125
+ if alpha_ratio < 0.4:
126
+ return False, f"Low alphanumeric ration: {alpha_ratio:.2f} (likely encoding issue)"
127
+
128
+ return True, "Valid"
129
+
130
+
131
+
132
+ def process_paper(self, paper_metadata: dict) -> bool:
133
+ """
134
+ Full pipeline for one paper: extract -> clean -> validate -> save.
135
+
136
+ Args:
137
+ paper_metadata: dict loaded from data/raw/{paper_id}.json
138
+
139
+ Returns:
140
+ True if processed successfully, False otherwise
141
+ """
142
+ paper_id = paper_metadata['paper_id']
143
+
144
+ # Skip if already processed (idempotent)
145
+ output_path = PROCESSED_DIR / f'{paper_id}.json'
146
+ if output_path.exists():
147
+ logger.debug(f"Already processed: {paper_id}")
148
+ return True
149
+
150
+ # Check PDF exists
151
+ pdf_path = self.pdf_dir / f"{paper_id}.pdf"
152
+ if not pdf_path.exists():
153
+ logger.warning(f"PDF not found for {paper_id}, using abstract only")
154
+ # FALLBACK: Use abstract as the text source
155
+ # Abstract is short but better than nothing
156
+ # This handles cases where PDF download failed
157
+ text = paper_metadata.get("abstract", "")
158
+ if not text:
159
+ return False
160
+
161
+ else:
162
+ # Extract from PDF
163
+ raw_text = self.extract_text_from_pdf(pdf_path)
164
+
165
+
166
+ # Clean the text
167
+ text = clean_text(raw_text)
168
+
169
+
170
+ # Validate
171
+ is_valid, reason = self.validate_extracted_text(text, paper_id)
172
+ if not is_valid:
173
+ logger.warning(f"Validation failed for {paper_id}: {reason}")
174
+ return False
175
+
176
+ # Build processed document
177
+ processed_doc = {
178
+ # Copy all original metadata
179
+ **paper_metadata,
180
+
181
+ # Add processed text
182
+ "full_text": text,
183
+ "text_length": len(text),
184
+ "word_count": len(text.split()),
185
+
186
+ # Update pipeline state
187
+ "text_extracted": True,
188
+ "pdf_downloaded": paper_metadata.get("pdf_downloaded", False),
189
+ }
190
+
191
+
192
+ # Save to processed directory
193
+ with open(output_path, "w", encoding = 'utf-8') as f:
194
+ json.dump(processed_doc, f, indent = 2, ensure_ascii = False)
195
+
196
+ logger.debug(
197
+ f"Processed {paper_id}: "
198
+ f"{processed_doc['word_count']} words, "
199
+ f"{len(text)} chars"
200
+ )
201
+
202
+ return True
203
+
204
+
205
+
206
+ def process_all(self) -> dict:
207
+ """
208
+ Process all papers that have been fetched.
209
+
210
+ Loads metadata from data/raw/, extracts text,
211
+ saves results to data/processed/.
212
+ """
213
+ # Load all paper metadata from raw directory
214
+ raw_files = [
215
+ f for f in RAW_DIR.glob("*.json")
216
+ if f.name != "paper_index.json"
217
+ ]
218
+
219
+
220
+ logger.info(f"Found {len(raw_files)} papers to process")
221
+
222
+ successful = 0
223
+ failed = 0
224
+ skipped = 0
225
+
226
+
227
+
228
+ for raw_file in tqdm(raw_files, desc = "Extracting text"):
229
+ with open(raw_file, 'r', encoding = 'utf-8') as f:
230
+ metadata = json.load(f)
231
+
232
+ # Skip if already processed
233
+ output_path = PROCESSED_DIR / f"{metadata['paper_id']}.json"
234
+ if output_path.exists():
235
+ skipped += 1
236
+ continue
237
+
238
+ success = self.process_paper(metadata)
239
+ if success:
240
+ successful += 1
241
+ else:
242
+ failed += 1
243
+
244
+
245
+ stats = {
246
+ "total": len(raw_files),
247
+ "successful": successful,
248
+ "failed": failed,
249
+ "skipped": skipped,
250
+ }
251
+
252
+ logger.info(f"Processing complete: {stats}")
253
+ return stats
src/processing/text_cleaner.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text normalization utilities for extracted PDF content.
3
+
4
+ These functions are PURE FUNCTIONS β€” they take a string,
5
+ return a string, have no side effects, and are independently
6
+ testable. This is the correct way to write data transformation
7
+ logic.
8
+ """
9
+
10
+ import re
11
+ import unicodedata
12
+ import ftfy
13
+
14
+ from src.utils.logger import get_logger
15
+
16
+ logger = get_logger(__name__)
17
+
18
+
19
def fix_hyphenated_linebreaks(text: str) -> str:
    """
    Rejoin words that PDF line-wrapping split with a trailing hyphen.

    Justified research-paper text hyphenates across lines:

        "This is a demon-
         stration of the problem"

    becomes:

        "This is a demonstration of the problem"

    The pattern requires a letter immediately before the hyphen and a
    letter at the start of the continuation (after the newline and any
    leading whitespace), so genuine in-word hyphens that do not sit at
    a line break ("state-of-the-art") are left untouched.
    """
    hyphen_break = re.compile(r'([a-zA-Z])-\n\s*([a-zA-Z])')
    # Splice the two captured letters back together, dropping the
    # hyphen, the newline, and any indentation on the next line.
    return hyphen_break.sub(lambda m: m.group(1) + m.group(2), text)
39
+
40
+
41
+
42
def remove_page_artifacts(text: str) -> str:
    """
    Remove common PDF page artifacts that pollute extracted text.

    Handles:
    - Form feed characters (0x0C) that mark page boundaries
    - Standalone page numbers (lines containing only digits)
    - Running headers/footers (arXiv watermarks, venue names, footnotes)

    Args:
        text: Raw extracted PDF text.

    Returns:
        Text with artifact lines dropped. Blank lines are preserved so a
        later whitespace-normalization pass can handle paragraph breaks.

    FIX: the artifact pattern list used to be rebuilt — and every regex
    re-scanned from its source string — inside the per-line loop. The
    patterns are now compiled once before the loop.
    """
    # Form feeds mark page breaks in extracted PDFs; turn them into newlines.
    text = text.replace('\x0c', '\n')

    # Header/footer patterns, compiled once (case-insensitive search,
    # matching anywhere in the stripped line).
    artifact_patterns = [
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'arXiv:\d{4}\.\d+',                    # arXiv:2301.07041v2
            r'^doi:\s*10\.',                        # DOI lines
            r'Preprint\.\s*Under review',           # "Preprint. Under review"
            r'Under review',                        # Review notice
            r'Proceedings of (ICML|NeurIPS|ICLR|CVPR|ACL|EMNLP)',
            r'(ICML|NeurIPS|ICLR|CVPR|ACL|EMNLP)\s+20\d{2}',  # "ICML 2023"
            r'Workshop on',                         # Workshop lines
            r'^\*+Equal contribution',              # Footnotes
            r'^\dDepartment of',                    # Affiliation footnotes
            r'^\d+University of',                   # University affiliations
            r'Correspondence to:',                  # Contact info
        )
    ]

    cleaned_lines = []
    for line in text.split('\n'):
        stripped = line.strip()

        # Keep empty lines (spacing is normalized in a later pass).
        if not stripped:
            cleaned_lines.append('')
            continue

        # Skip standalone page numbers: lines that are ONLY digits,
        # e.g. "12", "247".
        if re.match(r'^\d{1,4}$', stripped):
            continue

        # Skip lines matching any known header/footer pattern.
        if any(p.search(stripped) for p in artifact_patterns):
            continue

        cleaned_lines.append(line)

    return "\n".join(cleaned_lines)
104
+
105
+
106
def normalize_whitespace(text: str) -> str:
    """
    Normalize all forms of whitespace to standard single spaces.

    PDFs produce messy whitespace: runs of spaces from column alignment,
    tabs, and non-breaking spaces (U+00A0).

    STRATEGY:
    1. Replace tabs and non-breaking spaces with regular spaces
    2. Collapse runs of spaces into one
    3. Strip each line individually
    4. Collapse 3+ consecutive newlines into exactly 2
       (preserving paragraph breaks without excessive gaps)

    FIX: lines are stripped BEFORE newlines are collapsed. Previously a
    whitespace-only line (e.g. "x\\n \\n \\nx") prevented the newline run
    from matching \\n{3,}, leaving excessive blank lines behind.

    Args:
        text: Text to normalize.

    Returns:
        Normalized text: single spaces, at most double newlines,
        no leading/trailing whitespace.
    """
    # Map exotic horizontal whitespace onto plain spaces.
    text = text.replace('\t', ' ')
    text = text.replace('\xa0', ' ')

    # Collapse multiple spaces into one.
    text = re.sub(r' +', ' ', text)

    # Strip each line individually so whitespace-only lines become truly
    # empty (must happen before newline collapsing — see FIX above).
    text = '\n'.join(line.strip() for line in text.split('\n'))

    # Collapse 3+ consecutive newlines into exactly 2.
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()
140
+
141
+
142
+
143
def fix_unicode(text: str) -> str:
    """
    Repair mojibake and other broken Unicode from PDF extraction.

    Typical problems in extracted PDF text: UTF-8 bytes decoded as
    Latin-1 (curly quotes turned into multi-character junk), doubly
    encoded accented letters, and single-glyph ligatures (e.g. "fi")
    in place of their plain-letter equivalents.

    ftfy ("Fixes Text For You") detects and repairs all of these
    automatically; this is a thin wrapper around it.
    """
    repaired = ftfy.fix_text(text)
    return repaired
156
+
157
+
158
+
159
def remove_reference_section(text: str) -> str:
    """
    Strip the bibliography/references section from a paper's text.

    WHY: references are dense lists of author names, venues, and years
    that would pollute the vector index — a query about "attention
    mechanisms" should not retrieve a chunk that is just citations like
    "Vaswani, A., Shazeer, N., ... (2017). Attention is all you need."

    APPROACH: find the LAST occurrence of a references-style header
    (some papers mention "References" earlier in the body) and cut
    there — but only when it sits in the final 40% of the document,
    so an early mention never truncates real content.

    FIXES: renamed the misspelled local `referece_patterns`; the
    match-position check is now `>= 0` so a header at offset 0 is
    handled through the normal threshold logic instead of being
    silently ignored by the old `> 0` test.

    Args:
        text: Paper text (ideally already artifact-cleaned).

    Returns:
        Text with everything after the references header removed, or
        the input unchanged if no plausible header is found.
    """
    # Header patterns: explicit upper/lower variants plus numbered
    # section forms ("7. References", "7 References").
    reference_patterns = [
        r'\n\s*References\s*\n',
        r'\n\s*REFERENCES\s*\n',
        r'\n\s*Bibliography\s*\n',
        r'\n\s*BIBLIOGRAPHY\s*\n',
        r'\n\s*\d+\.\s*References\s*\n',
        r'\n\s*\d+\s+References\s*\n',
        # Header at end of a line (effective because of re.MULTILINE).
        r'\nReferences$',
    ]

    # Position of the last header match across all patterns; -1 = none.
    last_match_pos = -1
    for pattern in reference_patterns:
        matches = list(re.finditer(pattern, text, re.MULTILINE))
        if matches:
            last_match_pos = max(last_match_pos, matches[-1].start())

    if last_match_pos >= 0:
        # Only cut when the header lies in the last 40% of the document;
        # anything earlier is unlikely to be the actual bibliography.
        cutoff_threshold = len(text) * 0.60
        if last_match_pos > cutoff_threshold:
            text = text[:last_match_pos]
            logger.debug('References section removed')
        else:
            logger.debug(
                f"Reference found at {last_match_pos/len(text):.0%} "
                f"- too early to be bibliography, keeping"
            )

    return text
215
+
216
+
217
+
218
+
219
def remove_short_lines(text: str, min_length: int = 3) -> str:
    """
    Drop lines too short to carry meaningful content.

    Very short lines in extracted PDFs are usually stray characters
    from column separators, figure/table labels ("Fig.", "Table 1"),
    or single-letter section markers.

    Empty lines are kept so paragraph structure survives; any other
    line must be at least *min_length* characters (after stripping)
    to be retained.
    """
    kept = []
    for line in text.split('\n'):
        content = line.strip()
        # Non-empty but shorter than the threshold -> discard.
        if content and len(content) < min_length:
            continue
        kept.append(line)
    return '\n'.join(kept)
237
+
238
+
239
def clean_text(text: str) -> str:
    """
    Master cleaning function — runs every transformation in order.

    ORDER MATTERS:
    1. Fix encoding first (so later regexes see clean characters)
    2. Fix hyphenation (before whitespace normalization)
    3. Remove page artifacts (before whitespace normalization)
    4. Remove references (on mostly clean text)
    5. Remove short lines
    6. Normalize whitespace LAST (cleans up after all other steps)

    Empty or whitespace-only input short-circuits to "".
    """
    if not text or not text.strip():
        return ""

    pipeline = (
        fix_unicode,
        fix_hyphenated_linebreaks,
        remove_page_artifacts,
        remove_reference_section,
        remove_short_lines,
        normalize_whitespace,
    )
    for step in pipeline:
        text = step(text)

    return text
src/utils/__init__.py ADDED
File without changes
src/utils/logger.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Centralized logging setup using loguru.
3
+
4
+ WHY LOGURU over Python's built-in logging:
5
+ - Built-in logging requires 10+ lines of boilerplate to set up properly
6
+ - Loguru does it in 3 lines
7
+ - Loguru auto-formats with colors, timestamps, and file/line info
8
+ - Loguru handles file rotation automatically
9
+ - Every serious Python project at companies like Stripe uses structured logging
10
+ """
11
+
12
+ import sys
13
+ from loguru import logger
14
+ from config.settings import LOG_LEVEL, LOG_FILE, LOG_ROTATION, LOG_RETENTION
15
+
16
+
17
def setup_logger():
    """
    Configure loguru with a console sink and a rotating file sink.

    Call once at application startup; returns the configured logger.
    Console output respects the configured LOG_LEVEL; the file sink
    always records DEBUG and up, rotating and expiring per settings.
    """
    # Drop loguru's default handler so we fully control formatting.
    logger.remove()

    # Console sink: colorized, human-readable.
    # e.g. 2024-01-15 10:23:45 | INFO | module:function:42 | Message
    console_format = (
        "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
        "<level>{level: <8}</level> | "
        "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> | "
        "<level>{message}</level>"
    )
    logger.add(
        sys.stdout,
        level=LOG_LEVEL,
        format=console_format,
        colorize=True
    )

    # File sink: everything at DEBUG and above, for debugging/monitoring.
    # LOG_ROTATION (e.g. "10 MB") starts a new file at that size;
    # LOG_RETENTION (e.g. "7 days") deletes files older than that.
    logger.add(
        LOG_FILE,
        level='DEBUG',
        format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {name}:{function}:{line} | {message}",
        rotation=LOG_ROTATION,
        retention=LOG_RETENTION,
        encoding='utf-8'
    )

    return logger
58
+
59
+
60
+
61
+ # Create the logger instance
62
+ # Other modules import this directly:
63
+ # from src.utils.logger import get_logger
64
+ # logger = get_logger(__name__)
65
def get_logger(name: str):
    """
    Return a logger bound to *name*.

    The bound name appears in log output, so every message can be
    traced back to the module that emitted it.

    Usage:
        from src.utils.logger import get_logger
        logger = get_logger(__name__)
        logger.info("Starting ingestion...")
    """
    bound = logger.bind(name=name)
    return bound
76
+
77
# NOTE: setup_logger() is deliberately NOT called on import — the
# application entry point must call it once at startup before logging.
# setup_logger()
test_fetch.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
"""Quick manual smoke test: fetch a handful of papers and print them."""
from src.utils.logger import setup_logger
from src.ingestion.arxiv_fetcher import ArXivFetcher

setup_logger()

fetched = ArXivFetcher().fetch_papers(max_papers=5)

for paper in fetched:
    print(f"{paper.paper_id}: {paper.title[:60]}...")
test_fetch_2.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # test_fetch.py
2
+ """
3
+ Smart test script that handles existing data correctly.
4
+ Tests three things:
5
+ 1. Can we load existing papers from disk?
6
+ 2. Can we fetch NEW papers (beyond what we have)?
7
+ 3. Is our data schema correct?
8
+ """
9
+
10
+ import json
11
+ from pathlib import Path
12
+ from src.utils.logger import setup_logger, get_logger
13
+ from src.ingestion.arxiv_fetcher import ArXivFetcher
14
+ from config.settings import RAW_DIR
15
+
16
+ setup_logger()
17
+ logger = get_logger(__name__)
18
+
19
def test_existing_data():
    """Report what is already on disk and return a small sample of it."""
    paper_files = [
        path for path in RAW_DIR.glob("*.json")
        if path.name != "paper_index.json"
    ]

    logger.info(f"Papers already on disk: {len(paper_files)}")

    if not paper_files:
        logger.warning("No papers found on disk. Run fetch first.")
        return []

    # Load and display only the first few papers as a spot check.
    sample = []
    for path in paper_files[:3]:
        with open(path) as fh:
            record = json.load(fh)
        sample.append(record)
        logger.info(f" -> {record['paper_id']}: {record['title'][:60]}...")
        logger.info(f" Category: {record['primary_categories']} | Date: {record['published_date']}")

    return sample
41
+
42
def test_schema_validation():
    """Verify the Pydantic schema's validators normalize messy input."""
    from src.ingestion.arxiv_fetcher import PaperMetadata

    logger.info("Testing schema validation...")

    # Deliberately messy-but-valid input: raw URL-style ID with a
    # version suffix, and a title with stray surrounding spaces.
    sample = dict(
        paper_id="http://arxiv.org/abs/2301.07041v2",
        title=" Test Paper With Extra Spaces ",
        abstract="This is a test abstract.",
        authors=["Author One", "Author Two"],
        categories=["cs.LG", "cs.AI"],
        primary_categories="cs.LG",
        published_date="2023-01-17",
        updated_date="2023-03-15",
        arxiv_url="https://arxiv.org/abs/2301.07041",
        pdf_url="https://arxiv.org/pdf/2301.07041",
    )

    try:
        paper = PaperMetadata(**sample)

        # The field validators should have stripped the URL/version from
        # the ID and the whitespace from the title.
        assert paper.paper_id == "2301.07041", f"ID cleanup failed: {paper.paper_id}"
        assert paper.title == "Test Paper With Extra Spaces", f"Whitespace cleanup failed: {paper.title}"

        logger.info(" -> Schema validation: PASSED")
        logger.info(f" paper_id cleaned: '2301.07041'")
        logger.info(f" title cleaned: '{paper.title}'")
        return True

    except Exception as e:
        logger.error(f" -> Schema validation FAILED: {e}")
        return False
75
+
76
def test_fresh_fetch(n: int = 3):
    """
    Fetch papers while temporarily ignoring the on-disk index, forcing
    fresh results so the fetch path can be exercised.
    """
    logger.info(f"Fetching {n} fresh papers from ArXiv...")

    fetcher = ArXivFetcher()

    # In-memory only: stash the known IDs and pretend we have none, so
    # deduplication doesn't filter everything out. Nothing on disk is
    # deleted or modified.
    saved_ids = fetcher.existing_ids.copy()
    fetcher.existing_ids = set()

    papers = fetcher.fetch_papers(max_papers=n)

    # Put the real ID set back.
    fetcher.existing_ids = saved_ids

    if not papers:
        logger.warning(" -> Fresh fetch returned 0 papers. Check network connection.")
    else:
        logger.info(f" -> Fresh fetch: PASSED. Got {len(papers)} papers")
        for p in papers:
            logger.info(f" {p.paper_id}: {p.title[:55]}...")

    return papers
103
+
104
def main():
    """Run the three ingestion checks in sequence and print a summary."""
    banner = "=" * 55
    logger.info(banner)
    logger.info("RESEARCHPILOT — INGESTION TEST SUITE")
    logger.info(banner)

    # Test 1: what is already on disk?
    logger.info("\n[TEST 1] Checking existing data on disk...")
    existing = test_existing_data()

    # Test 2: do the schema validators behave?
    logger.info("\n[TEST 2] Schema validation...")
    test_schema_validation()

    # Test 3: can we fetch fresh papers over the network?
    logger.info("\n[TEST 3] Fresh fetch from ArXiv...")
    fresh = test_fresh_fetch(n=3)

    logger.info("\n" + banner)
    logger.info("TEST SUITE COMPLETE")
    logger.info(f"Existing papers: {len(existing)} shown (may have more)")
    logger.info(f"Fresh papers fetched: {len(fresh)}")
    logger.info(banner)
126
+
127
# Script entry point: run the full test suite when executed directly.
if __name__ == "__main__":
    main()
test_processing.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.utils.logger import setup_logger, get_logger
2
+ from src.processing.text_cleaner import clean_text
3
+
4
+ setup_logger()
5
+ logger = get_logger(__name__)
6
+
7
+ # Simulate dirty PDF text
8
+ dirty_text = """
9
+ arXiv:2301.07041v2 [cs.LG] 17 Jan 2023
10
+
11
+ We propose a novel at-
12
+ tention mechanism that re-
13
+ duces computational com-
14
+ plexity significantly.
15
+
16
+ This method achieves state-of-the-art results.
17
+
18
+ 2
19
+
20
+ ICML 2023 Workshop
21
+
22
+ The key insight is that sparse attention patterns
23
+ can approximate full attention with minimal quality loss.
24
+
25
+ References
26
+
27
+ Vaswani, A., et al. (2017). Attention is all you need.
28
+ Brown, T., et al. (2020). Language models are few-shot learners.
29
+ """
30
+
31
+ cleaned = clean_text(dirty_text)
32
+
33
+ logger.info("─── DIRTY TEXT ───")
34
+ print(dirty_text[:300])
35
+ logger.info("─── CLEANED TEXT ───")
36
+ print(cleaned)
37
+ logger.info(f"Original length: {len(dirty_text)}")
38
+ logger.info(f"Cleaned length: {len(cleaned)}")