# phd_research_os_v2/layer0/parser.py · nkshirsa/phd-research-os-brain at main
#
# File size: 22,450 Bytes
#
# 08701e5

"""
Layer 0: Structural PDF Ingestion
===================================
Converts PDF bundles into section-aware, bbox-annotated, quality-scored regions.
Uses PyMuPDF (fitz) as the primary PDF parser with fallback to pdfplumber;
Marker availability is probed but Marker parsing is not wired in yet.
"""

import json
import os
import re
from pathlib import Path
from typing import Optional

from ..core.database import (
    get_db, init_db, gen_id, now_iso, to_fixed, from_fixed, hash_text
)


# Section detection patterns for scientific papers.
# Each entry is (regex, canonical_section_name). Patterns are tried in order and
# the first match wins, so the combined "results and discussion" heading must
# stay ahead of the plain "results" pattern.
SECTION_PATTERNS = [
    (r'(?i)^(abstract)\b', 'abstract'),
    (r'(?i)^(introduction)\b', 'introduction'),
    (r'(?i)^(background)\b', 'introduction'),
    (r'(?i)^(related\s+work)\b', 'related_work'),
    (r'(?i)^(materials?\s+and\s+methods?|methods?|experimental)\b', 'methods'),
    (r'(?i)^(results?\s+and\s+discussion)\b', 'results_discussion'),
    (r'(?i)^(results?)\b', 'results'),
    (r'(?i)^(discussion)\b', 'discussion'),
    (r'(?i)^(conclusions?|summary)\b', 'conclusion'),
    (r'(?i)^(acknowledge?ments?)\b', 'acknowledgments'),
    (r'(?i)^(references?|bibliography)\b', 'references'),
    (r'(?i)^(supplementary|supporting\s+information|appendix)\b', 'supplement'),
]


def detect_section(text: str) -> Optional[str]:
    """Detect which section a text block belongs to.

    Only the first line of ``text`` is inspected. Leading Markdown heading
    markers (``#`` .. ``######``), a leading bold marker (``**`` / ``__``),
    dotted numbering of any depth ("3.", "2.1", "2.1.3") and Roman numbering
    ("III.") are removed before the line is matched against
    ``SECTION_PATTERNS``.

    Args:
        text: A text block whose first line may be a section heading.

    Returns:
        The canonical section name of the first matching pattern, or ``None``
        when the input is empty or no pattern matches.
    """
    if not text:
        return None

    first_line = text.strip().split('\n', 1)[0].strip()

    # Markdown decoration: "## Methods", "**Methods**".
    first_line = re.sub(r'^#{1,6}\s*', '', first_line)
    first_line = re.sub(r'^(?:\*\*|__)\s*', '', first_line)

    # Dotted numbering of arbitrary depth: "3.", "2.1", "2.1.3", "2.1.".
    # A dot is required so that a bare leading number is left untouched.
    first_line = re.sub(r'^\d+\.(?:\d+\.?)*\s*', '', first_line)
    # Roman numbering such as "III." or "iv.".
    first_line = re.sub(r'^[IVXivx]+\.\s*', '', first_line)
    first_line = first_line.strip()

    for pattern, section in SECTION_PATTERNS:
        if re.match(pattern, first_line):
            return section
    return None


def classify_region_type(text: str) -> str:
    """Return a coarse region label for a text block.

    Checks run in a fixed priority order and the first hit wins:
    table, caption, equation, reference, header, and finally ``'body_text'``.
    """
    block = text.strip()

    # Tables: more than three pipe characters anywhere in the block.
    if block.count('|') > 3:
        return 'table'

    # Captions: the block opens with "Table <n>" or "Figure/Fig. <n>".
    if (re.match(r'(?i)^table\s+\d', block)
            or re.match(r'(?i)^(figure|fig\.?)\s+\d', block)):
        return 'caption'

    # Equations: inline LaTeX delimiters, common LaTeX macros, or a lone
    # equation number such as "(12)".
    has_latex_macro = any(macro in block for macro in ('\\frac', '\\sum'))
    if block.count('$') >= 2 or has_latex_macro:
        return 'equation'
    if re.match(r'^\s*\([\d]+\)\s*$', block):
        return 'equation'

    # References: a bare "Reference(s)" heading or an entry starting "[n]".
    if re.match(r'(?i)^references?\s*$', block) or re.match(r'^\[\d+\]', block):
        return 'reference'

    # Headers: short all-caps lines, or short lines without a trailing period
    # whose first line names a known section.
    size = len(block)
    if size < 100 and block.isupper():
        return 'header'
    if size < 80 and not block.endswith('.') and detect_section(block):
        return 'header'

    return 'body_text'


def extract_cross_references(text: str) -> list:
    """Extract in-text references to figures, tables, equations and citations.

    The keyword patterns are anchored on a word boundary so that words which
    merely contain a keyword ("config 3", "timetable 2", "freq 5") are not
    reported as references.

    Args:
        text: Text to scan. Empty or ``None`` input yields an empty list.

    Returns:
        A list of dicts with keys ``ref_text`` (the matched span),
        ``ref_type`` (``'figure'``, ``'table'``, ``'equation'`` or
        ``'citation'``), ``ref_number`` (the captured number/label),
        ``resolved_to`` (always ``None`` here) and ``verified`` (always
        ``False`` here). All figure matches come first, then tables,
        equations and citations, each group in order of appearance.
    """
    if not text:
        return []

    # (ref_type, pattern, index of the group holding the number/label)
    patterns = (
        ("figure", r'(?i)\b(figure|fig\.?)\s+(\d+[a-z]?)', 2),
        ("table", r'(?i)\b(table)\s+(\d+[a-z]?)', 2),
        ("equation", r'(?i)\b(eq\.?|equation)\s+\(?(\d+)\)?', 2),
        # Citation references [1], [2,3], [1-5]
        ("citation", r'\[(\d+(?:[,\-–]\s*\d+)*)\]', 1),
    )

    refs = []
    for ref_type, pattern, number_group in patterns:
        for m in re.finditer(pattern, text):
            refs.append({
                "ref_text": m.group(0),
                "ref_type": ref_type,
                "ref_number": m.group(number_group),
                "resolved_to": None,
                "verified": False,
            })
    return refs


def score_parse_quality(text: str, method: str) -> int:
    """Score parsing quality for a text region (fixed-point ×1000).

    Starts from a perfect 1000 and subtracts penalties for garbled
    characters, doubled spaces, very short fragments and long text without
    any period. Text produced by the ``'marker'`` method that contains ``#``
    earns a small bonus. The result is clamped to ``0..1000``.

    Args:
        text: The region text to score.
        method: Name of the parser that produced ``text``.

    Returns:
        An integer in ``0..1000``; ``0`` for empty or whitespace-only text.
    """
    if not text or not text.strip():
        return 0

    score = 1000  # Start at perfect
    length = len(text)

    # Penalize: garbled characters (common in bad OCR/parsing). U+FFFD is the
    # Unicode replacement character emitted when bytes could not be decoded,
    # so it is counted alongside the box/bullet glyphs.
    garbled_marks = frozenset('□■◊▪▫●○◆◇\ufffd')
    garbled_chars = sum(1 for c in text if ord(c) > 65535 or c in garbled_marks)
    if garbled_chars:
        # Heavy ratio penalty (10% garbled → -300) plus a per-character penalty.
        score -= int(garbled_chars / length * 3000)
        score -= garbled_chars * 50

    # Penalize: excessive whitespace (column merge artifact)
    score -= int(text.count('  ') / length * 200)

    # Penalize: very short fragments (likely parsing artifact)
    if len(text.strip()) < 20:
        score -= 200

    # Penalize: no sentence structure (no periods, likely garbled)
    if length > 100 and '.' not in text:
        score -= 300

    # Bonus: markdown structure preserved (Marker output)
    if method == 'marker' and '#' in text:
        score += 50

    return max(0, min(1000, score))


class StructuralParser:
    """
    Layer 0: Parse PDF bundles into annotated regions.
    
    Uses PyMuPDF (fitz) first and falls back to pdfplumber; Marker
    availability is probed but Marker parsing is not wired in yet.
    Every region gets: section tag, bbox, quality score, cross-references.
    """
    
    def __init__(self, db_path: str = None):
        """Resolve the database path, initialise the DB, reset parser probes.

        When ``db_path`` is falsy the ``RESEARCH_OS_DB`` environment variable
        is used, falling back to ``data/research_os_v2.db``.
        """
        if not db_path:
            db_path = os.environ.get("RESEARCH_OS_DB", "data/research_os_v2.db")
        self.db_path = db_path
        init_db(self.db_path)

        # Tri-state caches: None = not probed yet, True/False = probe result.
        self._marker_available = None
        self._fitz_available = None
        self._pdfplumber_available = None
    
    def _check_marker(self) -> bool:
        if self._marker_available is None:
            try:
                import marker
                self._marker_available = True
            except ImportError:
                self._marker_available = False
        return self._marker_available
    
    def _check_fitz(self) -> bool:
        if self._fitz_available is None:
            try:
                import fitz
                self._fitz_available = True
            except ImportError:
                self._fitz_available = False
        return self._fitz_available
    
    def _check_pdfplumber(self) -> bool:
        if self._pdfplumber_available is None:
            try:
                import pdfplumber
                self._pdfplumber_available = True
            except ImportError:
                self._pdfplumber_available = False
        return self._pdfplumber_available
    
    def ingest_document(self, file_path: str, doc_type: str = "main",
                        title: str = None, doi: str = None) -> dict:
        """
        Ingest a single document. Returns ingestion summary.
        """
        if not os.path.exists(file_path):
            return {"error": f"File not found: {file_path}", "doc_id": None}
        
        doc_id = gen_id("DOC")
        conn = get_db(self.db_path)
        
        # Create document record
        conn.execute("""
            INSERT INTO documents (doc_id, file_path, doc_type, title, doi,
                ingestion_status, schema_version, created_at)
            VALUES (?, ?, ?, ?, ?, 'processing', '2.0', ?)
        """, (doc_id, file_path, doc_type, title, doi, now_iso()))
        conn.commit()
        
        # Parse based on available tools
        regions = []
        parse_method = "unknown"
        
        if file_path.lower().endswith('.pdf'):
            if self._check_fitz():
                regions, parse_method = self._parse_with_fitz(file_path, doc_id)
            elif self._check_pdfplumber():
                regions, parse_method = self._parse_with_pdfplumber(file_path, doc_id)
            else:
                conn.execute(
                    "UPDATE documents SET ingestion_status = 'failed' WHERE doc_id = ?",
                    (doc_id,)
                )
                conn.commit()
                conn.close()
                return {"error": "No PDF parser available. Install PyMuPDF: pip install pymupdf", "doc_id": doc_id}
        elif file_path.lower().endswith(('.csv', '.xlsx', '.xls')):
            regions, parse_method = self._parse_tabular(file_path, doc_id)
        elif file_path.lower().endswith(('.md', '.txt')):
            regions, parse_method = self._parse_text(file_path, doc_id)
        else:
            regions, parse_method = self._parse_text(file_path, doc_id)
        
        # Store regions
        for region in regions:
            conn.execute("""
                INSERT INTO regions (region_id, doc_id, page, bbox, region_type,
                    section, subsection, content_text, content_markdown,
                    parse_method, parse_confidence, extraction_status,
                    quality_flags, cross_refs, schema_version, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, '2.0', ?)
            """, (
                region["region_id"], doc_id, region["page"],
                json.dumps(region.get("bbox")),
                region["region_type"], region.get("section"), region.get("subsection"),
                region["content_text"], region.get("content_markdown"),
                parse_method, region["parse_confidence"],
                region["extraction_status"],
                json.dumps(region.get("quality_flags", [])),
                json.dumps(region.get("cross_refs", [])),
                now_iso()
            ))
        
        # Update document status
        avg_quality = sum(r["parse_confidence"] for r in regions) // max(len(regions), 1)
        conn.execute("""
            UPDATE documents SET ingestion_status = 'complete', parse_method = ?,
                parse_quality_avg = ?, total_regions = ?, created_at = ?
            WHERE doc_id = ?
        """, (parse_method, avg_quality, len(regions), now_iso(), doc_id))
        conn.commit()
        conn.close()
        
        return {
            "doc_id": doc_id,
            "parse_method": parse_method,
            "total_regions": len(regions),
            "avg_quality": from_fixed(avg_quality),
            "regions_by_type": self._count_by_type(regions),
            "sections_found": list(set(r.get("section") for r in regions if r.get("section"))),
        }
    
    def _parse_with_fitz(self, file_path: str, doc_id: str) -> tuple:
        """Parse PDF using PyMuPDF (fitz) with section detection.

        Every text block of at least five characters becomes a region with a
        bbox, region type, quality score, extraction status and
        cross-references. Every image block becomes a placeholder ``figure``
        region. The most recently detected section heading is carried forward
        across blocks and pages.

        Args:
            file_path: Path of the PDF to open.
            doc_id: Not used by this parser; kept so all ``_parse_*`` methods
                share one signature.

        Returns:
            ``(regions, "fitz")`` where ``regions`` is a list of region dicts
            with 1-based page numbers.
        """
        import fitz

        regions = []
        current_section = None

        doc = fitz.open(file_path)
        try:
            for page_number, page in enumerate(doc, start=1):
                for block in page.get_text("dict")["blocks"]:
                    if block["type"] == 0:  # Text block
                        # Spans of a line are concatenated; lines are
                        # separated by newlines.
                        text = "\n".join(
                            "".join(span.get("text", "") for span in line.get("spans", []))
                            for line in block.get("lines", [])
                        ).strip()
                        if len(text) < 5:
                            continue

                        # Detect section from headers
                        detected = detect_section(text)
                        if detected:
                            current_section = detected

                        region_type = classify_region_type(text)
                        quality = score_parse_quality(text, "fitz")
                        cross_refs = extract_cross_references(text)

                        # Extraction status based on quality
                        if quality >= 700:
                            status = "extractable"
                        elif quality >= 400:
                            status = "low_confidence"
                        else:
                            status = "unextractable"

                        regions.append({
                            "region_id": gen_id("REG"),
                            "page": page_number,
                            "bbox": list(block.get("bbox", [0, 0, 0, 0])),
                            "region_type": region_type,
                            "section": current_section,
                            "subsection": None,
                            "content_text": text,
                            "content_markdown": text,
                            "parse_confidence": quality,
                            "extraction_status": status,
                            "quality_flags": [],
                            "cross_refs": cross_refs,
                        })

                    elif block["type"] == 1:  # Image block
                        regions.append({
                            "region_id": gen_id("REG"),
                            "page": page_number,
                            "bbox": list(block.get("bbox", [0, 0, 0, 0])),
                            "region_type": "figure",
                            "section": current_section,
                            "subsection": None,
                            "content_text": "[Image detected — requires VLM processing]",
                            "content_markdown": "![Figure](image)",
                            "parse_confidence": 500,
                            "extraction_status": "low_confidence",
                            "quality_flags": ["image_region_needs_vlm"],
                            "cross_refs": [],
                        })
        finally:
            # Release the document handle even if a page fails to parse.
            doc.close()

        return regions, "fitz"
    
    def _parse_with_pdfplumber(self, file_path: str, doc_id: str) -> tuple:
        """Parse a PDF with pdfplumber; used when PyMuPDF is unavailable.

        Page text is split on blank lines into paragraph regions, then each
        non-empty table on the page is added as a pipe-joined table region.
        Pages with fewer than 10 characters of text are skipped entirely.
        Regions carry no bbox. Returns ``(regions, "pdfplumber")``.
        """
        import pdfplumber

        collected = []
        active_section = None

        with pdfplumber.open(file_path) as pdf:
            for page_index, page in enumerate(pdf.pages):
                page_no = page_index + 1
                raw = page.extract_text()
                if not raw or len(raw.strip()) < 10:
                    continue

                # Paragraph regions: blank-line separated chunks of page text.
                for chunk in raw.split('\n\n'):
                    para = chunk.strip()
                    if not para:
                        continue

                    heading = detect_section(para)
                    if heading:
                        active_section = heading

                    kind = classify_region_type(para)
                    quality = score_parse_quality(para, "pdfplumber")
                    refs = extract_cross_references(para)

                    if quality >= 700:
                        status = "extractable"
                    elif quality >= 400:
                        status = "low_confidence"
                    else:
                        status = "unextractable"

                    collected.append({
                        "region_id": gen_id("REG"),
                        "page": page_no,
                        "bbox": None,
                        "region_type": kind,
                        "section": active_section,
                        "subsection": None,
                        "content_text": para,
                        "content_markdown": para,
                        "parse_confidence": quality,
                        "extraction_status": status,
                        "quality_flags": ["no_bbox_available"],
                        "cross_refs": refs,
                    })

                # Table regions: rows joined by newlines, cells by " | ".
                for table in page.extract_tables():
                    if not table:
                        continue
                    rendered_rows = []
                    for row in table:
                        cells = [str(cell) if cell else "" for cell in row]
                        rendered_rows.append(" | ".join(cells))
                    table_text = "\n".join(rendered_rows)
                    collected.append({
                        "region_id": gen_id("REG"),
                        "page": page_no,
                        "bbox": None,
                        "region_type": "table",
                        "section": active_section,
                        "subsection": None,
                        "content_text": table_text,
                        "content_markdown": table_text,
                        "parse_confidence": 700,
                        "extraction_status": "extractable",
                        "quality_flags": ["table_extracted"],
                        "cross_refs": [],
                    })

        return collected, "pdfplumber"
    
    def _parse_tabular(self, file_path: str, doc_id: str) -> tuple:
        """Parse CSV/Excel files as data regions.

        CSV files (extension matched case-insensitively) are read as UTF-8
        with undecodable bytes replaced, and only the first 10000 characters
        are kept. Any other extension produces a placeholder region, because
        Excel parsing is not implemented here.

        Args:
            file_path: Path of the CSV/Excel file.
            doc_id: Not used by this parser; kept so all ``_parse_*`` methods
                share one signature.

        Returns:
            ``(regions, "tabular")`` where ``regions`` holds exactly one
            region: the table content, or an ``unextractable`` error region
            if the file could not be read.
        """
        max_chars = 10000

        try:
            if file_path.lower().endswith('.csv'):
                # Read only what is kept instead of loading the whole file.
                with open(file_path, encoding='utf-8', errors='replace') as f:
                    text = f.read(max_chars)
            else:
                text = f"[Excel file: {os.path.basename(file_path)} — requires pandas for full parsing]"
        except (OSError, ValueError) as e:
            error_region = {
                "region_id": gen_id("REG"),
                "page": 1, "bbox": None, "region_type": "body_text",
                "section": None, "subsection": None,
                "content_text": f"Error reading file: {e}",
                "content_markdown": "", "parse_confidence": 0,
                "extraction_status": "unextractable",
                "quality_flags": ["parse_error"], "cross_refs": [],
            }
            return [error_region], "tabular"

        text = text[:max_chars]
        region = {
            "region_id": gen_id("REG"),
            "page": 1,
            "bbox": None,
            "region_type": "table",
            "section": "data",
            "subsection": None,
            "content_text": text,
            "content_markdown": text,
            "parse_confidence": 900,
            "extraction_status": "extractable",
            "quality_flags": ["tabular_data"],
            "cross_refs": [],
        }
        return [region], "tabular"
    
    def _parse_text(self, file_path: str, doc_id: str) -> tuple:
        """Split a plain-text or Markdown file into paragraph regions.

        The file is decoded as UTF-8 with undecodable bytes replaced and
        split on blank lines. Each paragraph becomes an ``extractable``
        region on page 1 with a fixed confidence of 900. Any exception is
        reported as an extra ``unextractable`` error region instead of being
        raised. Returns ``(regions, "text")``.
        """
        collected = []
        try:
            with open(file_path, encoding='utf-8', errors='replace') as handle:
                raw = handle.read()

            active_section = None
            for piece in raw.split('\n\n'):
                para = piece.strip()
                if not para:
                    continue

                # A paragraph that opens with a section heading switches the
                # section for itself and every paragraph after it.
                heading = detect_section(para)
                if heading:
                    active_section = heading

                entry = {
                    "region_id": gen_id("REG"),
                    "page": 1,
                    "bbox": None,
                    "region_type": classify_region_type(para),
                    "section": active_section,
                    "subsection": None,
                    "content_text": para,
                    "content_markdown": para,
                    "parse_confidence": 900,
                    "extraction_status": "extractable",
                    "quality_flags": [],
                    "cross_refs": extract_cross_references(para),
                }
                collected.append(entry)
        except Exception as e:
            failure = {
                "region_id": gen_id("REG"),
                "page": 1,
                "bbox": None,
                "region_type": "body_text",
                "section": None,
                "subsection": None,
                "content_text": f"Error: {e}",
                "content_markdown": "",
                "parse_confidence": 0,
                "extraction_status": "unextractable",
                "quality_flags": ["parse_error"],
                "cross_refs": [],
            }
            collected.append(failure)
        return collected, "text"
    
    def _count_by_type(self, regions: list) -> dict:
        counts = {}
        for r in regions:
            t = r["region_type"]
            counts[t] = counts.get(t, 0) + 1
        return counts
    
    def get_extractable_regions(self, doc_id: str) -> list:
        """Get all extractable regions for a document.

        Only regions with ``extraction_status = 'extractable'`` and a type of
        ``body_text``, ``table`` or ``caption`` are returned, ordered by page
        and then by ``region_id``.

        Args:
            doc_id: Identifier of the document whose regions are wanted.

        Returns:
            A list of row dicts in which ``cross_refs`` and ``quality_flags``
            are decoded from JSON (an empty list when the column is empty or
            NULL) and ``bbox`` is decoded from JSON or ``None``.
        """
        conn = get_db(self.db_path)
        try:
            rows = conn.execute("""
                SELECT * FROM regions
                WHERE doc_id = ? AND extraction_status = 'extractable'
                AND region_type IN ('body_text', 'table', 'caption')
                ORDER BY page, region_id
            """, (doc_id,)).fetchall()
        finally:
            # Close the connection even when the query itself fails.
            conn.close()

        results = []
        for row in rows:
            record = dict(row)
            # `or "[]"` covers NULL columns: the key exists with value None,
            # so a .get() default would never apply and json.loads(None)
            # would raise.
            record["cross_refs"] = json.loads(record.get("cross_refs") or "[]")
            record["quality_flags"] = json.loads(record.get("quality_flags") or "[]")
            record["bbox"] = json.loads(record["bbox"]) if record.get("bbox") else None
            results.append(record)
        return results
    
    def get_section_chunks(self, doc_id: str, max_chars: int = 3000) -> list:
        """
        Get section-aware chunks for extraction.

        Merges consecutive body_text regions in the same section. Tables and
        captions always form their own chunk, and body text that follows one
        starts a new chunk rather than being appended to it.

        Args:
            doc_id: Identifier of the document to chunk.
            max_chars: A body-text chunk stops accepting further regions once
                its text has reached this many characters. The check runs
                before each merge, so a chunk may end up longer than this.

        Returns:
            A list of chunk dicts with ``chunk_id``, ``doc_id``, ``section``
            (``"unknown"`` when the region has none), ``text``,
            ``region_ids``, ``page`` (page of the first region),
            ``min_confidence`` and ``region_type``.
        """
        chunks = []
        current_chunk = None

        for region in self.get_extractable_regions(doc_id):
            section = region.get("section") or "unknown"

            can_merge = (
                current_chunk is not None
                # Both sides must be body text; otherwise prose would be
                # folded into a chunk still labelled 'table' or 'caption'.
                and current_chunk["region_type"] == "body_text"
                and region["region_type"] == "body_text"
                and current_chunk["section"] == section
                and len(current_chunk["text"]) < max_chars
            )

            if can_merge:
                # Merge into current chunk
                current_chunk["text"] += "\n\n" + region["content_text"]
                current_chunk["region_ids"].append(region["region_id"])
                current_chunk["min_confidence"] = min(
                    current_chunk["min_confidence"], region["parse_confidence"]
                )
                continue

            # Start new chunk
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = {
                "chunk_id": gen_id("CHK"),
                "doc_id": doc_id,
                "section": section,
                "text": region["content_text"],
                "region_ids": [region["region_id"]],
                "page": region["page"],
                "min_confidence": region["parse_confidence"],
                "region_type": region["region_type"],
            }

        if current_chunk:
            chunks.append(current_chunk)

        return chunks