File size: 5,297 Bytes
e1624f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import json
import re
import pymupdf4llm
import networkx as nx
import logging
from typing import List, Dict, Optional

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class AdvancedOncoIngestor:
    """
    Advanced Ingestor for SOTA RAG.
    - Uses pymupdf4llm for Markdown table preservation.
    - Builds a basic Knowledge Graph (GraphRAG) using NetworkX.

    Typical use: call ``extract_and_graph`` once per PDF, then ``save_graph``.
    """

    def __init__(self, output_dir: str = "data/processed/sota_chunks", graph_path: str = "data/processed/knowledge_graph.gml"):
        """
        Prepares the output directory, an empty graph and the entity vocabularies.

        Args:
            output_dir: Directory receiving one JSON chunk file per PDF (created if missing).
            graph_path: File that ``save_graph`` writes the GML export to.
        """
        self.output_dir = output_dir
        self.graph_path = graph_path
        self.graph = nx.Graph()

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(output_dir, exist_ok=True)

        # Keywords for entity extraction (Basic regex-based GraphRAG).
        # All terms are lowercase because they are matched against lowercased text.
        self.drugs = ["pembrolizumab", "nivolumab", "erlotinib", "afatinib", "osimertinib", "gefitinib", "alectinib", "brigatinib", "lorlatinib", "trastuzumab", "pertuzumab", "lapatinib", "neratinib", "t-dm1", "paclitaxel", "docetaxel", "carboplatin", "cisplatin", "gemcitabine", "pemetrexed", "bevacizumab", "ramucirumab", "atezolizumab", "durvalumab"]
        self.mutations = ["egfr", "alk", "ros1", "braf", "kras", "nras", "her2", "pd-l1", "msi-h", "dmmr", "pik3ca", "esr1", "brca1", "brca2", "ret", "met", "ntrk"]
        self.conditions = ["nsclc", "sclc", "breast cancer", "colon cancer", "rectal cancer", "melanoma", "adenocarcinoma", "squamous cell carcinoma"]

    @staticmethod
    def _split_markdown(md_text: str, source: str) -> List[Dict[str, str]]:
        """
        Splits Markdown into one chunk per header-delimited section.

        Args:
            md_text: Markdown text to split.
            source: Value stored under each chunk's "source" key.

        Returns:
            Chunks in document order, each with "header", "content" and "source".
            Text before the first header is filed under the header "Intro".
            Sections whose body is empty or whitespace-only are omitted.
        """
        # Each entry is (header text, body lines); header lines never enter a body.
        sections = [("Intro", [])]
        for line in md_text.split("\n"):
            if line.startswith("#"):
                sections.append((line.strip("# ").strip(), []))
            else:
                sections[-1][1].append(line)

        chunks = []
        for header, body_lines in sections:
            content = "\n".join(body_lines)
            # A header followed only by blank lines would otherwise yield an empty chunk.
            if not content.strip():
                continue
            chunks.append({
                "header": header,
                "content": content,
                "source": source
            })
        return chunks

    @staticmethod
    def _match_terms(text_lower: str, terms: List[str]) -> List[str]:
        """
        Returns the terms that occur in the text as standalone tokens.

        A plain substring test is too loose for short gene symbols: "met" would
        hit "metastatic", "alk" would hit "alkaline" and "sclc" would hit "nsclc".
        A match must therefore not be preceded by a letter or digit and must not
        be followed by a letter. A trailing digit is tolerated so numbered family
        members (e.g. "ntrk1/2/3") still count for the base symbol. Plural forms
        such as "melanomas" are intentionally not matched.

        Args:
            text_lower: Text to search; must already be lowercased.
            terms: Lowercase vocabulary to look for.

        Returns:
            The matching terms, in the order they appear in ``terms``.
        """
        found = []
        for term in terms:
            # Let any whitespace run (including a line wrap) separate the words of a term.
            body = r"\s+".join(re.escape(part) for part in term.split())
            if re.search(rf"(?<![a-z0-9]){body}(?![a-z])", text_lower):
                found.append(term)
        return found

    def extract_and_graph(self, pdf_path: str):
        """
        Converts PDF to Markdown and updates the Knowledge Graph.

        The chunks are written to ``<output_dir>/<pdf name without .pdf>.json``
        and every chunk's text is fed to the entity graph.

        Args:
            pdf_path: Path of the PDF to ingest.
        """
        filename = os.path.basename(pdf_path)
        logger.info(f"⏳ Processing {filename} with SOTA Markdown extraction...")

        # 1. Convert PDF to Markdown (preserves tables!)
        md_text = pymupdf4llm.to_markdown(pdf_path)

        # 2. Simple Semantic Chunking (split by headers)
        chunks = self._split_markdown(md_text, filename)
        for chunk in chunks:
            self._update_graph(chunk["content"], filename)

        # 3. Save chunks
        # Remove only a trailing ".pdf" (any case); str.replace would also eat
        # ".pdf" occurring in the middle of the name and miss ".PDF".
        stem, ext = os.path.splitext(filename)
        if ext.lower() != ".pdf":
            stem = filename
        output_path = os.path.join(self.output_dir, f"{stem}.json")
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(chunks, f, ensure_ascii=False, indent=4)

        logger.info(f"✅ Saved {len(chunks)} Markdown chunks for {filename}")

    def _update_graph(self, text: str, source: str):
        """
        Updates the NetworkX graph by extracting clinical entities.

        Entities found in the same text are linked pairwise: drug-mutation
        ("targets"), drug-condition ("treats") and mutation-condition
        ("associated_with"). These are co-occurrence links only.

        Args:
            text: Chunk text to scan.
            source: Name of the originating document, stored on every edge.
        """
        text_lower = text.lower()

        found_drugs = self._match_terms(text_lower, self.drugs)
        found_mutations = self._match_terms(text_lower, self.mutations)
        found_conditions = self._match_terms(text_lower, self.conditions)

        # "source" is kept for in-memory consumers. "document" repeats it because
        # NetworkX's GML writer uses the edge keys source/target for endpoint ids
        # and does not emit same-named attributes, so "source" is lost on export.
        for d in found_drugs:
            self.graph.add_node(d, type="drug")
            for m in found_mutations:
                self.graph.add_node(m, type="mutation")
                self.graph.add_edge(d, m, relation="targets", source=source, document=source)
            for c in found_conditions:
                self.graph.add_node(c, type="condition")
                self.graph.add_edge(d, c, relation="treats", source=source, document=source)

        for m in found_mutations:
            for c in found_conditions:
                # Type the endpoints here too: without a drug in the text the loop
                # above never runs and add_edge alone would create untyped nodes.
                self.graph.add_node(m, type="mutation")
                self.graph.add_node(c, type="condition")
                self.graph.add_edge(m, c, relation="associated_with", source=source, document=source)

    def save_graph(self):
        """
        Saves the graph to disk as GML at ``self.graph_path``.

        The parent directory is created when it does not exist yet.
        """
        parent_dir = os.path.dirname(self.graph_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Save as GML for better compatibility with graph tools, or JSON for simplicity
        nx.write_gml(self.graph, self.graph_path)
        logger.info(f"🕸️ Knowledge Graph saved to {self.graph_path} ({len(self.graph.nodes)} nodes, {len(self.graph.edges)} edges)")

if __name__ == "__main__":
    # Script entry point: ingest the selected clinical guideline PDFs found
    # anywhere under the guides directory, then persist the knowledge graph.
    guides_dir = "data/clinical_guides"
    target_files = ["nscl.pdf", "breast.pdf", "colon.pdf", "hcc.pdf"]

    ingestor = AdvancedOncoIngestor()

    if not os.path.exists(guides_dir):
        logger.warning(f"Directory {guides_dir} not found.")
    else:
        for folder, _subdirs, names in os.walk(guides_dir):
            for name in names:
                # Skip anything that is not a targeted guide or that is a patient edition.
                if "patient" in name.lower() or name not in target_files:
                    continue
                ingestor.extract_and_graph(os.path.join(folder, name))

        # The graph is written once, after all guides have been processed.
        ingestor.save_graph()