| """ |
| Layer 4: SQLite-Backed Knowledge Graph |
| ========================================= |
| Typed epistemic edges, conflict detection, gap analysis. |
| """ |
|
|
| import json |
| from typing import Optional |
| from ..core.database import get_db, gen_id, now_iso, to_fixed, from_fixed |
|
|
|
|
class KnowledgeGraph:
    """SQLite-backed knowledge graph with typed epistemic edges.

    Nodes live in the ``graph_nodes`` table, edges in ``graph_edges``.
    Confidence values are converted through ``to_fixed``/``from_fixed``
    at the DB boundary and exposed to callers as plain numbers.
    """

    # Common English function words excluded from the keyword-overlap
    # heuristic used by find_conflicts().
    _stopwords = {'the', 'a', 'an', 'is', 'was', 'were', 'are', 'been',
                  'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
                  'would', 'could', 'should', 'may', 'might', 'shall',
                  'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
                  'from', 'and', 'or', 'but', 'not', 'no', 'this', 'that',
                  'these', 'those', 'it', 'its', 'we', 'our', 'they'}

    def __init__(self, db_path: Optional[str] = None):
        # Forwarded to get_db() on every operation; None lets get_db()
        # use its default database location.
        self.db_path = db_path

    def add_claim_node(self, claim_id: str, text: str,
                       properties: Optional[dict] = None) -> None:
        """Add a claim as a graph node (no-op if the node already exists).

        The node label is the claim text truncated to 200 characters;
        *properties* are serialized to a JSON blob.
        """
        conn = get_db(self.db_path)
        try:
            conn.execute("""
                INSERT OR IGNORE INTO graph_nodes (node_id, node_type, label, properties, created_at)
                VALUES (?, 'claim', ?, ?, ?)
            """, (claim_id, text[:200], json.dumps(properties or {}), now_iso()))
            conn.commit()
        finally:
            # Close even if execute/commit raises (original leaked here).
            conn.close()

    def add_edge(self, source: str, target: str, edge_type: str,
                 confidence: float, evidence_sources: Optional[list] = None,
                 is_inferred: bool = False,
                 method_compatible: Optional[bool] = None) -> str:
        """Add a typed edge between two nodes and return its generated id.

        Confidence is stored via to_fixed(); booleans are stored as 0/1,
        with method_compatible left NULL when not provided.
        """
        edge_id = gen_id("EDGE")
        conn = get_db(self.db_path)
        try:
            conn.execute("""
                INSERT INTO graph_edges (edge_id, source_node, target_node, edge_type,
                    confidence, evidence_sources, is_inferred, method_compatible,
                    created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (edge_id, source, target, edge_type,
                  to_fixed(confidence), json.dumps(evidence_sources or []),
                  int(is_inferred),
                  int(method_compatible) if method_compatible is not None else None,
                  now_iso(), now_iso()))
            conn.commit()
        finally:
            conn.close()
        return edge_id

    def get_neighbors(self, node_id: str, edge_type: Optional[str] = None,
                      include_inferred: bool = False) -> list:
        """Get all edges touching *node_id*, joined with the neighbor node.

        Results are dicts (edge columns plus neighbor_label/neighbor_type),
        ordered by descending confidence. Inferred edges are excluded
        unless include_inferred is True.
        """
        conn = get_db(self.db_path)
        conditions = ["(source_node = ? OR target_node = ?)"]
        params = [node_id, node_id]

        if edge_type:
            conditions.append("edge_type = ?")
            params.append(edge_type)
        if not include_inferred:
            conditions.append("is_inferred = 0")

        where = " AND ".join(conditions)
        # The CASE picks whichever end of the edge is NOT node_id, so the
        # join always attaches the *other* node's label/type.
        rows = conn.execute(f"""
            SELECT e.*, n.label as neighbor_label, n.node_type as neighbor_type
            FROM graph_edges e
            LEFT JOIN graph_nodes n ON (
                CASE WHEN e.source_node = ? THEN e.target_node ELSE e.source_node END = n.node_id
            )
            WHERE {where}
            ORDER BY e.confidence DESC
        """, [node_id] + params).fetchall()
        conn.close()

        results = []
        for r in rows:
            d = dict(r)
            d["confidence"] = from_fixed(d["confidence"]) if d.get("confidence") else 0
            # BUG FIX: a NULL evidence_sources column yields None here and
            # json.loads(None) raises TypeError; fall back to "[]".
            d["evidence_sources"] = json.loads(d.get("evidence_sources") or "[]")
            results.append(d)
        return results

    def find_conflicts(self, min_similarity: float = 0.3, limit: int = 50) -> list:
        """Find potential conflicts between claims (keyword overlap heuristic).

        Compares the top 500 claims pairwise; pairs sharing a source_doi
        are skipped. Overlap = |A & B| / min(|A|, |B|) over stopword-
        filtered word sets. Returns at most *limit* pairs, sorted by
        overlap descending.
        """
        conn = get_db(self.db_path)
        claims = conn.execute("""
            SELECT claim_id, text, epistemic_tag, composite_confidence, source_doi
            FROM claims WHERE status != 'Unextractable'
            ORDER BY composite_confidence DESC LIMIT 500
        """).fetchall()
        conn.close()

        pairs = []
        claims_list = [dict(c) for c in claims]

        for i, a in enumerate(claims_list):
            # Hoisted out of the inner loop: a's word set is invariant
            # across all partners b (original rebuilt it per pair).
            words_a = set(a["text"].lower().split()) - self._stopwords
            for b in claims_list[i + 1:]:
                # Two claims from the same paper are not a "conflict".
                if a.get("source_doi") == b.get("source_doi"):
                    continue

                words_b = set(b["text"].lower().split()) - self._stopwords
                if not words_a or not words_b:
                    continue

                overlap = len(words_a & words_b) / min(len(words_a), len(words_b))
                if overlap >= min_similarity:
                    pairs.append({
                        "claim_a": a,
                        "claim_b": b,
                        "overlap": overlap,
                    })

                if len(pairs) >= limit:
                    break
            if len(pairs) >= limit:
                break

        pairs.sort(key=lambda x: -x["overlap"])
        return pairs

    def find_gaps(self, min_degree: int = 3) -> list:
        """
        Gap Analysis: find well-connected same-type entities with no direct
        edge between them. These are high-value research opportunities.

        Returns up to 20 gaps sorted by "information_gain" — the pair's
        combined degree normalized by the maximum degree seen.
        """
        conn = get_db(self.db_path)
        try:
            # BUG FIX: the original used HAVING without GROUP BY, which
            # SQLite rejects ("a GROUP BY clause is required before
            # HAVING"). Filter the computed degree via a subquery instead.
            nodes = conn.execute("""
                SELECT node_id, label, node_type, degree FROM (
                    SELECT n.node_id, n.label, n.node_type,
                           (SELECT COUNT(*) FROM graph_edges
                            WHERE source_node = n.node_id OR target_node = n.node_id) as degree
                    FROM graph_nodes n
                )
                WHERE degree >= ?
                ORDER BY degree DESC
            """, (min_degree,)).fetchall()

            gaps = []
            node_list = [dict(n) for n in nodes]
            # Hoisted: the original recomputed this max for every pair.
            # default=0 covers the empty node list (loops below won't run).
            max_degree = max((n["degree"] for n in node_list), default=0)

            for i, a in enumerate(node_list):
                for b in node_list[i + 1:]:
                    # Only compare entities of the same kind.
                    if a["node_type"] != b["node_type"]:
                        continue

                    edge = conn.execute("""
                        SELECT 1 FROM graph_edges
                        WHERE (source_node = ? AND target_node = ?)
                           OR (source_node = ? AND target_node = ?)
                    """, (a["node_id"], b["node_id"], b["node_id"], a["node_id"])).fetchone()

                    if not edge:
                        info_gain = (a["degree"] + b["degree"]) / max(max_degree, 1)
                        gaps.append({
                            "entity_a": a["label"],
                            "entity_b": b["label"],
                            "a_degree": a["degree"],
                            "b_degree": b["degree"],
                            "information_gain": round(info_gain, 3),
                        })
        finally:
            conn.close()

        gaps.sort(key=lambda g: -g["information_gain"])
        return gaps[:20]

    def get_stats(self) -> dict:
        """Return node/edge counts plus a per-edge-type count breakdown."""
        conn = get_db(self.db_path)
        stats = {
            "total_nodes": conn.execute("SELECT COUNT(*) FROM graph_nodes").fetchone()[0],
            "total_edges": conn.execute("SELECT COUNT(*) FROM graph_edges").fetchone()[0],
            "observed_edges": conn.execute("SELECT COUNT(*) FROM graph_edges WHERE is_inferred = 0").fetchone()[0],
            "inferred_edges": conn.execute("SELECT COUNT(*) FROM graph_edges WHERE is_inferred = 1").fetchone()[0],
        }

        types = conn.execute(
            "SELECT edge_type, COUNT(*) FROM graph_edges GROUP BY edge_type"
        ).fetchall()
        # Each row is (edge_type, count); index the row directly instead of
        # round-tripping through dict() twice as the original did.
        stats["edge_types"] = {row["edge_type"]: row[1] for row in types}

        conn.close()
        return stats
|
|