v2.0: phd_research_os_v2/layer4/graph.py
Browse files
phd_research_os_v2/layer4/graph.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Layer 4: SQLite-Backed Knowledge Graph
|
| 3 |
+
=========================================
|
| 4 |
+
Typed epistemic edges, conflict detection, gap analysis.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
from typing import Optional
|
| 9 |
+
from ..core.database import get_db, gen_id, now_iso, to_fixed, from_fixed
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class KnowledgeGraph:
    """SQLite-backed knowledge graph with typed epistemic edges.

    Nodes live in the ``graph_nodes`` table and edges in ``graph_edges``;
    conflict detection additionally reads the ``claims`` table. Every method
    opens its own connection through ``get_db(self.db_path)`` and closes it
    before returning, including when a statement raises.
    """

    # Function words dropped before measuring keyword overlap between claims.
    _stopwords = {'the', 'a', 'an', 'is', 'was', 'were', 'are', 'been',
                  'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
                  'would', 'could', 'should', 'may', 'might', 'shall',
                  'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
                  'from', 'and', 'or', 'but', 'not', 'no', 'this', 'that',
                  'these', 'those', 'it', 'its', 'we', 'our', 'they'}

    def __init__(self, db_path: Optional[str] = None):
        """Remember the database path that is handed to ``get_db`` on each call."""
        self.db_path = db_path

    def add_claim_node(self, claim_id: str, text: str,
                       properties: Optional[dict] = None) -> None:
        """Add a claim as a graph node.

        The node label is the first 200 characters of ``text`` and
        ``properties`` is stored as JSON (``{}`` when omitted). The statement
        is ``INSERT OR IGNORE``, so a conflicting existing row is left as is.
        """
        conn = get_db(self.db_path)
        try:
            conn.execute("""
                INSERT OR IGNORE INTO graph_nodes (node_id, node_type, label, properties, created_at)
                VALUES (?, 'claim', ?, ?, ?)
            """, (claim_id, text[:200], json.dumps(properties or {}), now_iso()))
            conn.commit()
        finally:
            conn.close()

    def add_edge(self, source: str, target: str, edge_type: str,
                 confidence: float, evidence_sources: Optional[list] = None,
                 is_inferred: bool = False,
                 method_compatible: Optional[bool] = None) -> str:
        """Add a typed edge between two nodes and return the new edge id.

        ``confidence`` is stored through ``to_fixed``, ``evidence_sources`` as
        a JSON list (``[]`` when omitted), ``is_inferred`` as 0/1 and
        ``method_compatible`` as 0/1 or NULL when it is ``None``.
        """
        edge_id = gen_id("EDGE")
        # One timestamp so created_at and updated_at agree on a fresh row.
        stamp = now_iso()
        compatible = None if method_compatible is None else int(method_compatible)

        conn = get_db(self.db_path)
        try:
            conn.execute("""
                INSERT INTO graph_edges (edge_id, source_node, target_node, edge_type,
                    confidence, evidence_sources, is_inferred, method_compatible,
                    created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (edge_id, source, target, edge_type,
                  to_fixed(confidence), json.dumps(evidence_sources or []),
                  int(is_inferred), compatible,
                  stamp, stamp))
            conn.commit()
        finally:
            conn.close()
        return edge_id

    def get_neighbors(self, node_id: str, edge_type: str = None,
                      include_inferred: bool = False) -> list:
        """Get all edges touching ``node_id`` together with the far-end node.

        Edges are matched in either direction. ``edge_type`` optionally
        restricts the edge type, and inferred edges are excluded unless
        ``include_inferred`` is true. Each result is the edge row as a dict
        plus ``neighbor_label`` / ``neighbor_type`` (``None`` when the other
        endpoint has no ``graph_nodes`` row), ordered by stored confidence,
        highest first. ``confidence`` is converted with ``from_fixed`` (0 when
        NULL or zero) and ``evidence_sources`` is decoded from JSON.
        """
        conditions = ["(source_node = ? OR target_node = ?)"]
        params = [node_id, node_id]

        if edge_type:
            conditions.append("edge_type = ?")
            params.append(edge_type)
        if not include_inferred:
            conditions.append("is_inferred = 0")

        # Only the fixed fragments above are interpolated; values stay bound.
        where = " AND ".join(conditions)

        conn = get_db(self.db_path)
        try:
            rows = conn.execute(f"""
                SELECT e.*, n.label as neighbor_label, n.node_type as neighbor_type
                FROM graph_edges e
                LEFT JOIN graph_nodes n ON (
                    CASE WHEN e.source_node = ? THEN e.target_node ELSE e.source_node END = n.node_id
                )
                WHERE {where}
                ORDER BY e.confidence DESC
            """, [node_id] + params).fetchall()
        finally:
            conn.close()

        results = []
        for row in rows:
            d = dict(row)
            d["confidence"] = from_fixed(d["confidence"]) if d.get("confidence") else 0
            # A NULL column yields None (the key exists), so fall back explicitly.
            d["evidence_sources"] = json.loads(d.get("evidence_sources") or "[]")
            results.append(d)
        return results

    def find_conflicts(self, min_similarity: float = 0.3, limit: int = 50) -> list:
        """Find potential conflicts between claims (keyword overlap heuristic).

        Considers up to 500 claims whose status is not ``'Unextractable'``,
        highest ``composite_confidence`` first. Two claims are paired when
        their ``source_doi`` values differ and the share of common
        non-stopword tokens, relative to the smaller token set, is at least
        ``min_similarity``. Claims with equal ``source_doi`` (including two
        missing ones) are never paired.

        Scanning stops as soon as ``limit`` pairs are collected, so the result
        is the first ``limit`` matches in scan order, sorted by overlap
        descending -- not necessarily the globally highest overlaps. A
        ``limit`` of zero or less returns an empty list.
        """
        conn = get_db(self.db_path)
        try:
            rows = conn.execute("""
                SELECT claim_id, text, epistemic_tag, composite_confidence, source_doi
                FROM claims WHERE status != 'Unextractable'
                ORDER BY composite_confidence DESC LIMIT 500
            """).fetchall()
        finally:
            conn.close()

        claims_list = [dict(c) for c in rows]
        # Tokenize each claim once instead of once per compared pair; a NULL
        # text is treated as empty and therefore never matches anything.
        tokens = [
            set((c.get("text") or "").lower().split()) - self._stopwords
            for c in claims_list
        ]

        pairs = []
        total = len(claims_list)
        for i, a in enumerate(claims_list):
            if len(pairs) >= limit:
                break
            words_a = tokens[i]
            if not words_a:
                continue

            for j in range(i + 1, total):
                b = claims_list[j]
                if a.get("source_doi") == b.get("source_doi"):
                    continue

                words_b = tokens[j]
                if not words_b:
                    continue

                overlap = len(words_a & words_b) / min(len(words_a), len(words_b))
                if overlap >= min_similarity:
                    pairs.append({
                        "claim_a": a,
                        "claim_b": b,
                        "overlap": overlap,
                    })
                    if len(pairs) >= limit:
                        break

        pairs.sort(key=lambda x: -x["overlap"])
        return pairs

    def find_gaps(self, min_degree: int = 3, limit: int = 20) -> list:
        """
        Gap Analysis: Find well-connected entities with no direct edge between them.
        These are high-value research opportunities.

        Candidate nodes are those touched by at least ``min_degree`` edges.
        Every pair of candidates with the same ``node_type`` and no edge in
        either direction is reported with
        ``information_gain = (degree_a + degree_b) / max candidate degree``
        rounded to three decimals. The ``limit`` highest-scoring gaps are
        returned (default 20).
        """
        conn = get_db(self.db_path)
        try:
            # The degree filter is applied in an outer WHERE: SQLite rejects
            # HAVING on a query that is neither grouped nor aggregate.
            nodes = conn.execute("""
                SELECT node_id, label, node_type, degree FROM (
                    SELECT n.node_id, n.label, n.node_type,
                        (SELECT COUNT(*) FROM graph_edges
                          WHERE source_node = n.node_id OR target_node = n.node_id) as degree
                    FROM graph_nodes n
                )
                WHERE degree >= ?
                ORDER BY degree DESC
            """, (min_degree,)).fetchall()
            # Load edge endpoints once rather than querying per candidate pair.
            edge_rows = conn.execute(
                "SELECT source_node, target_node FROM graph_edges"
            ).fetchall() if nodes else []
        finally:
            conn.close()

        node_list = [dict(n) for n in nodes]
        if not node_list:
            return []

        # Unordered endpoint pairs, so one lookup covers both directions.
        connected = {frozenset((row[0], row[1])) for row in edge_rows}
        max_degree = max(max(n["degree"] for n in node_list), 1)

        gaps = []
        for i, a in enumerate(node_list):
            for b in node_list[i + 1:]:
                if a["node_type"] != b["node_type"]:
                    continue
                if frozenset((a["node_id"], b["node_id"])) in connected:
                    continue

                info_gain = (a["degree"] + b["degree"]) / max_degree
                gaps.append({
                    "entity_a": a["label"],
                    "entity_b": b["label"],
                    "a_degree": a["degree"],
                    "b_degree": b["degree"],
                    "information_gain": round(info_gain, 3),
                })

        gaps.sort(key=lambda g: -g["information_gain"])
        return gaps[:limit]

    def get_stats(self) -> dict:
        """Return node/edge totals, observed vs. inferred edge counts, and a
        mapping of edge type to edge count under ``"edge_types"``."""
        conn = get_db(self.db_path)
        try:
            stats = {
                "total_nodes": conn.execute("SELECT COUNT(*) FROM graph_nodes").fetchone()[0],
                "total_edges": conn.execute("SELECT COUNT(*) FROM graph_edges").fetchone()[0],
                "observed_edges": conn.execute(
                    "SELECT COUNT(*) FROM graph_edges WHERE is_inferred = 0").fetchone()[0],
                "inferred_edges": conn.execute(
                    "SELECT COUNT(*) FROM graph_edges WHERE is_inferred = 1").fetchone()[0],
            }

            # Edge type distribution
            types = conn.execute(
                "SELECT edge_type, COUNT(*) FROM graph_edges GROUP BY edge_type"
            ).fetchall()
        finally:
            conn.close()

        # Positional access: column 0 is the type, column 1 its count.
        stats["edge_types"] = {row[0]: row[1] for row in types}
        return stats
|