nkshirsa commited on
Commit
14f0da5
·
verified ·
1 Parent(s): 2ba027a

v2.0: phd_research_os_v2/layer4/graph.py

Browse files
Files changed (1) hide show
  1. phd_research_os_v2/layer4/graph.py +191 -0
phd_research_os_v2/layer4/graph.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Layer 4: SQLite-Backed Knowledge Graph
3
+ =========================================
4
+ Typed epistemic edges, conflict detection, gap analysis.
5
+ """
6
+
7
+ import json
8
+ from typing import Optional
9
+ from ..core.database import get_db, gen_id, now_iso, to_fixed, from_fixed
10
+
11
+
12
class KnowledgeGraph:
    """SQLite-backed knowledge graph with typed epistemic edges.

    Provides node/edge insertion, neighbor queries, conflict detection
    between claims (keyword-overlap heuristic), and gap analysis
    (well-connected same-type node pairs with no direct edge).

    All methods open a fresh connection via ``get_db(self.db_path)`` and
    close it before returning.
    """

    # Common English function words excluded from the keyword-overlap
    # similarity used by find_conflicts(). Class-level so it is built once.
    _stopwords = {'the', 'a', 'an', 'is', 'was', 'were', 'are', 'been',
                  'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
                  'would', 'could', 'should', 'may', 'might', 'shall',
                  'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
                  'from', 'and', 'or', 'but', 'not', 'no', 'this', 'that',
                  'these', 'those', 'it', 'its', 'we', 'our', 'they'}

    def __init__(self, db_path: Optional[str] = None):
        # Forwarded verbatim to get_db(); None lets get_db choose its default.
        self.db_path = db_path

    def add_claim_node(self, claim_id: str, text: str, properties: Optional[dict] = None):
        """Add a claim as a graph node.

        No-op if a node with ``claim_id`` already exists (INSERT OR IGNORE).
        The label is truncated to 200 characters; ``properties`` is stored
        as JSON (empty object when None).
        """
        conn = get_db(self.db_path)
        try:
            conn.execute("""
                INSERT OR IGNORE INTO graph_nodes (node_id, node_type, label, properties, created_at)
                VALUES (?, 'claim', ?, ?, ?)
            """, (claim_id, text[:200], json.dumps(properties or {}), now_iso()))
            conn.commit()
        finally:
            # Close even if execute/commit raises — no connection leak.
            conn.close()

    def add_edge(self, source: str, target: str, edge_type: str,
                 confidence: float, evidence_sources: Optional[list] = None,
                 is_inferred: bool = False, method_compatible: Optional[bool] = None) -> str:
        """Add a typed edge between two nodes and return its generated id.

        ``confidence`` is stored fixed-point via to_fixed();
        ``evidence_sources`` is stored as a JSON list (empty when None);
        ``method_compatible`` is stored as 0/1, or NULL when None.
        """
        edge_id = gen_id("EDGE")
        conn = get_db(self.db_path)
        try:
            conn.execute("""
                INSERT INTO graph_edges (edge_id, source_node, target_node, edge_type,
                    confidence, evidence_sources, is_inferred, method_compatible,
                    created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (edge_id, source, target, edge_type,
                  to_fixed(confidence), json.dumps(evidence_sources or []),
                  int(is_inferred),
                  int(method_compatible) if method_compatible is not None else None,
                  now_iso(), now_iso()))
            conn.commit()
        finally:
            conn.close()
        return edge_id

    def get_neighbors(self, node_id: str, edge_type: Optional[str] = None,
                      include_inferred: bool = False) -> list:
        """Get all edges touching ``node_id``, with the neighbor's label/type joined in.

        Optional filters: ``edge_type`` restricts to one edge type;
        inferred edges are excluded unless ``include_inferred`` is True.
        Results are dicts sorted by stored confidence, descending;
        ``confidence`` is decoded via from_fixed() and ``evidence_sources``
        is decoded from JSON.
        """
        conn = get_db(self.db_path)
        try:
            conditions = ["(source_node = ? OR target_node = ?)"]
            params = [node_id, node_id]

            if edge_type:
                conditions.append("edge_type = ?")
                params.append(edge_type)
            if not include_inferred:
                conditions.append("is_inferred = 0")

            where = " AND ".join(conditions)
            # The CASE picks whichever endpoint is NOT node_id, i.e. the neighbor.
            rows = conn.execute(f"""
                SELECT e.*, n.label as neighbor_label, n.node_type as neighbor_type
                FROM graph_edges e
                LEFT JOIN graph_nodes n ON (
                    CASE WHEN e.source_node = ? THEN e.target_node ELSE e.source_node END = n.node_id
                )
                WHERE {where}
                ORDER BY e.confidence DESC
            """, [node_id] + params).fetchall()
        finally:
            conn.close()

        results = []
        for r in rows:
            d = dict(r)
            d["confidence"] = from_fixed(d["confidence"]) if d.get("confidence") else 0
            # `or "[]"` also covers a present-but-NULL column, which
            # dict.get's default would not (the key exists with value None).
            d["evidence_sources"] = json.loads(d.get("evidence_sources") or "[]")
            results.append(d)
        return results

    def find_conflicts(self, min_similarity: float = 0.3, limit: int = 50) -> list:
        """Find potential conflicts between claims (keyword-overlap heuristic).

        Compares the top 500 claims (by composite confidence) pairwise,
        skipping pairs from the same DOI. Similarity is stopword-filtered
        word overlap divided by the smaller word set. Returns up to
        ``limit`` pair dicts sorted by overlap, descending.
        """
        conn = get_db(self.db_path)
        try:
            claims = conn.execute("""
                SELECT claim_id, text, epistemic_tag, composite_confidence, source_doi
                FROM claims WHERE status != 'Unextractable'
                ORDER BY composite_confidence DESC LIMIT 500
            """).fetchall()
        finally:
            conn.close()

        pairs = []
        claims_list = [dict(c) for c in claims]

        for i, a in enumerate(claims_list):
            for b in claims_list[i + 1:]:
                # Claims from the same paper cannot conflict with themselves.
                if a.get("source_doi") == b.get("source_doi"):
                    continue

                words_a = set(a["text"].lower().split()) - self._stopwords
                words_b = set(b["text"].lower().split()) - self._stopwords

                if not words_a or not words_b:
                    continue

                overlap = len(words_a & words_b) / min(len(words_a), len(words_b))
                if overlap >= min_similarity:
                    pairs.append({
                        "claim_a": a,
                        "claim_b": b,
                        "overlap": overlap,
                    })

                if len(pairs) >= limit:
                    break
            if len(pairs) >= limit:
                break

        pairs.sort(key=lambda x: -x["overlap"])
        return pairs

    def find_gaps(self, min_degree: int = 3) -> list:
        """Gap analysis: well-connected same-type node pairs with no direct edge.

        These are high-value research opportunities. Returns at most the
        top 20 gaps, sorted by ``information_gain`` (sum of the pair's
        degrees normalized by the maximum degree), descending.
        """
        conn = get_db(self.db_path)
        try:
            # Degree filter lives in a subquery: SQLite rejects a bare
            # `HAVING` with no GROUP BY, so the alias must be filtered
            # with WHERE one level up.
            nodes = conn.execute("""
                SELECT node_id, label, node_type, degree FROM (
                    SELECT n.node_id, n.label, n.node_type,
                           (SELECT COUNT(*) FROM graph_edges
                            WHERE source_node = n.node_id OR target_node = n.node_id) AS degree
                    FROM graph_nodes n
                )
                WHERE degree >= ?
                ORDER BY degree DESC
            """, (min_degree,)).fetchall()

            # Fetch all edges once instead of one query per candidate pair
            # (the pair loop is O(n^2) — per-pair queries do not scale).
            edge_rows = conn.execute(
                "SELECT source_node, target_node FROM graph_edges"
            ).fetchall()
        finally:
            conn.close()

        # Direction-insensitive membership test for "an edge exists".
        connected = {frozenset((e[0], e[1])) for e in edge_rows}

        node_list = [dict(n) for n in nodes]
        # Hoisted loop invariant; guarded against an empty node list.
        max_degree = max((n["degree"] for n in node_list), default=0)
        denom = max(max_degree, 1)

        gaps = []
        for i, a in enumerate(node_list):
            for b in node_list[i + 1:]:
                # Only compare like with like (same node type).
                if a["node_type"] != b["node_type"]:
                    continue

                if frozenset((a["node_id"], b["node_id"])) not in connected:
                    info_gain = (a["degree"] + b["degree"]) / denom
                    gaps.append({
                        "entity_a": a["label"],
                        "entity_b": b["label"],
                        "a_degree": a["degree"],
                        "b_degree": b["degree"],
                        "information_gain": round(info_gain, 3),
                    })

        gaps.sort(key=lambda g: -g["information_gain"])
        return gaps[:20]  # Top 20 gaps

    def get_stats(self) -> dict:
        """Return node/edge counts plus an edge-type distribution dict."""
        conn = get_db(self.db_path)
        try:
            stats = {
                "total_nodes": conn.execute("SELECT COUNT(*) FROM graph_nodes").fetchone()[0],
                "total_edges": conn.execute("SELECT COUNT(*) FROM graph_edges").fetchone()[0],
                "observed_edges": conn.execute("SELECT COUNT(*) FROM graph_edges WHERE is_inferred = 0").fetchone()[0],
                "inferred_edges": conn.execute("SELECT COUNT(*) FROM graph_edges WHERE is_inferred = 1").fetchone()[0],
            }

            # Edge type distribution. Positional indexing works for both
            # sqlite3.Row and plain tuples; no dict() round-trip needed.
            types = conn.execute(
                "SELECT edge_type, COUNT(*) AS cnt FROM graph_edges GROUP BY edge_type"
            ).fetchall()
            stats["edge_types"] = {t[0]: t[1] for t in types}
        finally:
            conn.close()
        return stats