nkshirsa committed
Commit 29a5047 · verified · 1 Parent(s): e7f848f

v2.0: phd_research_os_v2/layer2/extractor.py

phd_research_os_v2/layer2/extractor.py ADDED
@@ -0,0 +1,336 @@
"""
Layer 2: Qualified Extraction with AI Council
================================================
Extracts claims from parsed regions using the parallel-then-merge council.
Applies section-aware confidence modifiers.
All output constrained to valid schema.
"""

import json
import os
import re
from typing import Optional

from ..core.database import (
    get_db, init_db, gen_id, now_iso, to_fixed, from_fixed
)

# Section confidence modifiers (fixed-point ×1000)
SECTION_MODIFIERS = {
    "abstract": 700,
    "introduction": 800,
    "related_work": 800,
    "methods": 1000,  # Methods are protocol, not claims; if claims are extracted, full weight
    "results": 1000,
    "results_discussion": 900,
    "discussion": 750,
    "conclusion": 800,
    "supplement": 1000,
    "unknown": 850,
    None: 850,
}
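
# Worked example of the fixed-point convention (illustrative; assumes
# to_fixed() maps 0.0-1.0 floats onto this ×1000 scale, e.g.
# to_fixed(0.7) == 700): a "Fact" extracted from an abstract is scaled by
# 700, so even a fully confident claim (1000) enters later layers at 0.70.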

VALID_TAGS = ["Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"]

EXTRACTOR_PROMPT = """You are a scientific claim extractor for a PhD Research OS.

Extract precise, atomic claims from the text. For EACH claim provide:
- text: The exact claim statement (preserve qualifiers like "may", "suggests", "not significant")
- epistemic_tag: One of [Fact, Interpretation, Hypothesis, Conflict_Hypothesis]
  * Fact: Directly supported by quantitative data in THIS paper
  * Interpretation: Author's explanation that goes beyond the raw data
  * Hypothesis: Untested proposal using "may", "could", "we propose"
  * Conflict_Hypothesis: Explicitly contradicts another established finding
- confidence: Float 0.0-1.0 (how strong the evidence is FOR this specific claim)
- missing_fields: What would make this claim more complete (empty list if complete)
- status: "Complete" or "Incomplete" (Incomplete if missing_fields is non-empty)
- qualifiers: List of hedging words or conditions (e.g., ["in 10 mM PBS", "n=5", "not statistically significant"])
- is_null_result: true if the claim reports a negative/null finding
- source_quote: The EXACT sentence from the text that supports this claim
- causal_direction: "causal_claim", "observed_correlation", or "unspecified"

CRITICAL RULES:
1. PRESERVE all qualifiers: "may", "suggests", "under these conditions", "not significant"
2. If a result is NOT statistically significant, mark is_null_result=true
3. If the text says "X causes Y", mark causal_direction as "causal_claim"
4. If the text says "X is associated with Y", mark causal_direction as "observed_correlation"

Output MUST be a valid JSON array. No markdown, no explanations."""
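
# Shape of one element of a compliant reply (hypothetical values, shown
# only to illustrate the schema the prompt above requests):
#   {"text": "Coating A may reduce the corrosion rate in 10 mM PBS",
#    "epistemic_tag": "Hypothesis", "confidence": 0.4,
#    "missing_fields": ["sample size"], "status": "Incomplete",
#    "qualifiers": ["may", "in 10 mM PBS"], "is_null_result": false,
#    "causal_direction": "causal_claim",
#    "source_quote": "...coating A may reduce the corrosion rate..."}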

CRITIC_PROMPT = """You are a critical reviewer for a PhD Research OS.

Review extracted claims against the original text. Check for:
1. Missing important claims the extractor overlooked
2. Incorrect epistemic tags (e.g., Interpretation tagged as Fact)
3. Overly confident claims that should be Incomplete
4. Dropped qualifiers (hedging words removed from claim text)
5. Null results not flagged as is_null_result=true
6. Causal claims made from correlational data

Output JSON: {
  "feedback": "overall critique",
  "missing_claims": ["claim text 1", ...],
  "tag_corrections": {"0": "Interpretation", ...},
  "confidence_adjustments": {"0": 0.5, ...},
  "qualifier_additions": {"0": ["qualifier1"], ...},
  "null_result_flags": [0, 2]
}"""

CHAIRMAN_PROMPT = """You are the chairman of a scientific claim extraction council.

You receive: original text, extracted claims, and critic feedback.
Synthesize the final claims, applying these rules:
1. Apply the critic's tag corrections where justified
2. Apply the critic's confidence adjustments
3. Add any missing claims the critic identified
4. Apply a 0.7 completeness penalty to claims with significant missing fields
5. Ensure ALL qualifiers from the source text are preserved
6. Flag null results appropriately

Output MUST be a valid JSON array of claims. No markdown."""


class QualifiedExtractor:
    """
    Layer 2: Extract claims using the AI Model Council.

    Pipeline: Extractor → Critic → Chairman (sequential for now;
    upgrade to parallel-then-merge when multi-model serving is available)
    """

    def __init__(self, db_path: Optional[str] = None, brain=None):
        self.db_path = db_path or os.environ.get("RESEARCH_OS_DB", "data/research_os_v2.db")
        self.brain = brain  # ResearchOSBrain or compatible LLM interface

    def extract_from_chunk(self, chunk: dict, source_doi: Optional[str] = None) -> list:
        """
        Extract claims from a single section-aware chunk.
        Returns a list of claim dicts ready for DB insertion.
        """
        text = chunk.get("text", "")
        section = chunk.get("section", "unknown")
        page = chunk.get("page", 0)
        parse_confidence = chunk.get("min_confidence", 1000)

        if not text or len(text.strip()) < 50:
            return []

        # Run extraction (with or without a brain)
        if self.brain:
            raw_claims = self._extract_with_brain(text, section)
        else:
            raw_claims = self._extract_mock(text, section)

        # Post-process: apply section modifiers, validate, score
        claims = []
        section_mod = SECTION_MODIFIERS.get(section, 850)

        for raw in raw_claims:
            if not isinstance(raw, dict) or not raw.get("text"):
                continue

            # Validate and fix the epistemic tag
            tag = raw.get("epistemic_tag", "Interpretation")
            if tag not in VALID_TAGS:
                tag = "Interpretation"

            # Abstract claims are forced to Interpretation (Epistemic Separation Engine)
            if section == "abstract" and tag == "Fact":
                tag = "Interpretation"

            # Build confidence components
            evidence_strength = to_fixed(min(1.0, max(0.0, float(raw.get("confidence", 0.5)))))
            missing = raw.get("missing_fields", [])
            if not isinstance(missing, list):
                missing = []
            completeness = 700 if missing else 1000

            qualifiers = raw.get("qualifiers", [])
            if not isinstance(qualifiers, list):
                qualifiers = []
            qualifier_penalty = max(500, 1000 - len(qualifiers) * 100)
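            # e.g. two qualifiers give 1000 - 200 = 800; the max() floor keeps
            # heavily hedged claims at no less than half weight (500 = 0.5)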

            # Status
            is_null = bool(raw.get("is_null_result", False))
            status = "Complete" if not missing else "Incomplete"

            # Code-computed composite (Layer 5 will refine further)
            # For now: evidence × section_modifier × completeness × qualifier
            composite = (evidence_strength * section_mod // 1000
                         * completeness // 1000
                         * qualifier_penalty // 1000)

            # Parser confidence caps claim confidence
            composite = min(composite, parse_confidence)
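            # Worked example with illustrative numbers: evidence 0.8 (800) in a
            # "discussion" chunk (750), missing fields (700), two qualifiers (800):
            # 800*750//1000 = 600; 600*700//1000 = 420; 420*800//1000 = 336,
            # i.e. 0.336, then capped by the chunk's parser min_confidence.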

            claim = {
                "claim_id": gen_id("CLM"),
                "text": str(raw.get("text", "")),
                "epistemic_tag": tag,
                "evidence_strength": evidence_strength,
                "section_modifier": section_mod,
                "completeness_penalty": completeness,
                "qualifier_penalty": qualifier_penalty,
                "composite_confidence": composite,
                "status": status,
                "is_null_result": is_null,
                "is_inherited_citation": bool(raw.get("is_inherited_citation", False)),
                "causal_direction": raw.get("causal_direction", "unspecified"),
                "qualifiers": qualifiers,
                "missing_fields": missing,
                "source_quote": raw.get("source_quote", ""),
                "source_page": page,
                "source_section": section,
                "source_doc_id": chunk.get("doc_id"),
                "source_doi": source_doi,
                "source_region_id": (chunk.get("region_ids") or [None])[0],
                "extraction_timestamp": now_iso(),
            }
            claims.append(claim)

        return claims

    def extract_from_document(self, doc_id: str, source_doi: Optional[str] = None) -> dict:
        """
        Extract claims from all chunks of a document.
        Uses Layer 0's section-aware chunking.
        """
        from ..layer0.parser import StructuralParser
        parser = StructuralParser(self.db_path)
        chunks = parser.get_section_chunks(doc_id)

        all_claims = []
        section_stats = {}

        for chunk in chunks:
            claims = self.extract_from_chunk(chunk, source_doi)
            all_claims.extend(claims)

            section = chunk.get("section", "unknown")
            section_stats[section] = section_stats.get(section, 0) + len(claims)

        # Store claims in the database
        conn = get_db(self.db_path)
        for claim in all_claims:
            conn.execute("""
                INSERT INTO claims (claim_id, text, epistemic_tag,
                    evidence_strength, section_modifier, completeness_penalty,
                    qualifier_penalty, composite_confidence,
                    status, is_null_result, is_inherited_citation, causal_direction,
                    qualifiers, missing_fields, source_quote, source_page,
                    source_section, source_doc_id, source_doi, source_region_id,
                    extraction_timestamp, pipeline_version,
                    schema_version, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, '2.0', ?, ?)
            """, (
                claim["claim_id"], claim["text"], claim["epistemic_tag"],
                claim["evidence_strength"], claim["section_modifier"],
                claim["completeness_penalty"], claim["qualifier_penalty"],
                claim["composite_confidence"],
                claim["status"], claim["is_null_result"],
                claim["is_inherited_citation"], claim["causal_direction"],
                json.dumps(claim["qualifiers"]), json.dumps(claim["missing_fields"]),
                claim.get("source_quote"), claim.get("source_page"),
                claim.get("source_section"), claim.get("source_doc_id"),
                claim.get("source_doi"), claim.get("source_region_id"),
                claim.get("extraction_timestamp"), "2.1.0",
                now_iso(), now_iso()
            ))
        conn.commit()
        conn.close()

        return {
            "doc_id": doc_id,
            "total_claims": len(all_claims),
            "section_distribution": section_stats,
            "epistemic_distribution": self._count_tags(all_claims),
            "null_results": sum(1 for c in all_claims if c["is_null_result"]),
            "incomplete": sum(1 for c in all_claims if c["status"] == "Incomplete"),
            "avg_confidence": (sum(c["composite_confidence"] for c in all_claims)
                               // max(len(all_claims), 1)),
        }

    def _extract_with_brain(self, text: str, section: str) -> list:
        """Extract using the AI brain (local or API model)."""
        messages = [
            {"role": "system", "content": EXTRACTOR_PROMPT},
            {"role": "user", "content": f"Section: {section}\n\nText:\n{text}"}
        ]

        try:
            if hasattr(self.brain, '_generate_local') and self.brain.backend == "local":
                raw = self.brain._generate_local(messages)
            elif hasattr(self.brain, '_generate_api'):
                raw = self.brain._generate_api(messages)
            else:
                return self._extract_mock(text, section)

            # Parse JSON, stripping a markdown code fence if the model added one
            text_clean = raw.strip()
            if text_clean.startswith("```"):
                parts = text_clean.split("```")
                text_clean = parts[1] if len(parts) > 1 else text_clean
                if text_clean.startswith("json"):
                    text_clean = text_clean[4:]
            text_clean = text_clean.strip()
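            # e.g. a reply of "```json\n[{...}]\n```" reduces to "[{...}]"
            # before json.loads() runs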

            data = json.loads(text_clean)
            return data if isinstance(data, list) else [data]
        except Exception:
            return self._extract_mock(text, section)

    def _extract_mock(self, text: str, section: str) -> list:
        """Mock extraction when no brain is available. Produces structurally valid output."""
        # Extract sentences as potential claims
        sentences = [s.strip() for s in re.split(r'[.!?]\s+', text) if len(s.strip()) > 30]

        claims = []
        for sent in sentences[:5]:  # Max 5 claims per chunk
            # Simple heuristic classification
            lower = sent.lower()

            if any(w in lower for w in ["measured", "found", "detected", "achieved", "showed"]):
                tag = "Fact"
                confidence = 0.7
            elif any(w in lower for w in ["suggest", "indicate", "consistent with", "interpret"]):
                tag = "Interpretation"
                confidence = 0.5
            elif any(w in lower for w in ["may", "could", "hypothesize", "propose", "possible"]):
                tag = "Hypothesis"
                confidence = 0.3
            elif any(w in lower for w in ["contradict", "unlike", "contrary"]):
                tag = "Conflict_Hypothesis"
                confidence = 0.4
            else:
                tag = "Interpretation"
                confidence = 0.5

            # Detect qualifiers
            qualifiers = []
            for q in ["may", "might", "could", "suggests", "possibly", "not significant",
                      "under these conditions", "in vitro", "preliminary"]:
                if q in lower:
                    qualifiers.append(q)

            is_null = any(w in lower for w in ["not significant", "no effect", "no difference",
                                               "failed to", "did not"])

            claims.append({
                "text": sent + ".",
                "epistemic_tag": tag,
                "confidence": confidence,
                "missing_fields": [],
                "status": "Complete",
                "qualifiers": qualifiers,
                "is_null_result": is_null,
                "is_inherited_citation": "[" in sent and "]" in sent,
                "causal_direction": ("causal_claim" if "cause" in lower
                                     else "observed_correlation" if "correlat" in lower
                                     else "unspecified"),
                "source_quote": sent + ".",
            })

        return claims

    def _count_tags(self, claims: list) -> dict:
        counts = {}
        for c in claims:
            tag = c.get("epistemic_tag", "unknown")
            counts[tag] = counts.get(tag, 0) + 1
        return counts
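

# Minimal usage sketch (illustrative, not part of the pipeline proper).
# With no brain attached, extract_from_chunk() falls back to _extract_mock(),
# so this only needs the ..core.database helpers to resolve; run it from the
# package root, e.g. `python -m phd_research_os_v2.layer2.extractor`.
if __name__ == "__main__":
    demo_chunk = {
        "text": ("We measured a 40% increase in signal. This may indicate "
                 "improved binding, although the effect was not significant."),
        "section": "results",
        "page": 3,
        "min_confidence": 1000,
    }
    for c in QualifiedExtractor().extract_from_chunk(demo_chunk):
        print(c["epistemic_tag"], c["composite_confidence"], c["text"])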