v2.0: phd_research_os_v2/app.py

Browse files

Files changed (1) hide show

phd_research_os_v2/app.py +630 -0

phd_research_os_v2/app.py ADDED Viewed

	@@ -0,0 +1,630 @@

+"""
+PhD Research OS v2.0 — Local Application
+==========================================
+A guided application that walks the user through all phases
+of setting up and using the Research OS.
+Launch: python -m phd_research_os_v2.app
+"""
+import os
+import sys
+import json
+import gradio as gr
+# Ensure package is importable
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from phd_research_os_v2.core.database import (
+    init_db, get_db, get_stats, get_state, set_state,
+    from_fixed, to_fixed, now_iso, gen_id, DB_PATH
+)
+from phd_research_os_v2.layer0.parser import StructuralParser
+from phd_research_os_v2.layer2.extractor import QualifiedExtractor
+from phd_research_os_v2.layer4.graph import KnowledgeGraph
+from phd_research_os_v2.layer5.scorer import CalibratedScorer
+# Initialize: these statements run at import time, before the UI is built.
+# The directory names are relative, so they are created under the current
+# working directory of the launching process.
+os.makedirs("data", exist_ok=True)
+os.makedirs("inbox", exist_ok=True)
+os.makedirs("vault", exist_ok=True)
+# NOTE(review): presumably creates the schema at DB_PATH when it is missing —
+# confirm against core.database.init_db.
+init_db(DB_PATH)
+# ============================================================
+# Phase Status Logic
+# ============================================================
+def get_phase_status():
+    """Get completion status for each phase."""
+    stats = get_stats(DB_PATH)
+    phase = int(get_state(DB_PATH, "setup_phase") or "0")
+    return {
+        "current_phase": phase,
+        "phase_0": {"name": "Foundation", "done": True, "desc": "Database initialized"},
+        "phase_1": {
+            "name": "Paper Ingestion",
+            "done": stats.get("documents", 0) > 0,
+            "desc": f"{stats.get('documents', 0)} documents ingested, {stats.get('regions', 0)} regions parsed"
+        },
+        "phase_2": {
+            "name": "Claim Extraction",
+            "done": stats.get("claims", 0) > 0,
+            "desc": f"{stats.get('claims', 0)} claims extracted"
+        },
+        "phase_3": {
+            "name": "Knowledge Graph",
+            "done": stats.get("graph_nodes", 0) > 0,
+            "desc": f"{stats.get('graph_nodes', 0)} nodes, {stats.get('graph_edges', 0)} edges"
+        },
+        "phase_4": {
+            "name": "Conflict Detection",
+            "done": stats.get("conflicts", 0) > 0,
+            "desc": f"{stats.get('conflicts', 0)} conflicts detected"
+        },
+        "phase_5": {
+            "name": "Calibrated Scoring",
+            "done": False,
+            "desc": "Score claims with code-computed confidence"
+        },
+        "phase_6": {
+            "name": "Research Goals",
+            "done": stats.get("goals", 0) > 0,
+            "desc": f"{stats.get('goals', 0)} active goals"
+        },
+    }
+def render_phase_overview():
+    """Render markdown overview of all phases."""
+    status = get_phase_status()
+    lines = ["# 🧬 PhD Research OS v2.0\n"]
+    lines.append("## System Status\n")
+    for i in range(7):
+        p = status[f"phase_{i}"]
+        icon = "✅" if p["done"] else "⬜"
+        lines.append(f"{icon} **Phase {i}: {p['name']}** — {p['desc']}")
+    stats = get_stats(DB_PATH)
+    lines.append(f"\n---\n### Database Summary")
+    lines.append(f"| Table | Count |")
+    lines.append(f"|-------|-------|")
+    for table, count in stats.items():
+        lines.append(f"| {table} | {count} |")
+    return "\n".join(lines)
+# ============================================================
+# Phase 1: Paper Ingestion
+# ============================================================
+def ingest_paper(file_obj, doc_type, title, doi):
+    """Ingest a paper through Layer 0."""
+    if file_obj is None:
+        return "❌ Please upload a file", "", render_phase_overview()
+    parser = StructuralParser(DB_PATH)
+    result = parser.ingest_document(
+        file_obj.name if hasattr(file_obj, 'name') else str(file_obj),
+        doc_type=doc_type or "main",
+        title=title or None,
+        doi=doi or None,
+    )
+    if result.get("error"):
+        return f"❌ {result['error']}", "", render_phase_overview()
+    # Format result
+    summary = f"""✅ **Document ingested successfully!**
+| Metric | Value |
+|--------|-------|
+| Document ID | `{result['doc_id']}` |
+| Parse Method | {result['parse_method']} |
+| Total Regions | {result['total_regions']} |
+| Average Quality | {result['avg_quality']:.2f} |
+| Sections Found | {', '.join(result.get('sections_found', [])) or 'None detected'} |
+**Regions by type:**
+"""
+    for rtype, count in result.get("regions_by_type", {}).items():
+        summary += f"- {rtype}: {count}\n"
+    # Show first few regions as preview
+    preview_rows = []
+    parser2 = StructuralParser(DB_PATH)
+    regions = parser2.get_extractable_regions(result["doc_id"])
+    for r in regions[:10]:
+        preview_rows.append([
+            r["region_id"][:12], r["region_type"], r.get("section", "—"),
+            r["content_text"][:100] + "..." if len(r.get("content_text", "")) > 100 else r.get("content_text", ""),
+            f"{from_fixed(r['parse_confidence']):.2f}",
+        ])
+    return summary, preview_rows, render_phase_overview()
+# ============================================================
+# Phase 2: Claim Extraction
+# ============================================================
+def extract_claims_from_doc(doc_id):
+    """Extract claims from a specific document."""
+    if not doc_id or not doc_id.strip():
+        return "❌ Please enter a document ID", []
+    extractor = QualifiedExtractor(DB_PATH)
+    result = extractor.extract_from_document(doc_id.strip())
+    summary = f"""✅ **Claims extracted!**
+| Metric | Value |
+|--------|-------|
+| Total Claims | {result['total_claims']} |
+| Null Results | {result['null_results']} |
+| Incomplete | {result['incomplete']} |
+| Avg Confidence | {from_fixed(result['avg_confidence']):.3f} |
+**By Section:** {json.dumps(result.get('section_distribution', {}), indent=2)}
+**By Epistemic Tag:** {json.dumps(result.get('epistemic_distribution', {}), indent=2)}
+"""
+    # Get claims for display
+    conn = get_db(DB_PATH)
+    rows = conn.execute("""
+        SELECT claim_id, text, epistemic_tag, composite_confidence,
+               source_section, status, is_null_result
+        FROM claims WHERE source_doc_id = ?
+        ORDER BY composite_confidence DESC
+    """, (doc_id.strip(),)).fetchall()
+    conn.close()
+    table_rows = []
+    for r in rows:
+        d = dict(r)
+        table_rows.append([
+            d["claim_id"][:12],
+            d["text"][:120] + ("..." if len(d.get("text","")) > 120 else ""),
+            d["epistemic_tag"],
+            f"{from_fixed(d['composite_confidence']):.3f}",
+            d.get("source_section", "—"),
+            "🔴 NULL" if d.get("is_null_result") else d.get("status", ""),
+        ])
+    return summary, table_rows
+def extract_from_text(text_input, section):
+    """Extract claims from raw text input."""
+    if not text_input or len(text_input.strip()) < 50:
+        return "❌ Please enter at least 50 characters of scientific text", []
+    extractor = QualifiedExtractor(DB_PATH)
+    chunk = {
+        "text": text_input,
+        "section": section or "unknown",
+        "page": 0,
+        "min_confidence": 900,
+        "doc_id": None,
+        "region_ids": [],
+    }
+    claims = extractor.extract_from_chunk(chunk)
+    table_rows = []
+    for c in claims:
+        table_rows.append([
+            c["claim_id"][:12],
+            c["text"][:120],
+            c["epistemic_tag"],
+            f"{from_fixed(c['composite_confidence']):.3f}",
+            ", ".join(c.get("qualifiers", [])) or "—",
+            "🔴 NULL" if c.get("is_null_result") else c.get("status", ""),
+        ])
+    return f"✅ Extracted {len(claims)} claims from text", table_rows
+# ============================================================
+# Phase 3: Knowledge Graph
+# ============================================================
+def build_graph():
+    """Build knowledge graph from all claims."""
+    conn = get_db(DB_PATH)
+    claims = conn.execute("SELECT * FROM claims LIMIT 500").fetchall()
+    conn.close()
+    graph = KnowledgeGraph(DB_PATH)
+    # Add all claims as nodes
+    for row in claims:
+        c = dict(row)
+        graph.add_claim_node(c["claim_id"], c["text"], {
+            "tag": c["epistemic_tag"],
+            "confidence": c.get("composite_confidence", 0),
+            "section": c.get("source_section"),
+        })
+    stats = graph.get_stats()
+    return f"""✅ **Knowledge graph built!**
+| Metric | Value |
+|--------|-------|
+| Total Nodes | {stats['total_nodes']} |
+| Total Edges | {stats['total_edges']} |
+| Observed Edges | {stats['observed_edges']} |
+| Inferred Edges | {stats['inferred_edges']} |
+"""
+def find_gaps():
+    """Run gap analysis on the knowledge graph."""
+    graph = KnowledgeGraph(DB_PATH)
+    gaps = graph.find_gaps()
+    if not gaps:
+        return "No gaps found (need more nodes with edges to detect structural holes)"
+    lines = ["## 🔍 Research Gaps Detected\n"]
+    for g in gaps[:10]:
+        lines.append(f"- **{g['entity_a']}** ↔ **{g['entity_b']}** "
+                     f"(info gain: {g['information_gain']:.3f}, "
+                     f"degrees: {g['a_degree']}/{g['b_degree']})")
+    return "\n".join(lines)
+# ============================================================
+# Phase 4: Conflict Detection
+# ============================================================
+def detect_conflicts():
+    """Run conflict detection."""
+    graph = KnowledgeGraph(DB_PATH)
+    pairs = graph.find_conflicts(min_similarity=0.3, limit=20)
+    if not pairs:
+        return "No potential conflicts found", []
+    conn = get_db(DB_PATH)
+    table_rows = []
+    for p in pairs:
+        # Store conflict
+        conflict_id = gen_id("CONF")
+        conn.execute("""
+            INSERT OR IGNORE INTO conflicts (conflict_id, claim_a_id, claim_b_id,
+                conflict_type, hypothesis_confidence, comparability_confidence,
+                schema_version, created_at)
+            VALUES (?, ?, ?, 'value_mismatch', 'low', ?, '2.0', ?)
+        """, (conflict_id, p["claim_a"]["claim_id"], p["claim_b"]["claim_id"],
+              to_fixed(p["overlap"]), now_iso()))
+        table_rows.append([
+            conflict_id[:12],
+            p["claim_a"]["text"][:80],
+            p["claim_b"]["text"][:80],
+            f"{p['overlap']:.2f}",
+            "Unresolved",
+        ])
+    conn.commit()
+    conn.close()
+    return f"✅ Found {len(pairs)} potential conflicts", table_rows
+# ============================================================
+# Phase 5: Scoring
+# ============================================================
+def rescore_all():
+    """Rescore all claims with code-computed confidence."""
+    scorer = CalibratedScorer(DB_PATH)
+    count = scorer.rescore_all_claims()
+    return f"✅ Rescored {count} claims with code-computed confidence (3-score system)"
+# ============================================================
+# Phase 6: Goals & Decisions
+# ============================================================
+def create_goal(description, priority):
+    if not description:
+        return "❌ Please enter a goal description"
+    conn = get_db(DB_PATH)
+    goal_id = gen_id("GOAL")
+    conn.execute("""
+        INSERT INTO goals (goal_id, description, priority, status, schema_version, created_at, updated_at)
+        VALUES (?, ?, ?, 'Active', '2.0', ?, ?)
+    """, (goal_id, description, priority or "medium", now_iso(), now_iso()))
+    conn.commit()
+    conn.close()
+    return f"✅ Goal created: `{goal_id}`"
+# ============================================================
+# Claim Browser
+# ============================================================
+def browse_claims(tag_filter, min_conf, section_filter):
+    """Browse claims with filters."""
+    conn = get_db(DB_PATH)
+    conditions = []
+    params = []
+    if tag_filter and tag_filter != "All":
+        conditions.append("epistemic_tag = ?")
+        params.append(tag_filter)
+    if min_conf and min_conf > 0:
+        conditions.append("composite_confidence >= ?")
+        params.append(to_fixed(min_conf))
+    if section_filter and section_filter != "All":
+        conditions.append("source_section = ?")
+        params.append(section_filter)
+    where = " AND ".join(conditions) if conditions else "1=1"
+    rows = conn.execute(f"""
+        SELECT claim_id, text, epistemic_tag, composite_confidence,
+               evidence_quality, truth_likelihood, qualifier_strength_score,
+               source_section, status, is_null_result, qualifiers
+        FROM claims WHERE {where}
+        ORDER BY composite_confidence DESC LIMIT 100
+    """, params).fetchall()
+    conn.close()
+    table_rows = []
+    for r in rows:
+        d = dict(r)
+        quals = json.loads(d.get("qualifiers", "[]")) if isinstance(d.get("qualifiers"), str) else d.get("qualifiers", [])
+        table_rows.append([
+            d["claim_id"][:12],
+            d["text"][:150],
+            d["epistemic_tag"],
+            f"{from_fixed(d.get('composite_confidence', 0)):.3f}",
+            f"{from_fixed(d.get('evidence_quality', 0)):.3f}" if d.get('evidence_quality') else "—",
+            f"{from_fixed(d.get('truth_likelihood', 0)):.3f}" if d.get('truth_likelihood') else "—",
+            d.get("source_section", "—"),
+            "🔴" if d.get("is_null_result") else "—",
+        ])
+    return table_rows
+# ============================================================
+# THE GUIDED UI
+# ============================================================
+# Shared theme for the whole app: the Soft base theme with blue/slate hues.
+THEME = gr.themes.Soft(
+    primary_hue="blue",
+    secondary_hue="slate",
+    neutral_hue="slate",
+).set(
+    # Each fill is given the same value for the regular and the *_dark
+    # variant, so the page, blocks and inputs use the same dark neutral
+    # shades in both modes.
+    body_background_fill="*neutral_950",
+    body_background_fill_dark="*neutral_950",
+    block_background_fill="*neutral_900",
+    block_background_fill_dark="*neutral_900",
+    input_background_fill="*neutral_800",
+    input_background_fill_dark="*neutral_800",
+)
+# Build the Gradio layout: one tab per phase plus a claim browser and a
+# settings tab. Each event handler is wired right after the widgets it uses.
+with gr.Blocks(theme=THEME, title="PhD Research OS v2.0") as app:
+    # ── Header ──
+    # Rendered once when the layout is built. Among the handlers below only
+    # ingest_paper lists `overview` as an output, so the other phases do not
+    # refresh this header.
+    overview = gr.Markdown(value=render_phase_overview())
+    with gr.Tabs() as tabs:
+        # ══════════════════════════════════════════════════════
+        # TAB 1: PHASE 1 — Paper Ingestion (Layer 0)
+        # ══════════════════════════════════════════════════════
+        with gr.Tab("📄 Phase 1: Ingest Papers"):
+            gr.Markdown("""### Layer 0: Structural PDF Ingestion
+Upload a PDF, and the system will parse it into section-aware regions with quality scores.
+Each region gets: section tag, bounding box, parse confidence, cross-references.""")
+            with gr.Row():
+                file_input = gr.File(label="Upload PDF or text file", file_types=[".pdf", ".txt", ".md", ".csv"])
+                with gr.Column():
+                    doc_type = gr.Dropdown(["main", "supplement", "dataset"], value="main", label="Document Type")
+                    title_input = gr.Textbox(label="Title (optional)")
+                    doi_input = gr.Textbox(label="DOI (optional)", placeholder="10.1234/example")
+            ingest_btn = gr.Button("📥 Ingest Document", variant="primary")
+            ingest_status = gr.Markdown()
+            region_preview = gr.Dataframe(
+                headers=["Region ID", "Type", "Section", "Content Preview", "Quality"],
+                label="Parsed Regions",
+            )
+            # Outputs map positionally onto ingest_paper's 3-tuple return.
+            ingest_btn.click(
+                ingest_paper,
+                inputs=[file_input, doc_type, title_input, doi_input],
+                outputs=[ingest_status, region_preview, overview],
+            )
+        # ══════════════════════════════════════════════════════
+        # TAB 2: PHASE 2 — Claim Extraction (Layer 2)
+        # ══════════════════════════════════════════════════════
+        with gr.Tab("🔬 Phase 2: Extract Claims"):
+            gr.Markdown("""### Layer 2: Qualified Claim Extraction
+Extract epistemic-tagged claims from ingested documents or raw text.
+Claims are tagged with qualifiers, null results, and section-aware confidence.""")
+            # Nested tabs: extraction from a stored document vs. pasted text.
+            with gr.Tabs():
+                with gr.Tab("From Document"):
+                    doc_id_input = gr.Textbox(label="Document ID", placeholder="DOC_XXXXXXXX")
+                    extract_doc_btn = gr.Button("🔬 Extract Claims", variant="primary")
+                    extract_doc_status = gr.Markdown()
+                    extract_doc_table = gr.Dataframe(
+                        headers=["Claim ID", "Text", "Tag", "Confidence", "Section", "Status"],
+                    )
+                    extract_doc_btn.click(
+                        extract_claims_from_doc,
+                        inputs=[doc_id_input],
+                        outputs=[extract_doc_status, extract_doc_table],
+                    )
+                with gr.Tab("From Text"):
+                    text_input = gr.Textbox(label="Scientific Text", lines=8,
+                        placeholder="Paste scientific text here...")
+                    section_input = gr.Dropdown(
+                        ["abstract", "introduction", "methods", "results", "discussion", "conclusion", "unknown"],
+                        value="results", label="Section"
+                    )
+                    extract_text_btn = gr.Button("🔬 Extract Claims", variant="primary")
+                    extract_text_status = gr.Markdown()
+                    extract_text_table = gr.Dataframe(
+                        headers=["Claim ID", "Text", "Tag", "Confidence", "Qualifiers", "Status"],
+                    )
+                    extract_text_btn.click(
+                        extract_from_text,
+                        inputs=[text_input, section_input],
+                        outputs=[extract_text_status, extract_text_table],
+                    )
+        # ══════════════════════════════════════════════════════
+        # TAB 3: PHASE 3 — Knowledge Graph (Layer 4)
+        # ══════════════════════════════════════════════════════
+        with gr.Tab("🕸️ Phase 3: Knowledge Graph"):
+            gr.Markdown("""### Layer 4: Knowledge Graph + Gap Analysis
+Build a graph from extracted claims. Detect structural holes where evidence is missing.""")
+            with gr.Row():
+                build_btn = gr.Button("🕸️ Build Graph from Claims", variant="primary")
+                gap_btn = gr.Button("🔍 Find Research Gaps", variant="secondary")
+            graph_status = gr.Markdown()
+            gap_results = gr.Markdown()
+            # Neither handler takes inputs; each writes to its own Markdown area.
+            build_btn.click(build_graph, outputs=graph_status)
+            gap_btn.click(find_gaps, outputs=gap_results)
+        # ══════════════════════════════════════════════════════
+        # TAB 4: PHASE 4 — Conflict Detection
+        # ══════════════════════════════════════════════════════
+        with gr.Tab("⚔️ Phase 4: Conflicts"):
+            gr.Markdown("""### Conflict Detection & Resolution
+Find contradictions between claims from different sources.
+All conflict hypotheses are tagged confidence="low" — human review required.""")
+            detect_btn = gr.Button("⚔️ Detect Conflicts", variant="primary")
+            conflict_status = gr.Markdown()
+            conflict_table = gr.Dataframe(
+                headers=["Conflict ID", "Claim A", "Claim B", "Similarity", "Status"],
+            )
+            detect_btn.click(detect_conflicts, outputs=[conflict_status, conflict_table])
+        # ══════════════════════════════════════════════════════
+        # TAB 5: PHASE 5 — Calibrated Scoring (Layer 5)
+        # ══════════════════════════════════════════════════════
+        with gr.Tab("📊 Phase 5: Scoring"):
+            gr.Markdown("""### Layer 5: Code-Computed Calibrated Scoring
+Rescore all claims using the 3-score system:
+- **Evidence Quality**: evidence × study_quality × journal_tier × completeness × section
+- **Truth Likelihood**: evidence_quality + corroboration - conflict_penalty
+- **Qualifier Strength**: 1.0 - qualifier_count×0.1 - null_penalty
+*The LLM provides components. The CODE computes final scores.*""")
+            rescore_btn = gr.Button("📊 Rescore All Claims", variant="primary")
+            rescore_status = gr.Markdown()
+            rescore_btn.click(rescore_all, outputs=rescore_status)
+        # ══════════════════════════════════════════════════════
+        # TAB 6: PHASE 6 — Research Goals & Decisions
+        # ══════════════════════════════════════════════════════
+        with gr.Tab("🎯 Phase 6: Goals"):
+            gr.Markdown("### Set Research Goals\nDefine what you're trying to achieve. The system will link claims and gaps to your goals.")
+            with gr.Row():
+                goal_desc = gr.Textbox(label="Goal Description", placeholder="Achieve sub-fM LOD for cardiac troponin...")
+                goal_priority = gr.Dropdown(["high", "medium", "low"], value="high", label="Priority")
+            goal_btn = gr.Button("🎯 Create Goal", variant="primary")
+            goal_status = gr.Markdown()
+            goal_btn.click(create_goal, inputs=[goal_desc, goal_priority], outputs=goal_status)
+        # ══════════════════════════════════════════════════════
+        # TAB 7: Claim Browser
+        # ══════════════════════════════════════════════════════
+        with gr.Tab("📋 Browse Claims"):
+            gr.Markdown("### Claim Browser\nFilter and explore all extracted claims.")
+            with gr.Row():
+                # "All" is the sentinel browse_claims treats as "no filter".
+                tag_filter = gr.Dropdown(
+                    ["All", "Fact", "Interpretation", "Hypothesis", "Conflict_Hypothesis"],
+                    value="All", label="Epistemic Tag"
+                )
+                conf_slider = gr.Slider(0, 1, value=0, step=0.05, label="Minimum Confidence")
+                section_filter = gr.Dropdown(
+                    ["All", "abstract", "introduction", "methods", "results", "discussion", "conclusion"],
+                    value="All", label="Section"
+                )
+            browse_btn = gr.Button("🔍 Search", variant="primary")
+            claims_table = gr.Dataframe(
+                headers=["ID", "Text", "Tag", "Composite", "Evidence Q", "Truth L", "Section", "Null?"],
+            )
+            browse_btn.click(
+                browse_claims,
+                inputs=[tag_filter, conf_slider, section_filter],
+                outputs=claims_table,
+            )
+        # ══════════════════════════════════════════════════════
+        # TAB 8: System Settings
+        # ══════════════════════════════════════════════════════
+        with gr.Tab("⚙️ Settings"):
+            # The text below is an f-string: DB_PATH is interpolated once,
+            # when the layout is built.
+            gr.Markdown(f"""### System Configuration
+| Setting | Value |
+|---------|-------|
+| Database | `{DB_PATH}` |
+| Schema Version | 2.0 |
+| Pipeline Version | 2.1.0 |
+### Local Model Setup
+To use AI-powered extraction (instead of heuristic), set up a local model:
+```bash
+# Option 1: Ollama (simplest)
+curl -fsSL https://ollama.com/install.sh | sh
+ollama pull qwen3:8b
+# Option 2: Set API key for cloud fallback
+export ANTHROPIC_API_KEY=sk-...
+# or
+export OPENAI_API_KEY=sk-...
+```
+### Upgrade Parser
+For best PDF parsing, install Marker:
+```bash
+pip install marker-pdf
+```
+### Layer Status
+""")
+            refresh_btn = gr.Button("🔄 Refresh Status")
+            status_display = gr.Markdown(value=render_phase_overview())
+            # Refresh re-renders this tab's copy of the overview only.
+            refresh_btn.click(render_phase_overview, outputs=status_display)
+# ============================================================
+# Launch
+# ============================================================
+# Start the web server only when the module is run as a script.
+if __name__ == "__main__":
+    app.launch(
+        # NOTE(review): "0.0.0.0" listens on every network interface, so the
+        # UI is reachable from other machines, not just this one — confirm
+        # that is intended for an app described as local.
+        server_name="0.0.0.0",
+        server_port=7860,
+        # No public share link is requested.
+        share=False,
+        show_error=True,
+    )