Spaces:

mekosotto
/

hackathon

Running

App Files Community

mekosotto committed 4 days ago

Commit

91dde0d

1 Parent(s): 8eff23e

feat(agents): retrieve_context corpus dispatch (reference vs clinical)

Browse files

Files changed (4) hide show

src/agents/schemas.py +9 -1
src/agents/tools.py +41 -9
src/api/routes.py +8 -1
tests/agents/test_tools_clinical_corpus.py +49 -0

src/agents/schemas.py CHANGED Viewed

@@ -6,7 +6,7 @@ names lowercase + snake_case so prompts and JSON outputs align.
 """
 from __future__ import annotations
-from typing import Any
 from pydantic import BaseModel, Field
@@ -38,6 +38,14 @@ class RetrieveContextInput(BaseModel):
     """Input for `retrieve_context` — natural-language query into the KB."""
     query: str = Field(..., min_length=2, description="Search query for the knowledge base")
     k: int = Field(4, ge=1, le=10, description="Number of chunks to return")
 # --- Pipeline tool outputs --------------------------------------------------

 """
 from __future__ import annotations
+from typing import Any, Literal
 from pydantic import BaseModel, Field
     """Input for `retrieve_context` — natural-language query into the KB."""
     query: str = Field(..., min_length=2, description="Search query for the knowledge base")
     k: int = Field(4, ge=1, le=10, description="Number of chunks to return")
+    corpus: Literal["reference", "clinical"] = Field(
+        "reference",
+        description=(
+            "Which corpus to query. 'reference' = curated FAISS index (default). "
+            "'clinical' = TF-IDF index over peer-reviewed Alzheimer's/Parkinson's "
+            "papers with Turkish+English query expansion."
+        ),
+    )
 # --- Pipeline tool outputs --------------------------------------------------

src/agents/tools.py CHANGED Viewed

@@ -157,11 +157,41 @@ def _make_mri_executor(processed_dir: Path) -> Callable[[MRIPipelineInput], MRIP
     return execute
-def _make_retrieve_executor(rag_index_dir: Path | None) -> Callable[[RetrieveContextInput], RetrieveContextOutput]:
-    """Closure: capture the index dir; lazy-load the retriever on first call."""
-    state: dict[str, Any] = {"retriever": None}
     def execute(inp: RetrieveContextInput) -> RetrieveContextOutput:
         if rag_index_dir is None or not (rag_index_dir / "index.bin").exists():
             return RetrieveContextOutput(query=inp.query, chunks=[])
         if state["retriever"] is None:
@@ -176,6 +206,7 @@ def _make_retrieve_executor(rag_index_dir: Path | None) -> Callable[[RetrieveCon
 def build_default_tools(
     rag_index_dir: Path | None,
     processed_dir: Path = Path("data/processed"),
 ) -> list[Tool]:
     """Return the 5 tools the orchestrator gets by default."""
     return [
@@ -217,15 +248,16 @@ def build_default_tools(
         Tool(
             name="retrieve_context",
             description=(
-                "Retrieve up to k passages from the curated reference knowledge "
-                "base. Use AFTER a pipeline tool returns, to ground your final "
-                "synthesis in cited literature. Formulate a focused query "
-                "based on the pipeline output (e.g., 'BBB permeability of "
-                "small lipophilic molecules' or 'ComBat site harmonization')."
             ),
             input_model=RetrieveContextInput,
             output_model=RetrieveContextOutput,
-            execute=_make_retrieve_executor(rag_index_dir),
         ),
         Tool(
             name="run_fusion",

     return execute
+def _make_retrieve_executor(
+    rag_index_dir: Path | None,
+    clinical_rag_index_path: Path | None = None,
+) -> Callable[[RetrieveContextInput], RetrieveContextOutput]:
+    """Closure: capture both index sources; lazy-load each on first use."""
+    state: dict[str, Any] = {"retriever": None, "clinical_payload": None}
     def execute(inp: RetrieveContextInput) -> RetrieveContextOutput:
+        if inp.corpus == "clinical":
+            if clinical_rag_index_path is None or not Path(clinical_rag_index_path).exists():
+                logger.warning(
+                    "retrieve_context corpus=clinical but no index path configured (path=%s)",
+                    clinical_rag_index_path,
+                )
+                return RetrieveContextOutput(query=inp.query, chunks=[])
+            if state["clinical_payload"] is None:
+                from src.rag.clinical.loader import load_index
+                state["clinical_payload"] = load_index(Path(clinical_rag_index_path))
+            from src.rag.clinical.retrieve import retrieve_clinical
+            result = retrieve_clinical(state["clinical_payload"], inp.query, top_k=inp.k)
+            return RetrieveContextOutput(
+                query=inp.query,
+                chunks=[
+                    {
+                        "source": ev.source,
+                        "page_start": ev.page_start,
+                        "page_end": ev.page_end,
+                        "text": ev.sentence,
+                        "score": ev.score,
+                    }
+                    for ev in result.evidence
+                ],
+            )
+        # corpus == "reference" — existing FAISS path.
         if rag_index_dir is None or not (rag_index_dir / "index.bin").exists():
             return RetrieveContextOutput(query=inp.query, chunks=[])
         if state["retriever"] is None:
 def build_default_tools(
     rag_index_dir: Path | None,
     processed_dir: Path = Path("data/processed"),
+    clinical_rag_index_path: Path | None = None,
 ) -> list[Tool]:
     """Return the 5 tools the orchestrator gets by default."""
     return [
         Tool(
             name="retrieve_context",
             description=(
+                "Retrieve up to k passages from a knowledge base. corpus='clinical' "
+                "queries the peer-reviewed Alzheimer's/Parkinson's papers (TF-IDF, "
+                "supports Turkish keywords like 'egzersiz', 'beslenme', 'unutkanlik'); "
+                "default corpus='reference' queries the curated FAISS index. Use "
+                "AFTER a pipeline tool returns, to ground your final synthesis in "
+                "cited literature."
             ),
             input_model=RetrieveContextInput,
             output_model=RetrieveContextOutput,
+            execute=_make_retrieve_executor(rag_index_dir, clinical_rag_index_path),
         ),
         Tool(
             name="run_fusion",

src/api/routes.py CHANGED Viewed

@@ -616,7 +616,14 @@ def _build_orchestrator():
         timeout=30.0,
     )
     rag_dir = _DEFAULT_RAG_INDEX_DIR if _DEFAULT_RAG_INDEX_DIR.exists() else None
-    tools = build_default_tools(rag_index_dir=rag_dir)
     model = os.environ.get(_AGENT_MODEL_ENV, _AGENT_DEFAULT_MODEL)
     return Orchestrator(
         llm_client=client,

         timeout=30.0,
     )
     rag_dir = _DEFAULT_RAG_INDEX_DIR if _DEFAULT_RAG_INDEX_DIR.exists() else None
+    clinical_idx = Path(os.environ.get(
+        "CLINICAL_RAG_INDEX_PATH",
+        "data/external_rag/index/rag_index.pkl",
+    ))
+    tools = build_default_tools(
+        rag_index_dir=rag_dir,
+        clinical_rag_index_path=clinical_idx if clinical_idx.exists() else None,
+    )
     model = os.environ.get(_AGENT_MODEL_ENV, _AGENT_DEFAULT_MODEL)
     return Orchestrator(
         llm_client=client,

tests/agents/test_tools_clinical_corpus.py ADDED Viewed

	@@ -0,0 +1,49 @@

+"""Tests: retrieve_context tool dispatches by `corpus`."""
+from __future__ import annotations
+from pathlib import Path
+from src.agents.tools import build_default_tools
+from tests.fixtures.build_tiny_clinical_index import build as build_tiny
+class TestClinicalCorpus:
+    """Exercise the `corpus` dispatch of the `retrieve_context` tool."""
+    def test_default_corpus_is_reference(self, tmp_path: Path) -> None:
+        """Omitting `corpus` yields no chunks when no reference index dir is given."""
+        # A clinical index is supplied even though the request does not ask for
+        # it, so an empty result cannot be explained by a missing clinical index.
+        clinical_idx = build_tiny(tmp_path / "tiny.pkl")
+        tools = {t.name: t for t in build_default_tools(
+            rag_index_dir=None,
+            clinical_rag_index_path=clinical_idx,
+        )}
+        tool = tools["retrieve_context"]
+        # Only `query` is passed; `corpus` is left to the input model's default.
+        out = tool.execute(tool.input_model.model_validate({"query": "test query"}))
+        assert hasattr(out, "chunks")
+        # rag_index_dir=None means reference returns empty.
+        assert out.chunks == []
+    def test_clinical_corpus_returns_evidence(self, tmp_path: Path) -> None:
+        """`corpus="clinical"` with a built index returns at least one chunk."""
+        clinical_idx = build_tiny(tmp_path / "tiny.pkl")
+        tools = {t.name: t for t in build_default_tools(
+            rag_index_dir=None,
+            clinical_rag_index_path=clinical_idx,
+        )}
+        tool = tools["retrieve_context"]
+        out = tool.execute(tool.input_model.model_validate({
+            "query": "exercise and Alzheimer",
+            "corpus": "clinical",
+        }))
+        assert len(out.chunks) > 0
+        # Every returned chunk must expose at least the "source" and "text" keys.
+        for c in out.chunks:
+            assert "source" in c and "text" in c
+    def test_clinical_corpus_without_index_returns_empty(self, tmp_path: Path) -> None:
+        """`corpus="clinical"` with no index path configured returns no chunks."""
+        # NOTE(review): `tmp_path` is requested but not used in this test.
+        # No clinical index path configured.
+        tools = {t.name: t for t in build_default_tools(
+            rag_index_dir=None,
+            clinical_rag_index_path=None,
+        )}
+        tool = tools["retrieve_context"]
+        out = tool.execute(tool.input_model.model_validate({
+            "query": "egzersiz Alzheimer",
+            "corpus": "clinical",
+        }))
+        assert out.chunks == []