mekosotto Claude Sonnet 4.6 committed on
Commit
460fcc2
·
1 Parent(s): 978f645

feat(agents): Tool dataclass + registry + 4 tool wrappers (3 pipelines + RAG)

Browse files
src/agents/__init__.py ADDED
File without changes
src/agents/schemas.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Pydantic input/output schemas for orchestrator tools and the agent result.
2
+
3
+ These schemas double as OpenAI function-calling parameter definitions
4
+ (via `model_json_schema()`) and as runtime validation gates. Keep field
5
+ names lowercase + snake_case so prompts and JSON outputs align.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from typing import Any
10
+
11
+ from pydantic import BaseModel, Field
12
+
13
+
14
+ # --- Pipeline tool inputs ---------------------------------------------------
15
+
16
class BBBPipelineInput(BaseModel):
    """Input for `run_bbb_pipeline` — a single SMILES string."""
    # The molecule to score; chemical validity is checked downstream, not here.
    smiles: str = Field(..., description="A single molecular SMILES string, e.g. 'CCO'")
    # Bounded to [1, 20] so the SHAP attribution list stays prompt-sized.
    top_k: int = Field(5, ge=1, le=20, description="Top-k SHAP attributions to return")
20
+
21
+
22
class EEGPipelineInput(BaseModel):
    """Input for `run_eeg_pipeline` — path to an EEG file (.fif or .edf)."""
    # Existence/readability of the path is validated by the pipeline itself.
    input_path: str = Field(..., description="Path to EEG recording file (.fif or .edf)")
    # Epoch window length in seconds, constrained to (0.1, 60].
    epoch_duration_s: float = Field(2.0, gt=0.1, le=60.0)
26
+
27
+
28
class MRIPipelineInput(BaseModel):
    """Input for `run_mri_pipeline` — directory of NIfTI files + sites CSV."""
    # Directory scanned for volumes by the pipeline; not validated here.
    input_dir: str = Field(..., description="Directory containing .nii.gz volumes")
    # Site labels drive ComBat harmonization — presumably one row per subject;
    # confirm the expected CSV schema against the pipeline implementation.
    sites_csv: str = Field(..., description="CSV mapping subject_id → site")
32
+
33
+
34
class RetrieveContextInput(BaseModel):
    """Input for `retrieve_context` — natural-language query into the KB."""
    # min_length=2 rejects empty/single-character queries at the validation gate.
    query: str = Field(..., min_length=2, description="Search query for the knowledge base")
    # Bounded to [1, 10] so retrieved context stays within prompt budget.
    k: int = Field(4, ge=1, le=10, description="Number of chunks to return")
38
+
39
+
40
+ # --- Pipeline tool outputs --------------------------------------------------
41
+
42
class BBBPipelineOutput(BaseModel):
    """Output of `run_bbb_pipeline` for one molecule."""
    # Echo of the input SMILES so the result is self-describing.
    smiles: str
    # Predicted class index; label_text is its human-readable form.
    label: int
    label_text: str
    # Model confidence — presumably a probability in [0, 1]; confirm upstream.
    confidence: float
    # Per-feature SHAP attributions, already dumped to plain dicts.
    top_features: list[dict[str, Any]]
    # Drift z-score when available; None when drift monitoring didn't run.
    drift_z: float | None = None
49
+
50
+
51
class EEGPipelineOutput(BaseModel):
    """Output of `run_eeg_pipeline` — where features landed + run stats."""
    # Echo of the input recording path.
    input_path: str
    # Path of the written feature table (parquet, per the tool wrapper).
    output_path: str
    # Shape of the feature table.
    rows: int
    columns: int
    # Wall-clock duration of the pipeline run, in seconds.
    duration_sec: float
57
+
58
+
59
class MRIPipelineOutput(BaseModel):
    """Output of `run_mri_pipeline` — where features landed + run stats."""
    # Echo of the input volume directory.
    input_dir: str
    # Path of the written feature table (parquet, per the tool wrapper).
    output_path: str
    # Shape of the feature table.
    rows: int
    columns: int
    # Wall-clock duration of the pipeline run, in seconds.
    duration_sec: float
65
+
66
+
67
class RetrieveContextOutput(BaseModel):
    """Output of `retrieve_context` — the query plus retrieved chunks."""
    # Echo of the search query.
    query: str
    # Retrieved passages as plain dicts; empty when no index is available.
    chunks: list[dict[str, Any]]
70
+
71
+
72
+ # --- Agent result -----------------------------------------------------------
73
+
74
class ToolTraceItem(BaseModel):
    """One step in the orchestrator's tool-call trace."""
    # Registered tool name that was invoked.
    name: str
    # Arguments the model supplied for the call.
    args: dict[str, Any]
    # Tool output dict — presumably set only on success; confirm in orchestrator.
    result: dict[str, Any] | None = None
    # Error message — presumably set only on failure (mutually exclusive
    # with `result`); confirm in orchestrator.
    error: str | None = None
80
+
81
+
82
class AgentResult(BaseModel):
    """Final orchestrator response: synthesized text + full trace."""
    # The synthesized answer presented to the user.
    text: str
    # Ordered tool calls made while producing `text`; may be empty.
    trace: list[ToolTraceItem] = Field(default_factory=list)
    # Identifier of the LLM that produced the answer, when known.
    model: str | None = None
    finish_reason: str = "complete"  # complete | max_steps | error
src/agents/tools.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tool dataclass + registry. Wraps each pipeline + the RAG retriever as a
2
+ function-callable tool the orchestrator can invoke.
3
+
4
+ Public entry: `build_default_tools(rag_index_dir)` returns the 4 tools.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Any, Callable
11
+
12
+ from pydantic import BaseModel, ValidationError
13
+
14
+ from src.agents.schemas import (
15
+ BBBPipelineInput,
16
+ BBBPipelineOutput,
17
+ EEGPipelineInput,
18
+ EEGPipelineOutput,
19
+ MRIPipelineInput,
20
+ MRIPipelineOutput,
21
+ RetrieveContextInput,
22
+ RetrieveContextOutput,
23
+ )
24
+ from src.core.logger import get_logger
25
+
26
+ logger = get_logger(__name__)
27
+
28
+
29
@dataclass
class Tool:
    """A single orchestrator-callable tool.

    Contract: `execute(input_model_instance) -> output_model_instance`.
    `invoke(args_dict)` validates the raw dict against `input_model`, runs
    `execute`, and returns the output serialized to a plain dict.
    """
    name: str
    description: str
    input_model: type[BaseModel]
    output_model: type[BaseModel]
    execute: Callable[[Any], BaseModel]

    def openai_schema(self) -> dict[str, Any]:
        """Render this tool as an OpenAI/OpenRouter function-calling schema."""
        raw_schema = self.input_model.model_json_schema()
        # Some clients reject top-level $defs / title, so only the essential
        # keys are forwarded: type, properties, required.
        parameters: dict[str, Any] = {
            "type": "object",
            "properties": raw_schema.get("properties", {}),
            "required": raw_schema.get("required", []),
        }
        function_spec = {
            "name": self.name,
            "description": self.description,
            "parameters": parameters,
        }
        return {"type": "function", "function": function_spec}

    def invoke(self, args: dict[str, Any]) -> dict[str, Any]:
        """Validate `args`, run the tool, and dump its result to a dict.

        Raises:
            ValueError: when `args` fails input-model validation.
        """
        try:
            validated = self.input_model.model_validate(args)
        except ValidationError as err:
            raise ValueError(f"invalid input for {self.name}: {err}") from err
        return self.execute(validated).model_dump()
68
+
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # Tool implementations — thin wrappers around existing pipelines + RAG.
72
+ # Heavy work stays in the underlying modules; these only adapt I/O.
73
+ # ---------------------------------------------------------------------------
74
+
75
+
76
def _execute_bbb(inp: BBBPipelineInput) -> BBBPipelineOutput:
    """Predict + SHAP for a single SMILES, reusing the existing model surface."""
    from src.api import routes as api_routes
    from src.api.schemas import BBBPredictRequest

    # Delegate the heavy lifting to the API route; this wrapper only adapts I/O.
    request = BBBPredictRequest(smiles=inp.smiles, top_k=inp.top_k)
    resp = api_routes.predict_bbb(request)
    top_features = [feature.model_dump() for feature in resp.top_features]
    return BBBPipelineOutput(
        smiles=inp.smiles,
        label=resp.label,
        label_text=resp.label_text,
        confidence=resp.confidence,
        top_features=top_features,
        drift_z=resp.drift_z,
    )
92
+
93
+
94
def _execute_eeg(inp: EEGPipelineInput) -> EEGPipelineOutput:
    """Run the EEG pipeline via the existing route function (run_eeg).

    Args:
        inp: Validated input — EEG recording path + epoch window length.

    Returns:
        EEGPipelineOutput with the written parquet location and run stats.
    """
    # Import order made consistent with the other _execute_* wrappers
    # (routes module first, then request schema).
    from src.api import routes as api_routes
    from src.api.schemas import EEGRequest

    # NOTE(review): output path is hard-coded, so concurrent invocations
    # would overwrite each other's features — consider parameterizing.
    out_path = Path("data/processed/eeg_features.parquet")
    response = api_routes.run_eeg(
        EEGRequest(
            input_path=inp.input_path,
            output_path=str(out_path),
            epoch_duration_s=inp.epoch_duration_s,
        )
    )
    return EEGPipelineOutput(
        input_path=inp.input_path,
        output_path=response.output_path,
        rows=response.rows,
        columns=response.columns,
        duration_sec=response.duration_sec,
    )
114
+
115
+
116
def _execute_mri(inp: MRIPipelineInput) -> MRIPipelineOutput:
    """Run the MRI pipeline via the existing route function (run_mri).

    Args:
        inp: Validated input — NIfTI directory + site-mapping CSV.

    Returns:
        MRIPipelineOutput with the written parquet location and run stats.
    """
    # Import order made consistent with the other _execute_* wrappers
    # (routes module first, then request schema).
    from src.api import routes as api_routes
    from src.api.schemas import MRIRequest

    # NOTE(review): output path is hard-coded, so concurrent invocations
    # would overwrite each other's features — consider parameterizing.
    out_path = Path("data/processed/mri_features.parquet")
    response = api_routes.run_mri(
        MRIRequest(
            input_dir=inp.input_dir,
            sites_csv=inp.sites_csv,
            output_path=str(out_path),
        )
    )
    return MRIPipelineOutput(
        input_dir=inp.input_dir,
        output_path=response.output_path,
        rows=response.rows,
        columns=response.columns,
        duration_sec=response.duration_sec,
    )
136
+
137
+
138
def _make_retrieve_executor(rag_index_dir: Path | None) -> Callable[[RetrieveContextInput], RetrieveContextOutput]:
    """Build a retrieval executor; the retriever itself is loaded lazily."""
    retriever: Any = None  # populated on first successful call

    def execute(inp: RetrieveContextInput) -> RetrieveContextOutput:
        nonlocal retriever
        # No configured index (or index file missing) → degrade gracefully
        # to an empty result set instead of raising.
        if rag_index_dir is None or not (rag_index_dir / "index.bin").exists():
            return RetrieveContextOutput(query=inp.query, chunks=[])
        if retriever is None:
            from src.rag.retrieve import RAGRetriever
            retriever = RAGRetriever.load(rag_index_dir)
        hits = retriever.search(inp.query, k=inp.k)
        return RetrieveContextOutput(query=inp.query, chunks=hits)

    return execute
152
+
153
+
154
def build_default_tools(rag_index_dir: Path | None) -> list[Tool]:
    """Assemble the orchestrator's default toolbox: 3 pipelines + RAG."""
    bbb_tool = Tool(
        name="run_bbb_pipeline",
        description=(
            "Predict blood-brain-barrier permeability for a SINGLE SMILES "
            "string. Use this when the user input looks like a molecule "
            "(short alphanumeric string with no file extension, e.g. 'CCO', "
            "'c1ccccc1'). Returns label, confidence, top SHAP features, drift."
        ),
        input_model=BBBPipelineInput,
        output_model=BBBPipelineOutput,
        execute=_execute_bbb,
    )
    eeg_tool = Tool(
        name="run_eeg_pipeline",
        description=(
            "Run the EEG signal-processing pipeline (bandpass + ICA + "
            "epoching + feature extraction) on an EEG recording file. Use "
            "when input_path ends in .fif or .edf. Returns row/column "
            "counts + duration."
        ),
        input_model=EEGPipelineInput,
        output_model=EEGPipelineOutput,
        execute=_execute_eeg,
    )
    mri_tool = Tool(
        name="run_mri_pipeline",
        description=(
            "Run the multi-site MRI ComBat-harmonization pipeline. Use "
            "when input is a directory containing .nii.gz volumes paired "
            "with a sites.csv. Returns row/column counts + duration."
        ),
        input_model=MRIPipelineInput,
        output_model=MRIPipelineOutput,
        execute=_execute_mri,
    )
    rag_tool = Tool(
        name="retrieve_context",
        description=(
            "Retrieve up to k passages from the curated reference knowledge "
            "base. Use AFTER a pipeline tool returns, to ground your final "
            "synthesis in cited literature. Formulate a focused query "
            "based on the pipeline output (e.g., 'BBB permeability of "
            "small lipophilic molecules' or 'ComBat site harmonization')."
        ),
        input_model=RetrieveContextInput,
        output_model=RetrieveContextOutput,
        # The RAG executor is a closure so the index loads lazily on first use.
        execute=_make_retrieve_executor(rag_index_dir),
    )
    return [bbb_tool, eeg_tool, mri_tool, rag_tool]
tests/agents/__init__.py ADDED
File without changes
tests/agents/test_tools.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for src.agents.tools — Tool dataclass + registry + 4 tool wrappers."""
2
+ from __future__ import annotations
3
+
4
+ from pathlib import Path
5
+
6
+ import pytest
7
+ from pydantic import BaseModel
8
+
9
+ from src.agents.tools import (
10
+ Tool,
11
+ build_default_tools,
12
+ BBBPipelineInput,
13
+ EEGPipelineInput,
14
+ MRIPipelineInput,
15
+ RetrieveContextInput,
16
+ )
17
+
18
+
19
class _DummyInput(BaseModel):
    """Minimal input schema for Tool tests: one required field, one defaulted."""
    # Required — must appear in the generated schema's `required` list.
    x: int
    # Defaulted — must NOT appear in `required`.
    y: str = "default"
22
+
23
+
24
class _DummyOutput(BaseModel):
    """Minimal output schema for Tool tests — a single integer result."""
    result: int
26
+
27
+
28
class TestTool:
    """Unit tests for the Tool dataclass: schema export + invoke contract."""

    @staticmethod
    def _make_tool(description: str = "A dummy tool") -> Tool:
        """Build the doubling dummy tool shared by all tests (was triplicated)."""
        return Tool(
            name="dummy",
            description=description,
            input_model=_DummyInput,
            output_model=_DummyOutput,
            execute=lambda inp: _DummyOutput(result=inp.x * 2),
        )

    def test_openai_schema_shape(self) -> None:
        schema = self._make_tool().openai_schema()
        assert schema["type"] == "function"
        assert schema["function"]["name"] == "dummy"
        assert schema["function"]["description"] == "A dummy tool"
        params = schema["function"]["parameters"]
        assert params["type"] == "object"
        assert "x" in params["properties"]
        assert "x" in params["required"]
        assert "y" not in params["required"]  # has default

    def test_invoke_validates_and_returns_dict(self) -> None:
        assert self._make_tool().invoke({"x": 5}) == {"result": 10}

    def test_invoke_invalid_input_raises(self) -> None:
        # Missing the required `x` field → ValidationError surfaced as ValueError.
        with pytest.raises(ValueError, match="invalid input"):
            self._make_tool().invoke({"y": "missing-x"})
68
+
69
+
70
class TestBuildDefaultTools:
    """Tests for the default tool registry returned by build_default_tools."""

    def test_default_set_has_four_tools(self) -> None:
        # fix: dropped the `tmp_path` fixture — it was requested but never used.
        tools = build_default_tools(rag_index_dir=None)
        names = {t.name for t in tools}
        assert names == {
            "run_bbb_pipeline",
            "run_eeg_pipeline",
            "run_mri_pipeline",
            "retrieve_context",
        }

    def test_each_tool_has_pydantic_input_model(self) -> None:
        for t in build_default_tools(rag_index_dir=None):
            assert issubclass(t.input_model, BaseModel)
            assert issubclass(t.output_model, BaseModel)

    def test_input_models_have_smiles_paths(self) -> None:
        # Verify the field names the downstream system prompt depends on.
        assert "smiles" in BBBPipelineInput.model_fields
        assert "input_path" in EEGPipelineInput.model_fields
        assert "input_dir" in MRIPipelineInput.model_fields
        assert "sites_csv" in MRIPipelineInput.model_fields
        assert "query" in RetrieveContextInput.model_fields
        assert "k" in RetrieveContextInput.model_fields