feat(core): add MLflow tracking helper with disable env-flag
- conftest.py +23 -0
- src/core/tracking.py +67 -0
- tests/core/test_tracking.py +82 -0
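Intended call pattern, for reviewers (a minimal sketch mirroring the new tests; the output path and parameter values are hypothetical, not part of this diff):

from pathlib import Path

from src.core.tracking import track_pipeline_run

out = Path("data/processed/bbb.parquet")  # hypothetical output location
with track_pipeline_run(
    experiment_name="bbb_pipeline",
    params={"input_path": "data/raw/bbb.csv", "n_bits": 2048},  # hypothetical values
    metrics={"rows_in": 6.0, "rows_out": 4.0},
    artifact_path=out,
) as run_id:
    print(f"tracking under MLflow run {run_id}")  # pipeline body goes here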
conftest.py
ADDED
@@ -0,0 +1,23 @@
"""Repo-wide pytest fixtures.

Pins MLflow's tracking URI to a per-session tmp directory so pipeline tests
don't litter `./mlruns/` in the working tree, and so test runs are isolated
from production MLflow state.
"""
from __future__ import annotations

import os
import tempfile
from pathlib import Path
from typing import Iterator

import pytest


@pytest.fixture(autouse=True, scope="session")
def _isolate_mlflow_tracking_uri() -> Iterator[None]:
    tmp_root = Path(tempfile.mkdtemp(prefix="mlflow_test_"))
    os.environ["MLFLOW_TRACKING_URI"] = f"file://{tmp_root}"
    yield
    # Don't rmtree: pytest tmpdir cleanup or the OS handles it; rmtree
    # races with mlflow background writes on slow CI.
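For illustration, what the fixture guarantees inside any test (a hypothetical sanity check, not part of this diff; `mlflow.get_tracking_uri` is the standard accessor):

import mlflow

def test_tracking_uri_is_isolated() -> None:
    uri = mlflow.get_tracking_uri()
    # Set by the autouse session fixture in conftest.py above
    assert uri.startswith("file://")
    assert "mlflow_test_" in uri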
src/core/tracking.py
ADDED
@@ -0,0 +1,67 @@
"""MLflow tracking helper used by all three pipelines.

Wraps `mlflow.start_run` so each pipeline can log params, metrics, and an
output artifact in one block. Honors `NEUROBRIDGE_DISABLE_MLFLOW=1` for
environments where the tracking server is not reachable (offline demos, CI
without an mlflow service). When disabled, yields `None` and does no I/O.

Tracking URI source of truth: the standard `MLFLOW_TRACKING_URI` env var.
Tests pin this via the repo-wide conftest.py autouse fixture.
"""
from __future__ import annotations

import contextlib
import os
from pathlib import Path
from typing import Iterator

import mlflow

from src.core.logger import get_logger

logger = get_logger(__name__)

_DISABLE_FLAG = "NEUROBRIDGE_DISABLE_MLFLOW"


@contextlib.contextmanager
def track_pipeline_run(
    experiment_name: str,
    params: dict[str, object],
    metrics: dict[str, float],
    artifact_path: Path,
) -> Iterator[str | None]:
    """Context manager that creates an MLflow run for one pipeline invocation.

    On enter: creates/loads `experiment_name`, starts a run, logs params + metrics.
    On exit: logs `artifact_path` as an artifact and ends the run.

    Yields the active `run_id` (str), or `None` if MLflow is disabled.

    Args:
        experiment_name: e.g. "bbb_pipeline" / "eeg_pipeline" / "mri_pipeline".
        params: Run parameters (input path, hyper-params, etc.). Stringified by MLflow.
        metrics: Numeric metrics (row counts, durations).
        artifact_path: Path to the produced Parquet, logged as a run artifact.
    """
    if os.environ.get(_DISABLE_FLAG) == "1":
        logger.info("MLflow disabled via %s=1; skipping run tracking", _DISABLE_FLAG)
        yield None
        return

    mlflow.set_experiment(experiment_name)
    with mlflow.start_run() as run:
        for key, value in params.items():
            mlflow.log_param(key, value)
        for key, value in metrics.items():
            mlflow.log_metric(key, value)
        try:
            yield run.info.run_id
        finally:
            if Path(artifact_path).exists():
                mlflow.log_artifact(str(artifact_path))
            else:
                logger.warning(
                    "artifact_path %s does not exist; skipping artifact log",
                    artifact_path,
                )
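The disabled path described in the module docstring, sketched end to end (input and artifact names are hypothetical; no tracking server or file store is touched):

import os
from pathlib import Path

from src.core.tracking import track_pipeline_run

os.environ["NEUROBRIDGE_DISABLE_MLFLOW"] = "1"
with track_pipeline_run(
    experiment_name="eeg_pipeline",
    params={"input_path": "session.edf"},  # hypothetical input
    metrics={"duration_sec": 0.0},
    artifact_path=Path("eeg.parquet"),  # need not exist; no I/O happens
) as run_id:
    assert run_id is None  # helper yields None and skips all MLflow calls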
tests/core/test_tracking.py
ADDED
@@ -0,0 +1,82 @@
"""Tests for src.core.tracking."""
from __future__ import annotations

from pathlib import Path

import mlflow
import pandas as pd

from src.core import tracking


class TestTrackPipelineRun:
    def test_creates_run_with_experiment_name(self, tmp_path: Path):
        out = tmp_path / "out.parquet"
        pd.DataFrame({"a": [1]}).to_parquet(out)
        with tracking.track_pipeline_run(
            experiment_name="bbb_pipeline",
            params={"input_path": "x.csv"},
            metrics={"rows_in": 6.0, "rows_out": 4.0},
            artifact_path=out,
        ) as run_id:
            assert run_id is not None
        runs = mlflow.search_runs(experiment_names=["bbb_pipeline"])
        assert len(runs) >= 1

    def test_logs_params(self, tmp_path: Path):
        out = tmp_path / "out.parquet"
        pd.DataFrame({"a": [1]}).to_parquet(out)
        with tracking.track_pipeline_run(
            experiment_name="bbb_pipeline_params",
            params={"n_bits": 2048, "radius": 2},
            metrics={},
            artifact_path=out,
        ):
            pass
        runs = mlflow.search_runs(experiment_names=["bbb_pipeline_params"])
        assert "params.n_bits" in runs.columns
        assert runs.iloc[0]["params.n_bits"] == "2048"

    def test_logs_metrics(self, tmp_path: Path):
        out = tmp_path / "out.parquet"
        pd.DataFrame({"a": [1]}).to_parquet(out)
        with tracking.track_pipeline_run(
            experiment_name="eeg_pipeline_metrics",
            params={},
            metrics={"duration_sec": 1.234, "rows_out": 100.0},
            artifact_path=out,
        ):
            pass
        runs = mlflow.search_runs(experiment_names=["eeg_pipeline_metrics"])
        assert runs.iloc[0]["metrics.duration_sec"] == 1.234
        assert runs.iloc[0]["metrics.rows_out"] == 100.0

    def test_logs_artifact(self, tmp_path: Path):
        out = tmp_path / "out.parquet"
        pd.DataFrame({"a": [1]}).to_parquet(out)
        with tracking.track_pipeline_run(
            experiment_name="mri_pipeline_artifact",
            params={},
            metrics={},
            artifact_path=out,
        ) as run_id:
            pass
        artifacts = mlflow.MlflowClient().list_artifacts(run_id)
        assert any(a.path.endswith("out.parquet") for a in artifacts)

    def test_disabled_via_env_returns_no_op(self, monkeypatch, tmp_path: Path):
        """Setting NEUROBRIDGE_DISABLE_MLFLOW=1 must skip MLflow entirely
        (used by the live demo when the tracking server is down)."""
        monkeypatch.setenv("NEUROBRIDGE_DISABLE_MLFLOW", "1")
        out = tmp_path / "out.parquet"
        pd.DataFrame({"a": [1]}).to_parquet(out)
        with tracking.track_pipeline_run(
            experiment_name="should_not_appear",
            params={"x": 1},
            metrics={"y": 2.0},
            artifact_path=out,
        ) as run_id:
            assert run_id is None
        # No "should_not_appear" experiment was created
        names = [e.name for e in mlflow.search_experiments()]
        assert "should_not_appear" not in names
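To run just this suite against the tmp-dir tracking URI pinned by conftest.py (standard pytest invocation, nothing project-specific assumed):

    pytest tests/core/test_tracking.py -q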