"""Unit tests for the regex detector and the hybrid de-identification pipeline."""

import os

import pytest

# Redirect the Hugging Face cache to a writable directory.
# NOTE(review): this is set before the ``app`` imports below, presumably so the
# libraries they load pick it up at import time -- keep this ordering.
os.environ["HF_HOME"] = os.path.join(os.getcwd(), ".hf_cache")

from app.pipeline.regex_rules import RegexDetector  # noqa: E402
from app.pipeline.hybrid import DeidPipeline  # noqa: E402


def test_regex_dates():
    """The regex detector reports DATE entities for both date spellings."""
    detector = RegexDetector()
    text = "Admitted on 01/15/2023 and Jan '24"

    results = detector.detect(text)

    labels = [r["label"] for r in results]
    assert "DATE" in labels
    assert len(results) >= 2


def test_regex_age():
    """Both age mentions are detected and labelled AGE."""
    detector = RegexDetector()
    text = "The patient is 92 years old and another is age: 95."

    results = detector.detect(text)

    assert len(results) == 2
    assert all(r["label"] == "AGE" for r in results)


def test_regex_mrn():
    """Both record identifiers (``MRN:`` and ``EHR#``) are labelled MRN."""
    detector = RegexDetector()
    text = "MRN: A1B2C3D4E5. EHR# 99887766."

    results = detector.detect(text)

    assert len(results) == 2
    assert all(r["label"] == "MRN" for r in results)


def test_regex_edge_cases():
    """Serial numbers and phone numbers with a leading parenthesis are matched whole."""
    detector = RegexDetector()
    # Edge cases: serial numbers and phone with leading parenthesis.
    text = "SN: XYZ-9000-SER, Phone: (555) 010-9988"

    results = detector.detect(text)

    # Map label -> matched text; each label is expected once in this input.
    labels = {r["label"]: r["text"] for r in results}
    assert "DEVICE_ID" in labels
    assert labels["DEVICE_ID"] == "SN: XYZ-9000-SER"
    assert "PHONE" in labels
    assert labels["PHONE"] == "(555) 010-9988"


def test_hybrid_merging_overlaps():
    """Non-overlapping transformer and regex entities both survive merging."""

    # Stand-in for the transformer detector so the unit test controls its output.
    class MockTransformer:
        def detect(self, text):
            # The "transformer" catches "John Doe" (characters 0-8).
            return [
                {
                    "start": 0,
                    "end": 8,
                    "label": "PERSON",
                    "text": "John Doe",
                    "source": "transformer",
                    "score": 0.9,
                }
            ]

    pipeline = DeidPipeline()
    pipeline.transformer_detector = MockTransformer()

    # The regex detector is expected to catch the date 01/01/1980.
    text = "John Doe born 01/01/1980"
    result = pipeline.deidentify(text)
    entities = result["entities"]

    # Should have both PERSON and DATE.
    labels = [e["label"] for e in entities]
    assert "PERSON" in labels
    assert "DATE" in labels


def test_nested_merging():
    """A regex ZIP nested inside a transformer LOCATION collapses to the outer span."""
    pipeline = DeidPipeline()
    location = "Beverly Hills 90210"

    # Mock: the outer transformer span covers the whole text; the inner regex
    # match is the ZIP code "90210".
    class MockTransformer:
        def detect(self, text):
            return [
                {
                    "start": 0,
                    # Exclusive end offset of the 19-character span. The
                    # original hard-coded 20, one past the end of the text.
                    "end": len(location),
                    "label": "LOCATION",
                    "text": location,
                    "source": "transformer",
                    "score": 0.8,
                }
            ]

    pipeline.transformer_detector = MockTransformer()

    # The regex ZIP rule is expected to catch 90210 inside the location span.
    result = pipeline.deidentify(location)
    entities = result["entities"]

    # The ZIP is nested in the transformer span, so this test expects only the
    # outer entity to be kept. For de-identification, masking the whole
    # "Beverly Hills 90210" is acceptable.
    assert len(entities) == 1
    assert entities[0]["label"] == "LOCATION"


if __name__ == "__main__":
    # Propagate pytest's exit status so a failing run is visible to the caller.
    raise SystemExit(pytest.main([__file__]))