Spaces:

Nithins03
/

clinical-deidentify

Sleeping

App Files Files Community

clinical-deidentify / tests /test_pipeline.py

Nithins03

Add premium web interface for document de-identification

321a6ce about 2 months ago

raw

history blame contribute delete

3.08 kB

	import pytest
	import os

	# Point the Hugging Face cache (HF_HOME) at a ".hf_cache" folder under the
	# current working directory, which is expected to be writable.
	# NOTE(review): this runs before the app.pipeline imports below, presumably so
	# libraries that read HF_HOME at import time pick it up -- confirm before moving.
	os.environ.update({"HF_HOME": os.path.join(os.getcwd(), ".hf_cache")})

	from app.pipeline.regex_rules import RegexDetector
	from app.pipeline.hybrid import DeidPipeline

	def test_regex_dates():
	    """Check that a sentence with two date expressions yields a DATE label and at least two hits."""
	    sample = "Admitted on 01/15/2023 and Jan '24"
	    hits = RegexDetector().detect(sample)
	    found_labels = [hit["label"] for hit in hits]
	    assert "DATE" in found_labels
	    # Both the numeric date and the abbreviated "Jan '24" are expected to be reported.
	    assert len(hits) >= 2

	def test_regex_age():
	    """Check that exactly two hits are returned for the two age mentions, each labelled AGE."""
	    sample = "The patient is 92 years old and another is age: 95."
	    hits = RegexDetector().detect(sample)
	    assert len(hits) == 2
	    for hit in hits:
	        assert hit["label"] == "AGE"

	def test_regex_mrn():
	    """Check that the "MRN:" and "EHR#" identifiers produce exactly two hits, both labelled MRN."""
	    sample = "MRN: A1B2C3D4E5. EHR# 99887766."
	    hits = RegexDetector().detect(sample)
	    assert len(hits) == 2
	    hit_labels = [hit["label"] for hit in hits]
	    assert hit_labels == ["MRN"] * len(hits)

	def test_regex_edge_cases():
	    """Check the matched text for a serial number and a phone number with a leading parenthesis."""
	    sample = "SN: XYZ-9000-SER, Phone: (555) 010-9988"

	    # Map each label to its matched text; a later hit with the same label replaces an earlier one.
	    text_by_label = {}
	    for hit in RegexDetector().detect(sample):
	        label, matched = hit["label"], hit["text"]
	        text_by_label[label] = matched

	    expected = (
	        ("DEVICE_ID", "SN: XYZ-9000-SER"),
	        ("PHONE", "(555) 010-9988"),
	    )
	    for label, matched in expected:
	        assert label in text_by_label
	        assert text_by_label[label] == matched

	def test_hybrid_merging_overlaps():
	    """Check that the pipeline result keeps both a PERSON and a DATE entity.

	    The pipeline's transformer detector is replaced by a stub that always
	    reports "John Doe" (characters 0-8) as PERSON; the DATE entity for
	    "01/01/1980" is expected to come from the regex side of the pipeline.
	    """

	    class StubTransformer:
	        """Stand-in detector returning one fixed PERSON span regardless of input."""

	        def detect(self, text):
	            person_span = {
	                "start": 0,
	                "end": 8,
	                "label": "PERSON",
	                "text": "John Doe",
	                "source": "transformer",
	                "score": 0.9,
	            }
	            return [person_span]

	    deid = DeidPipeline()
	    deid.transformer_detector = StubTransformer()

	    output = deid.deidentify("John Doe born 01/01/1980")

	    entity_labels = [entity["label"] for entity in output["entities"]]
	    assert "PERSON" in entity_labels
	    assert "DATE" in entity_labels

	def test_nested_merging():
	    """Check that a regex hit nested inside a transformer span is absorbed by it.

	    The stub transformer reports the whole input "Beverly Hills 90210" as a
	    single LOCATION span. The ZIP code "90210" is expected to be matched by
	    the regex detector as well; because it lies inside the LOCATION span, the
	    merged result should contain only the outer LOCATION entity. For
	    de-identification, masking the whole phrase is acceptable.
	    """
	    text = "Beverly Hills 90210"

	    class MockTransformer:
	        """Stand-in detector returning one LOCATION span covering all of ``text``."""

	        def detect(self, _text):
	            # The span end is derived from the text so it can never run past the
	            # end of the string (the previous hard-coded 20 exceeded the
	            # 19-character input by one).
	            return [
	                {
	                    "start": 0,
	                    "end": len(text),
	                    "label": "LOCATION",
	                    "text": text,
	                    "source": "transformer",
	                    "score": 0.8,
	                }
	            ]

	    pipeline = DeidPipeline()
	    pipeline.transformer_detector = MockTransformer()

	    result = pipeline.deidentify(text)

	    entities = result["entities"]
	    assert len(entities) == 1
	    assert entities[0]["label"] == "LOCATION"

	if __name__ == "__main__":
	    # Propagate pytest's exit code so that running this file directly reports
	    # failures to the shell or CI instead of always exiting with status 0.
	    raise SystemExit(pytest.main([__file__]))