| import re |
| import logging |
|
|
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)


class RegexPIIRemover:
    """Redact common PII from free text using compiled regular expressions.

    Patterns are stored in ``self.patterns`` (name -> compiled regex) and are
    applied one after another in dict insertion order, each one operating on
    the output of the previous one.  Because the generic ``date`` pattern runs
    before ``dob``, a labelled date of birth is normally already redacted (and
    counted) by ``date``; ``dob`` only matters if the ordering is changed.
    """

    # PII types whose regex captures a leading label in group 1.  The label
    # is kept in the output and only the value after it is replaced.
    _LABELLED_TYPES = frozenset({'patient_name', 'dob', 'mrn'})

    def __init__(self):
        """Compile the PII patterns and store them in ``self.patterns``."""
        self.patterns = {
            # US Social Security number: 3-2-4 digits separated by dashes.
            'ssn': re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),

            # 10-digit phone number, groups optionally separated by '-' or '.'.
            'phone': re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'),

            # Email address.  The TLD class is [A-Za-z]; a '|' inside the
            # brackets would be a literal pipe, not alternation, and would
            # swallow pipe delimiters that follow an address.
            'email': re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'),

            # Numeric date such as 1/2/90 or 01-02-1990.
            'date': re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b'),

            # Medical record number introduced by an "MRN" style label.
            'mrn': re.compile(r'\b(MRN|Medical Record Number)[:\s]+\w+\b', re.IGNORECASE),

            # 5-digit ZIP code with optional +4 extension.
            'zip': re.compile(r'\b\d{5}(-\d{4})?\b'),

            # Two words following a "Patient"/"Name" label.
            # NOTE(review): IGNORECASE also applies to the name part, so any
            # two alphabetic words after the label are redacted, not only
            # capitalised ones -- confirm this over-redaction is intended.
            'patient_name': re.compile(r'(Patient|Name)[:\s]+([A-Z][a-z]+\s[A-Z][a-z]+)', re.IGNORECASE),

            # Date of birth introduced by a "DOB" style label.
            'dob': re.compile(r'(DOB|Date of Birth)[:\s]+\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', re.IGNORECASE),
        }

        logger.info("RegexPIIRemover initialized with %d patterns", len(self.patterns))

    def remove_pii(self, text: str) -> tuple[str, int]:
        """Replace every PII match in ``text`` with ``[REDACTED]``.

        For labelled types (``patient_name``, ``dob``, ``mrn``) the matched
        label is preserved and the separator is normalised, producing
        ``"<label>: [REDACTED]"``.  All other types are replaced wholesale.

        Args:
            text: Input text.

        Returns:
            tuple: ``(cleaned_text, count_of_pii_removed)`` where the count is
            the total number of substitutions made across all patterns.
        """
        cleaned_text = text
        total_removed = 0

        for pii_type, pattern in self.patterns.items():
            if pii_type in self._LABELLED_TYPES:
                replacement = r'\1: [REDACTED]'
            else:
                replacement = '[REDACTED]'

            # subn substitutes and counts in one pass over the text.
            cleaned_text, count = pattern.subn(replacement, cleaned_text)

            if count:
                logger.debug("Found %d instances of %s", count, pii_type)
                total_removed += count

        logger.info("Removed %d PII entities", total_removed)

        return cleaned_text, total_removed