Upload 4 files
Browse files- Dockerfile +15 -0
- README.md +191 -0
- main.py +240 -0
- requirements.txt +7 -0
Dockerfile
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim

# Stream logs immediately (no stdout buffering) and skip .pyc generation
# to keep the image layer slim.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

WORKDIR /app

# Install dependencies first so this layer is cached across code-only changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

# Bake the GLiNER model into the image at build time so the container
# starts serving immediately instead of downloading weights on first boot.
RUN python -c "from gliner import GLiNER; GLiNER.from_pretrained('urchade/gliner_small-v2.1')"

COPY . .

ENV PORT=8000
EXPOSE 8000

# sh -c form so ${PORT} (injected by PaaS platforms like Railway/Heroku)
# is expanded at runtime. Single worker: the NER model is loaded per-process.
CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port ${PORT} --workers 1"]
|
README.md
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Celarium
|
| 2 |
+
|
| 3 |
+
**Context-Aware Privacy Middleware for AI Agents & LLMs**
|
| 4 |
+
|
| 5 |
+
Celarium acts as a smart firewall between your users and Large Language Models. It intercepts sensitive data, replaces it with context-aware, consistent fake entities, and restores the original values after the LLM responds.
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Why Celarium?
|
| 10 |
+
|
| 11 |
+
Unlike simple regex tools that redact data ([REDACTED]), Celarium maintains semantic consistency.
|
| 12 |
+
|
| 13 |
+
- **Context-Aware AI**: Uses GLiNER (Generalist Lightweight NER) to detect entities based on context, not just patterns.
|
| 14 |
+
- **Handles**: MRN, SSN, Insurance Policy, Group IDs, Hospital Names (regex + AI detection)
|
| 15 |
+
- **Data Consistency**: If "John Doe" becomes "Robert Smith", his email becomes robert.smith@example.com automatically.
|
| 16 |
+
- **Batch Processing**: Natively handles JSON Lists and complex objects without hitting token limits.
|
| 17 |
+
- **Smart Restoration**: The LLM "thinks" it's talking to Robert Smith. When the response comes back, Celarium swaps it back to John Doe.
|
| 18 |
+
|
| 19 |
+
---
|
| 20 |
+
|
| 21 |
+
## 🛠 How It Works
|
| 22 |
+
|
| 23 |
+
```mermaid
|
| 24 |
+
graph LR
|
| 25 |
+
A[User Input] -->|Contains PII| B(Celarium)
|
| 26 |
+
B -->|Anonymized Data| C[LLM / Agent]
|
| 27 |
+
C -->|Response with Fakes| B
|
| 28 |
+
B -->|Restored Data| D[User Output]
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
1. **Intercept**: Send raw data (Text or JSON) to Celarium.
|
| 32 |
+
2. **Anonymize**: Celarium uses a Hybrid Engine (GLiNER AI + Strict Regex) to generate realistic fakes.
|
| 33 |
+
3. **Process**: Send the clean data to OpenAI/Claude/Gemini.
|
| 34 |
+
4. **Restore**: Send the LLM's response back to Celarium to swap the names back.
|
| 35 |
+
|
| 36 |
+
---
|
| 37 |
+
|
| 38 |
+
## Quick Start
|
| 39 |
+
|
| 40 |
+
### Option A: Hosted API
|
| 41 |
+
The easiest way to use Celarium is via the hosted API. No installation required.
|
| 42 |
+
Base URL: `https://<your-celarium-host>` (replace with your deployment's URL).
|
| 43 |
+
1. Anonymize Data
|
| 44 |
+
Send text or a list of JSON objects. The system auto-detects PII (Names, Emails, Phones, Medical IDs).
|
| 45 |
+
code
|
| 46 |
+
|
| 47 |
+
curl -X POST https://<your-celarium-host>/v1/anonymize \
|
| 48 |
+
-H "X-API-Key: sk_test_celarium_founder_001" \
|
| 49 |
+
-H "Content-Type: application/json" \
|
| 50 |
+
-d '{
|
| 51 |
+
"text": "Patient John Doe (DOB 1985-07-14) admitted to Springfield General. MRN-998877."
|
| 52 |
+
}'
|
| 53 |
+
Response:
|
| 54 |
+
code
|
| 55 |
+
JSON
|
| 56 |
+
{
|
| 57 |
+
"anonymized_text": "Patient Michael Smith (DOB 1962-03-12) admitted to Oak Ridge Medical Center. MRN-112233.",
|
| 58 |
+
"session_id": "abc123uuid...",
|
| 59 |
+
"entities_found": 4
|
| 60 |
+
}
|
| 61 |
+
2. Process with LLM
|
| 62 |
+
Send the anonymized_text to OpenAI, Claude, or your local model. The LLM sees "Michael Smith" and processes it safely.
|
| 63 |
+
3. Restore Data
|
| 64 |
+
Send the LLM's response back to Celarium to swap the names back.
|
| 65 |
+
code
|
| 66 |
+
|
| 67 |
+
curl -X POST https://<your-celarium-host>/v1/restore \
|
| 68 |
+
-H "X-API-Key: sk_test_celarium_founder_001" \
|
| 69 |
+
-H "Content-Type: application/json" \
|
| 70 |
+
-d '{
|
| 71 |
+
"session_id": "abc123uuid...",
|
| 72 |
+
"text": "Summary: Michael Smith was treated at Oak Ridge..."
|
| 73 |
+
}'
|
| 74 |
+
### Option B: Local Python
|
| 75 |
+
Requires Python 3.10+
|
| 76 |
+
|
| 77 |
+
```bash
|
| 78 |
+
pip install -r requirements.txt
|
| 79 |
+
python main.py
|
| 80 |
+
```
|
| 81 |
+
|
| 82 |
+
The API runs on [http://localhost:8000](http://localhost:8000)
|
| 83 |
+
|
| 84 |
+
---
|
| 85 |
+
|
| 86 |
+
## Usage Examples
|
| 87 |
+
|
| 88 |
+
### 1. Medical / Clinical Data (Unstructured)
|
| 89 |
+
|
| 90 |
+
Celarium detects specialized medical fields and formatting.
|
| 91 |
+
|
| 92 |
+
```bash
|
| 93 |
+
curl -X POST http://localhost:8000/v1/anonymize \
|
| 94 |
+
-H "X-API-Key: sk_test_celarium_founder_001" \
|
| 95 |
+
-H "Content-Type: application/json" \
|
| 96 |
+
-d '{
|
| 97 |
+
"text": "Patient John Doe, DOB 1985-07-14, SSN 123-45-6789, MRN MRN-998877, admitted to Springfield General Hospital with Dr. House."
|
| 98 |
+
}'
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
**Response Example:**
|
| 102 |
+
|
| 103 |
+
```json
|
| 104 |
+
{
|
| 105 |
+
"anonymized_text": "Patient Michael Stevens, DOB 1962-03-12, SSN 542-11-9021, MRN MRN-112233, admitted to Oak Ridge Medical Center with Dr. Wilson.",
|
| 106 |
+
"session_id": "abc123uuid",
|
| 107 |
+
"entities_found": 6
|
| 108 |
+
}
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
---
|
| 112 |
+
|
| 113 |
+
### 2. Batch Processing (JSON Lists)
|
| 114 |
+
|
| 115 |
+
Send entire database records. Celarium automatically handles list iteration and context preservation.
|
| 116 |
+
|
| 117 |
+
```bash
|
| 118 |
+
curl -X POST http://localhost:8000/v1/anonymize \
|
| 119 |
+
-H "X-API-Key: sk_test_celarium_founder_001" \
|
| 120 |
+
-H "Content-Type: application/json" \
|
| 121 |
+
-d '{
|
| 122 |
+
"text": [
|
| 123 |
+
{ "name": "Carlos Rivera", "email": "carlos@outlook.com", "policy": "POL-12345" },
|
| 124 |
+
{ "name": "Sarah Jones", "email": "sarah.j@gmail.com", "policy": "POL-98765" }
|
| 125 |
+
]
|
| 126 |
+
}'
|
| 127 |
+
```
|
| 128 |
+
|
| 129 |
+
**Response Example:**
|
| 130 |
+
|
| 131 |
+
```json
|
| 132 |
+
{
|
| 133 |
+
"anonymized_text": "[\n { \"name\": \"David Kim\", \"email\": \"davidkim99@example.com\", \"policy\": \"POL-554433\" },\n { \"name\": \"Emily White\", \"email\": \"emilywhite22@example.com\", \"policy\": \"POL-112211\" }\n]",
|
| 134 |
+
"session_id": "xyz789uuid",
|
| 135 |
+
"entities_found": 6
|
| 136 |
+
}
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
---
|
| 140 |
+
|
| 141 |
+
### 3. Restore Data
|
| 142 |
+
|
| 143 |
+
After your LLM generates a response using the fake names, swap them back.
|
| 144 |
+
|
| 145 |
+
```bash
|
| 146 |
+
curl -X POST http://localhost:8000/v1/restore \
|
| 147 |
+
-H "X-API-Key: sk_test_celarium_founder_001" \
|
| 148 |
+
-H "Content-Type: application/json" \
|
| 149 |
+
-d '{
|
| 150 |
+
"session_id": "abc123uuid",
|
| 151 |
+
"text": "We have updated the records for Michael Stevens regarding MRN-112233."
|
| 152 |
+
}'
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
**Response Example:**
|
| 156 |
+
|
| 157 |
+
```json
|
| 158 |
+
{
|
| 159 |
+
"restored_text": "We have updated the records for John Doe regarding MRN-998877."
|
| 160 |
+
}
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
---
|
| 164 |
+
|
| 165 |
+
## 🚀 Deployment
|
| 166 |
+
|
| 167 |
+
- Designed for Railway, Heroku, or AWS.
|
| 168 |
+
- Push to GitHub.
|
| 169 |
+
- Connect to Railway/Heroku.
|
| 170 |
+
- Deploy.
|
| 171 |
+
|
| 172 |
+
The included Dockerfile handles the AI model download during the build phase. The server automatically optimizes for CPU usage.
|
| 173 |
+
|
| 174 |
+
---
|
| 175 |
+
|
| 176 |
+
## 🛡 Security & Compliance
|
| 177 |
+
|
| 178 |
+
- **Ephemeral Storage**: Mappings are stored in-memory. If the server restarts, the data is gone.
|
| 179 |
+
- **PII Is Never Logged**: Neither the input text nor the mappings are written to disk.
|
| 180 |
+
- **Strict Regex Fallback**: If the AI misses a pattern, our strict Regex engine catches SSNs, Phones, and Emails as a failsafe.
|
| 181 |
+
- **Address Protection**: Entire address blocks (Street + City + State) are replaced to prevent location leakage.
|
| 182 |
+
|
| 183 |
+
---
|
| 184 |
+
|
| 185 |
+
## Status
|
| 186 |
+
|
| 187 |
+
This is a **proof-of-concept**. We're actively looking for:
|
| 188 |
+
|
| 189 |
+
- Early adopters to validate the approach
|
| 190 |
+
- Feedback on detection accuracy
|
| 191 |
+
- Use cases beyond healthcare
|
main.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Celarium: context-aware PII anonymization middleware for LLM pipelines.

Detects sensitive entities with a hybrid engine (GLiNER NER model + strict
regex), replaces them with realistic fakes, and restores the originals
later via an in-memory session mapping.
"""
import uvicorn
import os
import uuid
import re
import random
import json
from datetime import datetime
from typing import Union, List, Dict, Any
from fastapi import FastAPI, HTTPException, Security
from fastapi.security import APIKeyHeader
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from faker import Faker
from gliner import GLiNER

app = FastAPI(title="Celarium AI")

# NOTE(review): wildcard origins combined with allow_credentials=True is
# rejected by browsers per the CORS spec -- confirm whether credentialed
# cross-origin requests are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Static API-key auth; keys are validated by the get_api_key() dependency.
api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
VALID_API_KEYS = {"sk_test_celarium_founder_001", "sk_test_celarium_beta_001"}
# In-memory session store: session_id -> {"mapping", "created", "api_key"}.
# Ephemeral by design: everything is lost on restart.
SESSIONS = {}
fake = Faker()

# Load Model at import time so the first request is not slowed down.
print("Loading GLiNER...")
model = GLiNER.from_pretrained("urchade/gliner_small-v2.1")
print("Loaded.")

# Regex & Labels: strict patterns act as a deterministic failsafe alongside
# the NER model (regex hits are pinned to score 1.0 in analyze_and_replace).
REGEX_PATTERNS = {
    "EMAIL_ADDRESS": r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
    "PHONE_NUMBER": r'(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
    "MRN": r'\bMRN[-_]\w+\b',
    "SSN": r'\b\d{3}-\d{2}-\d{4}\b',
    "INSURANCE_GROUP": r'\bG\d{5,}\b',
    "INSURANCE_POLICY": r'\b(POL|POLICY)[-_]?\d+\b',
    "FULL_ADDRESS": r'\d+\s+[A-Za-z0-9\s\.]+,\s+[A-Za-z\s\.]+,\s+[A-Z]{2}\s+\d{5}(?:-\d{4})?'
}
# Free-form entity labels passed to GLiNER for context-aware detection.
AI_LABELS = ["person", "physical address", "organization", "date of birth"]
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# Generators
|
| 50 |
+
def generate_clean_name():
    """Return a plausible fake full name in "First Last" form."""
    first = fake.first_name()
    last = fake.last_name()
    return " ".join((first, last))
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def generate_matching_email(fake_name: str) -> str:
    """Build an email address that plausibly matches *fake_name*.

    Keeps the fake identity consistent (e.g. "Robert Smith" ->
    "robertsmith123@example.com") so downstream LLMs see coherent data.
    Falls back to a generic user address when no name is available.
    """
    if not fake_name:
        return f"user{random.randint(1000, 9999)}@example.com"
    parts = fake_name.lower().split()
    base = f"{parts[0]}{parts[1]}" if len(parts) >= 2 else parts[0]
    # Strip characters that are awkward in email local parts
    # (apostrophes in "O'Brien", hyphens in "Anne-Marie", etc.).
    base = re.sub(r"[^a-z0-9]", "", base) or "user"
    return f"{base}{random.randint(100, 9999)}@example.com"
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# --- UPDATED GENERATORS ---
|
| 62 |
+
|
| 63 |
+
def generate_clean_phone():
    """Return a fake US phone number in the +1-XXX-XXX-XXXX format."""
    area = random.randint(200, 999)
    exchange = random.randint(200, 999)
    line = random.randint(1000, 9999)
    return "-".join(("+1", str(area), str(exchange), str(line)))
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def generate_medical_org():
    """Return a realistic fake healthcare/clinical organization name."""
    kinds = [
        "Medical Center", "Regional Health", "General Hospital",
        "Health Group", "Family Clinic", "Community Care",
        "Medical Associates", "Health System", "Diagnostics Lab",
    ]
    # Half the time prefix with a city ("Austin Regional Health"),
    # otherwise with a surname ("Rivera Medical Group").
    if random.random() > 0.5:
        prefix = fake.city()
    else:
        prefix = fake.last_name()
    return f"{prefix} {random.choice(kinds)}"
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def get_fake_value(label: str, context: dict) -> str:
    """Map a detected entity label to a freshly generated fake value.

    *context* carries cross-entity state: the most recent fake person name
    is remembered so a later email can be generated to match it. Substring
    matching keeps this tolerant of both regex labels ("EMAIL_ADDRESS")
    and free-form GLiNER labels ("person").
    """
    label = label.upper()

    if "PERSON" in label:
        person = generate_clean_name()
        # Remember the name so a subsequent EMAIL entity can match it.
        context["last_person"] = person
        return person

    if "EMAIL" in label:
        return generate_matching_email(context.get("last_person", ""))

    if "PHONE" in label:
        return generate_clean_phone()

    if "ADDRESS" in label or "LOCATION" in label:
        # Replace the entire address block to avoid leaking partial
        # real-location fragments.
        return f"{fake.street_address()}, {fake.city()}, {fake.state_abbr()} {fake.zipcode()}"

    if "MRN" in label:
        return f"MRN-{fake.random_number(digits=8, fix_len=True)}"
    if "SSN" in label:
        return fake.ssn()
    if "DATE" in label:
        return str(fake.date_of_birth(minimum_age=18, maximum_age=90))
    if "POLICY" in label:
        return f"POL-{fake.random_number(digits=9, fix_len=True)}"
    if "GROUP" in label:
        return f"G{fake.random_number(digits=5, fix_len=True)}"

    if "ORGANIZATION" in label:
        return generate_medical_org()

    # Unknown label: fall back to an opaque placeholder.
    return f"REDACTED_{uuid.uuid4().hex[:6]}"
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def analyze_and_replace(text: str) -> tuple[str, dict]:
    """Anonymize one string block: detect PII spans and splice in fakes.

    Returns ``(anonymized_text, mapping)`` where *mapping* is
    fake_value -> original_value, consumed later by /v1/restore.
    """
    findings = []
    # 1) Deterministic regex pass (score pinned at 1.0 so it wins ties
    #    against lower-confidence AI predictions during merging).
    for label, pattern in REGEX_PATTERNS.items():
        for match in re.finditer(pattern, text):
            findings.append({"start": match.start(), "end": match.end(), "label": label, "score": 1.0})
    # 2) Context-aware NER pass; best-effort, regex findings survive failure.
    try:
        ai_preds = model.predict_entities(text, AI_LABELS, threshold=0.35)
        for p in ai_preds:
            findings.append({"start": p["start"], "end": p["end"], "label": p["label"], "score": p["score"]})
    except Exception:
        # Model unavailable or inference error: degrade to regex-only.
        pass

    # 3) Resolve overlapping spans: keep the higher-score or longer finding.
    findings.sort(key=lambda x: x["start"])
    merged = []
    for f in findings:
        if not merged:
            merged.append(f)
            continue
        last = merged[-1]
        if f["start"] < last["end"]:
            if f["score"] > last["score"] or (f["end"] - f["start"]) > (last["end"] - last["start"]):
                merged[-1] = f
        else:
            merged.append(f)

    # 4) Generate fakes. seen_originals keeps repeated values consistent:
    #    every occurrence of the same original maps to the same fake, which
    #    is the consistency guarantee advertised in the README.
    mapping = {}
    replacements = []
    context = {"last_person": ""}
    used_fakes = set()
    seen_originals = {}

    for ent in merged:
        original = text[ent["start"]:ent["end"]]
        # Skip JSON keys so structured payloads keep their schema.
        if original.lower() in ["person_name", "date_of_birth", "ssn", "mrn", "email", "phone", "address"]:
            continue

        if original in seen_originals:
            fake_val = seen_originals[original]
        else:
            fake_val = get_fake_value(ent["label"], context)
            if fake_val in used_fakes:
                # Disambiguate distinct originals that drew the same fake.
                fake_val = f"{fake_val}_{random.randint(1, 99)}"
            used_fakes.add(fake_val)
            seen_originals[original] = fake_val
            mapping[fake_val] = original

        replacements.append({"start": ent["start"], "end": ent["end"], "fake": fake_val})

    # 5) Splice fakes in right-to-left so earlier offsets stay valid.
    replacements.sort(key=lambda x: x["start"], reverse=True)
    text_chars = list(text)
    for r in replacements:
        text_chars[r["start"]:r["end"]] = list(r["fake"])

    return "".join(text_chars), mapping
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
# --- ENDPOINTS ---
|
| 175 |
+
|
| 176 |
+
async def get_api_key(api_key: str = Security(api_key_header)):
    """Dependency: validate the X-API-Key header against the static key set."""
    if api_key in VALID_API_KEYS:
        return api_key
    # Missing or unknown key: reject the request.
    raise HTTPException(401, "Invalid API Key")
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
class AnonymizeRequest(BaseModel):
    """Payload for /v1/anonymize."""
    # Accepts raw text, a single JSON object, or a list of objects
    # (lists are anonymized item-by-item to stay under model token limits).
    text: Union[str, List[Any], Dict[str, Any]]
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
class RestoreRequest(BaseModel):
    """Payload for /v1/restore."""
    # session_id as returned by /v1/anonymize; text is the LLM output
    # containing fake values to be swapped back to the originals.
    session_id: str
    text: str
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
@app.post("/v1/anonymize")
async def anonymize(req: AnonymizeRequest, api_key: str = Security(get_api_key)):
    """Detect PII in the payload, replace it with fakes, and open a session.

    Accepts a plain string, a single JSON object, or a list of objects.
    Lists are processed item-by-item to avoid model token limits; per-item
    mappings are merged into one session-wide mapping.
    """
    input_data = req.text
    global_mapping = {}
    final_output_str = ""

    # LOGIC: Handle List vs Single String
    if isinstance(input_data, list):
        # Process each record individually to avoid token limits.
        anonymized_list = []
        for item in input_data:
            item_str = json.dumps(item)
            anon_str, item_map = analyze_and_replace(item_str)
            try:
                # Convert back to a dict so the response is well-formed JSON.
                anonymized_list.append(json.loads(anon_str))
            except json.JSONDecodeError:
                # A replacement straddled JSON syntax (rare, e.g. a regex
                # span crossing a quote); keep the raw anonymized string
                # rather than failing the whole batch with a 500.
                anonymized_list.append(anon_str)
            global_mapping.update(item_map)

        # Return as a formatted JSON string.
        final_output_str = json.dumps(anonymized_list, indent=2)

    else:
        # Single object or plain string.
        text_to_process = json.dumps(input_data) if isinstance(input_data, dict) else str(input_data)
        final_output_str, global_mapping = analyze_and_replace(text_to_process)

    # Sessions live in memory only (ephemeral by design); the api_key is
    # stored so /v1/restore can enforce per-key session ownership.
    session_id = str(uuid.uuid4())
    SESSIONS[session_id] = {"mapping": global_mapping, "created": datetime.now(), "api_key": api_key}

    return {
        "anonymized_text": final_output_str,
        "session_id": session_id,
        "entities_found": len(global_mapping)
    }
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
@app.post("/v1/restore")
async def restore(req: RestoreRequest, api_key: str = Security(get_api_key)):
    """Swap fake values in the LLM's response back to the originals.

    The session must exist and belong to the calling API key; both
    failures return 404 so session IDs cannot be probed across tenants.
    """
    session = SESSIONS.get(req.session_id)
    if not session or session["api_key"] != api_key:
        raise HTTPException(404, "Session not found")

    restored = req.text
    # Replace longest fakes first: a fake that is a substring of another
    # (e.g. "POL-123" inside "POL-1234") would otherwise corrupt the
    # longer match if dict insertion order put it first.
    for fake_v, real_v in sorted(session["mapping"].items(), key=lambda kv: len(kv[0]), reverse=True):
        restored = restored.replace(fake_v, real_v)

    return {"restored_text": restored}
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
if __name__ == "__main__":
    # Local development entry point; production uses the Dockerfile CMD.
    listen_port = int(os.environ.get("PORT", 8000))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Web framework + ASGI server
fastapi==0.104.1
uvicorn==0.24.0
# Fake-data generation for replacement entities
faker==20.1.0
python-dotenv==1.0.0
pydantic==2.5.0
# Zero-shot NER model used by the hybrid detection engine
gliner==0.1.11
torch==2.0.1
|