File size: 2,656 Bytes
07d284c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87656ae
07d284c
a03138e
07d284c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
from fastapi.responses import HTMLResponse
from gradio import Server
from transformers import pipeline
import spaces

# Gradio server object; the API endpoint and the "/" route in this file are
# registered on it via its decorators.
app = Server()

# The token-classification pipeline is constructed once, at import time, and the
# resulting callable is shared by every request. The print calls bracket the
# (potentially slow) model load so its progress is visible in the process log.
print("Loading OpenAI Privacy Filter model...")
classifier = pipeline(
    task="token-classification",
    model="openai/privacy-filter",
)
print("Model loaded successfully.")

def _strip_subword_markers(token: str) -> str:
    """Drop WordPiece ("##") and byte-level BPE ("Ġ") markers from a raw token."""
    return token.replace("##", "").replace("Ġ", " ")


def _continues_span(prev_end, next_start, text: str) -> bool:
    """Return True when only whitespace separates two character offsets.

    When either offset is missing (the tokenizer supplied no offset mapping)
    adjacency cannot be checked, so the tokens are treated as contiguous.
    """
    if prev_end is None or next_start is None:
        return True
    return not text[prev_end:next_start].strip()


def _close_span(span: dict, token_scores: list, pieces: list, text: str) -> dict:
    """Finalize an open span: average its token scores and set its surface text."""
    span["score"] = sum(token_scores) / len(token_scores)
    start, end = span["start"], span["end"]
    if start is not None and end is not None:
        # Slicing the input keeps "word" exactly in sync with start/end and
        # avoids guessing how subword tokens should be glued back together.
        span["word"] = text[start:end]
    else:
        span["word"] = "".join(pieces).strip()
    return span


def _merge_token_spans(tokens, text: str) -> list:
    """Merge per-token predictions (B-/I-/E-/S- or unprefixed labels) into spans.

    Args:
        tokens: Iterable of dicts with the keys "entity", "score", "word",
            "start" and "end", as read from the pipeline output.
        text: The string the tokens were predicted on; used to check adjacency
            and to recover each span's surface text.

    Returns:
        A list of dicts with the keys "entity" (label without its prefix),
        "score" (mean of the merged token scores), "start", "end" and "word".
    """
    spans = []
    open_span = None
    token_scores = []
    pieces = []

    for token in tokens:
        label = token["entity"]

        if label == "O":
            if open_span is not None:
                spans.append(_close_span(open_span, token_scores, pieces, text))
                open_span = None
            continue

        _, dash, remainder = label.partition("-")
        base_label = remainder if dash else label

        # The pipeline filters out "O" predictions by default, so two labelled
        # tokens that are neighbours in this list may be far apart in the text.
        # Only extend the open span when the new token really follows it.
        extends_open_span = (
            label.startswith(("I-", "E-"))
            and open_span is not None
            and open_span["entity"] == base_label
            and _continues_span(open_span["end"], token["start"], text)
        )

        if extends_open_span:
            open_span["end"] = token["end"]
            token_scores.append(float(token["score"]))
            pieces.append(_strip_subword_markers(token["word"]))
            continue

        # B-/S- tokens, unprefixed labels, and I-/E- tokens that cannot extend
        # the open span all begin a new span.
        if open_span is not None:
            spans.append(_close_span(open_span, token_scores, pieces, text))
        open_span = {
            "entity": base_label,
            "score": float(token["score"]),
            "start": token["start"],
            "end": token["end"],
            "word": token["word"],
        }
        token_scores = [float(token["score"])]
        pieces = [_strip_subword_markers(token["word"])]

    if open_span is not None:
        spans.append(_close_span(open_span, token_scores, pieces, text))

    return spans


@app.api(name="predict")
@spaces.GPU
def predict(text: str) -> list:
    """Detect PII in the given text and return aggregated spans.

    Args:
        text: The text to scan.

    Returns:
        A list of span dicts with the keys "entity", "score", "start", "end"
        and "word". Blank input yields an empty list without running the model.
    """
    if not text or not text.strip():
        return []
    return _merge_token_spans(classifier(text), text)

@app.get("/")
async def homepage():
    """Serve the ``index.html`` file that sits next to this module."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    page_path = os.path.join(module_dir, "index.html")
    with open(page_path, "r", encoding="utf-8") as page:
        markup = page.read()
    return HTMLResponse(content=markup)

# Launch the server only when this file is executed as a script; importing the
# module defines the app without starting it.
if __name__ == "__main__":
    # NOTE(review): show_error=True presumably surfaces server-side errors to
    # the client — confirm against the Gradio launch() documentation.
    app.launch(show_error=True)