"""Web app exposing a PII-detection API backed by a token-classification model.

Endpoints:
  * ``predict`` API        -- classify a text and return merged entity spans.
  * ``/random_document``   -- return a random row of the loaded dataset.
  * ``/``                  -- serve ``index.html`` from this file's directory.
"""

import os
import random

from datasets import load_dataset
from fastapi.responses import HTMLResponse
from gradio import Server
from transformers import pipeline
import spaces

# The dataset and the model are loaded once at import time and shared by
# every request handler below.
dataset = load_dataset("BramVanroy/CommonCrawl-CreativeCommons-fine", "nld", split="train")

app = Server()

print("Loading OpenAI Privacy Filter model...")
classifier = pipeline(
    task="token-classification",
    model="openai/privacy-filter",
)
print("Model loaded successfully.")

# Tag prefixes that mark a token as continuing (or closing) an open span.
_CONTINUATION_PREFIXES = ("I-", "E-")


def _new_span(token: dict, base_label: str) -> dict:
    """Build a fresh span dict from a single token prediction."""
    return {
        "entity": base_label,
        # float() keeps the score a plain Python number in the returned payload.
        "score": float(token["score"]),
        "start": token["start"],
        "end": token["end"],
        "word": token["word"],
    }


def _merge_token_predictions(text: str, tokens: list) -> list:
    """Merge per-token predictions into contiguous entity spans.

    A token extends the open span only when its tag starts with ``I-`` or
    ``E-`` and its base label equals the open span's label. Any other
    non-``O`` token closes the open span and starts a new one. An ``O``
    token just closes the open span.

    Each span's ``score`` is the score of its first token; later tokens
    only move ``end`` and extend ``word``.
    """
    spans = []
    current = None

    for token in tokens:
        label = token["entity"]

        if label == "O":
            if current is not None:
                spans.append(current)
                current = None
            continue

        # "B-NAME" -> "NAME"; a label without a dash is returned unchanged.
        base_label = label.split("-", 1)[-1]

        extends_current = (
            current is not None
            and label.startswith(_CONTINUATION_PREFIXES)
            and current["entity"] == base_label
        )
        if extends_current:
            current["end"] = token["end"]
            current["word"] += token["word"].replace("##", "").replace("Ġ", " ")
        else:
            if current is not None:
                spans.append(current)
            current = _new_span(token, base_label)

    if current is not None:
        spans.append(current)

    # Token strings can carry tokenizer artifacts (the "##"/"Ġ" markers
    # stripped above are examples), so the concatenation need not match the
    # input. When character offsets are available, report the exact input
    # substring instead; otherwise keep the token-based fallback.
    for span in spans:
        start, end = span["start"], span["end"]
        if start is not None and end is not None:
            span["word"] = text[start:end]

    return spans


@app.api(name="predict")
@spaces.GPU
def predict(text: str) -> list:
    """Detect PII in the given text and return aggregated spans.

    Args:
        text: The text to analyse.

    Returns:
        A list of dicts with keys ``entity`` (label without its tag prefix),
        ``score`` (score of the span's first token), ``start``/``end``
        (offsets as reported by the classifier) and ``word`` (the matching
        substring of ``text`` when offsets are available). Blank input
        yields an empty list.
    """
    if not text or not text.strip():
        # Nothing to classify; skip the model call entirely.
        return []
    return _merge_token_predictions(text, classifier(text))


@app.get("/random_document")
async def random_document():
    """Return one randomly chosen dataset row plus its index and the dataset size."""
    total = len(dataset)
    idx = random.randrange(total)
    doc = dict(dataset[idx])
    return {"_index": idx, "_total": total, **doc}


@app.get("/")
async def homepage():
    """Serve the ``index.html`` file located next to this script."""
    html_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "index.html")
    with open(html_path, "r", encoding="utf-8") as f:
        return HTMLResponse(content=f.read())


if __name__ == "__main__":
    app.launch(show_error=True)