# NOTE(review): stray Hugging Face Spaces page text ("Spaces:" / "Running on Zero")
# was captured above the module — extraction artifact, not code; kept as a comment
# so the module parses.
"""
OpenAI Privacy Filter — inference module for DLP Paste-Proxy.

Thin wrapper around `transformers.pipeline("token-classification", ...)` for
`openai/privacy-filter`. The pipeline's `aggregation_strategy="simple"` takes
care of BIOES → char-level span reconstruction for us.

Public API is unchanged from earlier revisions:

    predict_text(text) -> (source_text, spans)

where each span is {label, start, end, text}.
"""
from __future__ import annotations

import functools
import os

import torch

# Model repository to load; override via MODEL_ID for forks/local checkpoints.
MODEL_REPO = os.getenv("MODEL_ID", "openai/privacy-filter")
# Optional Hugging Face access token for gated/private repos; None = anonymous.
HF_TOKEN = os.getenv("HF_TOKEN", None)
@functools.cache
def _get_pipeline():
    """Build and return the token-classification pipeline (cached).

    `functools.cache` ensures the model is loaded exactly once per process;
    without it, every `predict_text` call reconstructed the pipeline and
    re-loaded the checkpoint. (The module imports `functools` for this.)

    Returns:
        A `transformers` pipeline with `aggregation_strategy="simple"`, which
        merges BIOES sub-token tags into char-level entity spans for us.
    """
    # Deferred import: `transformers` is heavy, and this keeps module import fast.
    from transformers import pipeline

    return pipeline(
        task="token-classification",
        model=MODEL_REPO,
        aggregation_strategy="simple",
        # GPU 0 when CUDA is present; -1 = CPU. The previous hard-coded
        # `device=0` crashed on CPU-only hosts.
        device=0 if torch.cuda.is_available() else -1,
        torch_dtype=torch.bfloat16,
        token=HF_TOKEN,
    )
def predict_text(text: str) -> tuple[str, list[dict]]:
    """Run the privacy-filter model over *text*.

    Returns:
        ``(source_text, spans)`` where ``spans`` is a list of
        ``{label, start, end, text}`` dicts with character offsets into *text*.
        Empty or whitespace-only input short-circuits to ``(text, [])``.
    """
    # Nothing to classify — skip the model entirely.
    if not text or not text.strip():
        return text, []

    predictions = _get_pipeline()(text)
    limit = len(text)

    spans: list[dict] = []
    for pred in predictions:
        # Aggregated pipelines emit "entity_group"; raw ones emit "entity".
        tag = pred.get("entity_group") or pred.get("entity")
        if not tag or tag == "O":
            continue  # non-entity token — ignore
        start, end = int(pred["start"]), int(pred["end"])
        # Keep only well-formed, in-bounds character ranges.
        if 0 <= start < end <= limit:
            spans.append(
                {"label": tag, "start": start, "end": end, "text": text[start:end]}
            )
    return text, spans