"""
OpenAI Privacy Filter — inference module for DLP Paste-Proxy.
Thin wrapper around `transformers.pipeline("token-classification", ...)` for
`openai/privacy-filter`. The pipeline's `aggregation_strategy="simple"` takes
care of BIOES → char-level span reconstruction for us.
Public API is unchanged from earlier revisions:
predict_text(text) -> (source_text, spans)
where each span is {label, start, end, text}.
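
Example (illustrative only; the label names depend on the model's label set):

    predict_text("Email me at jane@example.com")
    -> ("Email me at jane@example.com",
        [{"label": "EMAIL", "start": 12, "end": 28, "text": "jane@example.com"}])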
"""
from __future__ import annotations

import functools
import os

import torch

# Model repo and auth token are read from the environment so deployments can
# override them without code changes.
MODEL_REPO = os.getenv("MODEL_ID", "openai/privacy-filter")
HF_TOKEN = os.getenv("HF_TOKEN")  # None -> anonymous Hub access

@functools.lru_cache(maxsize=1)
def _get_pipeline():
    """Build the token-classification pipeline once and cache it."""
    # Imported lazily so importing this module stays cheap.
    from transformers import pipeline

    return pipeline(
        task="token-classification",
        model=MODEL_REPO,
        aggregation_strategy="simple",  # merge sub-tokens into entity spans
        device=0 if torch.cuda.is_available() else -1,  # GPU if available, else CPU
        torch_dtype=torch.bfloat16,
        token=HF_TOKEN,
    )

def predict_text(text: str) -> tuple[str, list[dict]]:
    """Returns (source_text, spans). `spans` is a list of
    {label, start, end, text} with character offsets into `text`."""
    if not text or not text.strip():
        return text, []
    pipe = _get_pipeline()
    results = pipe(text)
    spans = []
    for r in results:
        # Aggregated output uses "entity_group"; fall back to "entity".
        label = r.get("entity_group") or r.get("entity")
        if not label or label == "O":
            continue  # "O" marks the non-entity class
        s, e = int(r["start"]), int(r["end"])
        if e <= s or s < 0 or e > len(text):
            continue  # drop empty or out-of-range offsets
        spans.append({"label": label, "start": s, "end": e, "text": text[s:e]})
    return text, spans
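
if __name__ == "__main__":
    # Minimal smoke test. The sample text is illustrative, and which labels
    # get printed depends entirely on the model's label set (an assumption
    # here, not something this module guarantees).
    sample = "Contact Jane Doe at jane.doe@example.com or +1-202-555-0143."
    _, found = predict_text(sample)
    for span in found:
        print(f"{span['label']:>12} [{span['start']}:{span['end']}] {span['text']!r}")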