""" OpenAI Privacy Filter — inference module for DLP Paste-Proxy. Thin wrapper around `transformers.pipeline("token-classification", ...)` for `openai/privacy-filter`. The pipeline's `aggregation_strategy="simple"` takes care of BIOES → char-level span reconstruction for us. Public API is unchanged from earlier revisions: predict_text(text) -> (source_text, spans) where each span is {label, start, end, text}. """ from __future__ import annotations import functools import os import torch MODEL_REPO = os.getenv("MODEL_ID", "openai/privacy-filter") HF_TOKEN = os.getenv("HF_TOKEN", None) @functools.lru_cache(maxsize=1) def _get_pipeline(): from transformers import pipeline return pipeline( task="token-classification", model=MODEL_REPO, aggregation_strategy="simple", device=0, torch_dtype=torch.bfloat16, token=HF_TOKEN, ) def predict_text(text: str) -> tuple[str, list[dict]]: """Returns (source_text, spans). `spans` is a list of {label, start, end, text} with character offsets into `text`.""" if not text or not text.strip(): return text, [] pipe = _get_pipeline() results = pipe(text) spans = [] for r in results: label = r.get("entity_group") or r.get("entity") if not label or label == "O": continue s, e = int(r["start"]), int(r["end"]) if e <= s or s < 0 or e > len(text): continue spans.append({"label": label, "start": s, "end": e, "text": text[s:e]}) return text, spans