"""
OpenAI Privacy Filter — inference module for DLP Paste-Proxy.
Thin wrapper around `transformers.pipeline("token-classification", ...)` for
`openai/privacy-filter`. The pipeline's `aggregation_strategy="simple"` takes
care of BIOES → char-level span reconstruction for us.
Public API is unchanged from earlier revisions:
predict_text(text) -> (source_text, spans)
where each span is {label, start, end, text}.
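
Example (illustrative only; the label names depend on the model's label set):

    predict_text("Email me at jane@example.com")
    -> ("Email me at jane@example.com",
        [{"label": "EMAIL", "start": 12, "end": 28, "text": "jane@example.com"}])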
"""
from __future__ import annotations

import functools
import os

import torch

# Model repo and auth token are read from the environment so deployments can
# override them without code changes.
MODEL_REPO = os.getenv("MODEL_ID", "openai/privacy-filter")
HF_TOKEN = os.getenv("HF_TOKEN")  # None -> anonymous Hub access

@functools.lru_cache(maxsize=1)
def _get_pipeline():
    """Build the token-classification pipeline once and cache it."""
    # Imported lazily so importing this module stays cheap.
    from transformers import pipeline

    return pipeline(
        task="token-classification",
        model=MODEL_REPO,
        aggregation_strategy="simple",  # merge sub-tokens into entity spans
        device=0 if torch.cuda.is_available() else -1,  # GPU if available, else CPU
        torch_dtype=torch.bfloat16,
        token=HF_TOKEN,
    )

def predict_text(text: str) -> tuple[str, list[dict]]:
    """Returns (source_text, spans). `spans` is a list of
    {label, start, end, text} with character offsets into `text`."""
    if not text or not text.strip():
        return text, []
    pipe = _get_pipeline()
    results = pipe(text)
    spans = []
    for r in results:
        # Aggregated output uses "entity_group"; fall back to "entity".
        label = r.get("entity_group") or r.get("entity")
        if not label or label == "O":
            continue  # "O" marks the non-entity class
        s, e = int(r["start"]), int(r["end"])
        if e <= s or s < 0 or e > len(text):
            continue  # drop empty or out-of-range offsets
        spans.append({"label": label, "start": s, "end": e, "text": text[s:e]})
    return text, spans
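
if __name__ == "__main__":
    # Minimal smoke test. The sample text is illustrative, and which labels
    # get printed depends entirely on the model's label set (an assumption
    # here, not something this module guarantees).
    sample = "Contact Jane Doe at jane.doe@example.com or +1-202-555-0143."
    _, found = predict_text(sample)
    for span in found:
        print(f"{span['label']:>12} [{span['start']}:{span['end']}] {span['text']!r}")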