"""
OpenAI Privacy Filter — inference module for DLP Paste-Proxy.

Thin wrapper around `transformers.pipeline("token-classification", ...)` for
`openai/privacy-filter`. The pipeline's `aggregation_strategy="simple"` takes
care of BIOES → char-level span reconstruction for us.

Public API is unchanged from earlier revisions:
    predict_text(text) -> (source_text, spans)
where each span is {label, start, end, text}.
"""

from __future__ import annotations

import functools
import os

import torch

# Model repository handed to the pipeline; override with the MODEL_ID env var.
MODEL_REPO = os.environ.get("MODEL_ID", "openai/privacy-filter")
# Optional access token handed to the pipeline; None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")


@functools.lru_cache(maxsize=1)
def _get_pipeline():
    """Build the token-classification pipeline and cache it for reuse.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    pipeline is constructed on the first call and the same object is returned
    on every later call.

    The pipeline runs on CUDA device 0 when a CUDA device is available and
    falls back to the CPU otherwise, instead of failing at construction time
    on hosts without a GPU.

    Returns:
        The object returned by ``transformers.pipeline`` for the
        ``"token-classification"`` task, loaded from ``MODEL_REPO`` with
        ``aggregation_strategy="simple"``.
    """
    # Deferred import: transformers is only needed once a pipeline is built.
    from transformers import pipeline

    # transformers uses -1 as the device ordinal for "run on CPU".
    device = 0 if torch.cuda.is_available() else -1
    return pipeline(
        task="token-classification",
        model=MODEL_REPO,
        aggregation_strategy="simple",
        device=device,
        torch_dtype=torch.bfloat16,
        token=HF_TOKEN,
    )


def predict_text(text: str) -> tuple[str, list[dict]]:
    """Run the privacy-filter pipeline over `text` and collect labelled spans.

    Returns `(source_text, spans)`, where `source_text` is `text` unchanged
    and each span is a dict `{label, start, end, text}`. `start`/`end` are
    offsets into `text` and the span's `text` is `text[start:end]`.

    Empty or whitespace-only input returns `(text, [])` without touching the
    pipeline. Results labelled "O", results without a label, and results
    whose offsets are empty or fall outside `text` are dropped.
    """
    # Nothing to classify: return early so the model is never loaded.
    if not (text and text.strip()):
        return text, []

    text_len = len(text)
    found = []
    for entity in _get_pipeline()(text):
        # Aggregated results carry "entity_group"; fall back to "entity".
        tag = entity.get("entity_group") or entity.get("entity")
        if tag and tag != "O":
            begin = int(entity["start"])
            stop = int(entity["end"])
            # Keep only non-empty spans lying fully inside the source text.
            if 0 <= begin < stop <= text_len:
                found.append(
                    {
                        "label": tag,
                        "start": begin,
                        "end": stop,
                        "text": text[begin:stop],
                    }
                )
    return text, found