Manikrishneshwar Sasidhar committed
Commit bfcecff · verified · 1 Parent(s): 693eef2

Initial upload: BERT+GAT PII redactor

README.md ADDED
@@ -0,0 +1,102 @@
---
language:
- en
license: mit
library_name: transformers
tags:
- pii
- privacy
- redaction
- token-classification
- ner
- bert
- gat
- graph-attention-network
pipeline_tag: token-classification
---

# PII Redactor — BERT + Graph Attention Network

Token-level PII detection model that combines a BERT contextual encoder
with a Graph Attention Network (GAT) refinement stage. The graph mixes
sequential-window edges with top-k attention edges drawn from BERT's last
layer, letting the GAT exploit both locality and the long-range
dependencies BERT already discovered.

The model emits BIO tags over 15 PII categories: `SSN`, `BANK_ACCOUNT`,
`ROUTING_NUMBER`, `CREDIT_CARD`, `CVV`, `CARD_EXPIRY`, `IBAN`, `DOB`,
`FULL_NAME`, `EMAIL`, `PHONE`, `ADDRESS`, `PASSPORT`, `DRIVERS_LICENSE`,
`TAX_ID`.

## Quick start

```python
from transformers import AutoModel, AutoTokenizer

REPO = "your-username/pii-redactor-bert-gat"  # <-- replace

tokenizer = AutoTokenizer.from_pretrained(REPO, trust_remote_code=True)
model = AutoModel.from_pretrained(REPO, trust_remote_code=True)
model.eval()

result = model.predict(
    "Email me at john.doe@example.com or call 555-123-4567.",
    tokenizer,
)
print(result["redacted"])
# -> "Email me at [EMAIL] or call [PHONE]."
print(result["spans"])
# -> [{'start': 12, 'end': 32, 'label': 'EMAIL', 'value': 'john.doe@example.com'}, ...]
```

`trust_remote_code=True` is required because the architecture (BERT + GAT)
is custom and ships as `modeling_bert_gat.py` in this repository.

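The `predict` helper processes one string at a time; for larger jobs you can move the model to a GPU and loop over texts. A minimal sketch (the input list and device check below are illustrative, not part of the repo):

```python
import torch

texts = [
    "My SSN is 123-45-6789.",
    "Card 4111 1111 1111 1111, exp 04/27.",
]  # hypothetical inputs

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

for t in texts:
    out = model.predict(t, tokenizer, device=torch.device(device))
    print(out["redacted"])
```
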
## Architecture

```
input_ids ──► BERT encoder (with output_attentions=True)
          │
          ▼
token embeddings + last-layer attention
          │
          ▼
build_token_graph(window=3, top_k=5)
          │
          ▼
stack of GATConv layers (heads=4, hidden=128)
          │
          ▼
residual + LayerNorm ──► classifier ──► BIO logits
```

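The graph stage is the part that differs from a plain BERT tagger. The sketch below is an illustrative, self-contained re-statement of the same idea — sequential edges within ±window plus each token's head-averaged top-k attention targets — run on a toy attention tensor; the authoritative version is `_build_token_graph` in `modeling_bert_gat.py`.

```python
import torch

def toy_token_graph(attn: torch.Tensor, window: int = 3, top_k: int = 5):
    """attn: (heads, seq_len, seq_len) last-layer attention for one sequence."""
    seq_len = attn.shape[-1]
    src, dst, wt = [], [], []

    # 1) Sequential edges: each token connects to its neighbours within ±window.
    for i in range(seq_len):
        for j in range(max(0, i - window), min(seq_len, i + window + 1)):
            if i != j:
                src.append(i)
                dst.append(j)
                wt.append(1.0)

    # 2) Attention edges: head-averaged top-k targets per token.
    avg = attn.mean(dim=0)                             # (seq_len, seq_len)
    vals, idx = avg.topk(min(top_k, seq_len), dim=-1)
    for i in range(seq_len):
        for j, w in zip(idx[i].tolist(), vals[i].tolist()):
            if i != j and w > 1e-4:
                src.append(i)
                dst.append(j)
                wt.append(w)

    edge_index = torch.tensor([src, dst], dtype=torch.long)       # (2, num_edges)
    edge_attr = torch.tensor(wt, dtype=torch.float).unsqueeze(1)  # (num_edges, 1)
    return edge_index, edge_attr

# Toy run: 6 tokens, 4 heads of random "attention" weights.
ei, ea = toy_token_graph(torch.rand(4, 6, 6), window=3, top_k=5)
print(ei.shape, ea.shape)
```
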
## Inputs / outputs

* **Input:** raw text string.
* **Output:** dict with `original`, `redacted`, and `spans` (list of
  `{start, end, label, value}`).

## Intended use

* Pre-processing user-generated text before logging or storing.
* Building privacy-preserving data pipelines.
* Demonstrating BERT + graph-network hybrids for NER.

## Limitations

* Trained on synthetic English PII; real-world distributions may differ.
* Latency is higher than vanilla BERT-NER because the graph is built and
  the GAT runs per sample.
* Coverage is limited to the 15 categories above.

## Requirements

```text
torch>=2.0
transformers>=4.30
torch-geometric>=2.3
```

## License

MIT.
config.json ADDED
@@ -0,0 +1,86 @@
{
  "architectures": [
    "BertGATForTokenClassification"
  ],
  "auto_map": {
    "AutoConfig": "configuration_bert_gat.BertGATConfig",
    "AutoModel": "modeling_bert_gat.BertGATForTokenClassification"
  },
  "bert_model_name": "distilbert-base-uncased",
  "dropout": 0.0,
  "dtype": "float32",
  "gat_heads": 4,
  "gat_hidden": 128,
  "gat_layers": 2,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LABEL_29",
    "30": "LABEL_30"
  },
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 15,
    "LABEL_16": 16,
    "LABEL_17": 17,
    "LABEL_18": 18,
    "LABEL_19": 19,
    "LABEL_2": 2,
    "LABEL_20": 20,
    "LABEL_21": 21,
    "LABEL_22": 22,
    "LABEL_23": 23,
    "LABEL_24": 24,
    "LABEL_25": 25,
    "LABEL_26": 26,
    "LABEL_27": 27,
    "LABEL_28": 28,
    "LABEL_29": 29,
    "LABEL_3": 3,
    "LABEL_30": 30,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "max_length": 256,
  "model_type": "bert_gat_pii",
  "top_k_attn": 5,
  "transformers_version": "5.1.0",
  "window": 3
}
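Note that `id2label` / `label2id` are left at transformers' generic `LABEL_N` defaults; the human-readable BIO tags live in `ID2LABEL` inside `modeling_bert_gat.py`. A quick sketch to print the index-to-tag mapping, assuming the label ordering defined in that file:

```python
# Reconstructed from the LABELS list in modeling_bert_gat.py (assumption:
# the classifier's output indices follow that ordering).
PII_TYPES = [
    "SSN", "BANK_ACCOUNT", "ROUTING_NUMBER", "CREDIT_CARD", "CVV",
    "CARD_EXPIRY", "IBAN", "DOB", "FULL_NAME", "EMAIL", "PHONE",
    "ADDRESS", "PASSPORT", "DRIVERS_LICENSE", "TAX_ID",
]
LABELS = ["O"] + [tag for t in PII_TYPES for tag in (f"B-{t}", f"I-{t}")]

for idx, tag in enumerate(LABELS):
    print(f"LABEL_{idx} -> {tag}")  # e.g. LABEL_0 -> O, LABEL_1 -> B-SSN, ...
```
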
configuration_bert_gat.py ADDED
@@ -0,0 +1,36 @@
"""
HuggingFace-compatible config for the BERT+GAT PII redactor.

When the model repo is loaded with ``trust_remote_code=True``,
``transformers`` will instantiate this class from ``config.json``.
"""

from transformers import PretrainedConfig


class BertGATConfig(PretrainedConfig):
    model_type = "bert_gat_pii"

    def __init__(
        self,
        bert_model_name: str = "distilbert-base-uncased",
        num_labels: int = 31,
        gat_heads: int = 4,
        gat_hidden: int = 128,
        gat_layers: int = 2,
        dropout: float = 0.1,
        window: int = 3,
        top_k_attn: int = 5,
        max_length: int = 512,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.bert_model_name = bert_model_name
        self.num_labels = num_labels
        self.gat_heads = gat_heads
        self.gat_hidden = gat_hidden
        self.gat_layers = gat_layers
        self.dropout = dropout
        self.window = window
        self.top_k_attn = top_k_attn
        self.max_length = max_length
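For local experiments the config can be built and round-tripped without touching the Hub. A minimal sketch using the standard `PretrainedConfig` save/load API (the output directory name is illustrative):

```python
from configuration_bert_gat import BertGATConfig

# Non-default hyperparameters, just to show the knobs exposed above.
cfg = BertGATConfig(gat_layers=3, window=2, top_k_attn=8)
cfg.save_pretrained("./bert-gat-config-demo")        # writes config.json
reloaded = BertGATConfig.from_pretrained("./bert-gat-config-demo")
print(reloaded.gat_layers, reloaded.window, reloaded.top_k_attn)  # 3 2 8
```
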
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8c8cf6d12b41debd9a1a5d1b92360b35affd6fdae539109c39539e6db608d70e
size 269749308
modeling_bert_gat.py ADDED
@@ -0,0 +1,230 @@
"""
HuggingFace-compatible wrapper around BertGATPIIModel.

Self-contained on purpose: the Hub repo doesn't import ``pii_redactor``,
so we redeclare the architecture here. This is the file ``transformers``
loads when a user does::

    from transformers import AutoModel
    model = AutoModel.from_pretrained(
        "your-username/pii-redactor-bert-gat", trust_remote_code=True
    )
"""

from typing import List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoConfig, AutoModel, AutoTokenizer, PreTrainedModel

# Dual-mode import: works both when this file is loaded as part of a
# package (HuggingFace's ``trust_remote_code=True`` flow) and when it's
# imported as a sibling module by a script like ``convert_checkpoint.py``.
try:
    from .configuration_bert_gat import BertGATConfig
except ImportError:
    from configuration_bert_gat import BertGATConfig

try:
    from torch_geometric.nn import GATConv
except ImportError as e:  # pragma: no cover
    raise ImportError(
        "torch-geometric is required. Install with: pip install torch-geometric"
    ) from e


# --------------------------------------------------------------------------- #
# Label space (kept in sync with pii_redactor.config)
# --------------------------------------------------------------------------- #
PII_TYPES = [
    "SSN", "BANK_ACCOUNT", "ROUTING_NUMBER", "CREDIT_CARD", "CVV",
    "CARD_EXPIRY", "IBAN", "DOB", "FULL_NAME", "EMAIL", "PHONE",
    "ADDRESS", "PASSPORT", "DRIVERS_LICENSE", "TAX_ID",
]
LABELS = ["O"] + sum(([f"B-{t}", f"I-{t}"] for t in PII_TYPES), [])
ID2LABEL = {i: l for i, l in enumerate(LABELS)}


# --------------------------------------------------------------------------- #
# Graph builder (mirrors pii_redactor.models.graph_builder)
# --------------------------------------------------------------------------- #
def _build_token_graph(
    seq_len: int,
    attn_weights: torch.Tensor,
    window: int,
    top_k: int,
    device: torch.device,
) -> Tuple[torch.Tensor, torch.Tensor]:
    src_list, dst_list, wt_list = [], [], []

    for i in range(seq_len):
        for j in range(max(0, i - window), min(seq_len, i + window + 1)):
            if i != j:
                src_list.append(i)
                dst_list.append(j)
                wt_list.append(1.0)

    avg_attn = attn_weights.mean(dim=0)
    topk_vals, topk_idx = avg_attn.topk(min(top_k, seq_len), dim=-1)
    for i in range(seq_len):
        for ki in range(topk_idx.shape[1]):
            j = topk_idx[i, ki].item()
            wt = topk_vals[i, ki].item()
            if i != j and wt > 1e-4:
                src_list.append(i)
                dst_list.append(j)
                wt_list.append(wt)

    edge_index = torch.tensor([src_list, dst_list], dtype=torch.long, device=device)
    edge_attr = torch.tensor(wt_list, dtype=torch.float, device=device).unsqueeze(1)
    return edge_index, edge_attr


# --------------------------------------------------------------------------- #
# Model
# --------------------------------------------------------------------------- #
class BertGATForTokenClassification(PreTrainedModel):
    config_class = BertGATConfig
    base_model_prefix = "bert_gat_pii"

    # This model has no tied weights (no shared embeddings, no encoder-
    # decoder). Different transformers versions look for either the
    # old ``_tied_weights_keys`` (list) or the newer
    # ``all_tied_weights_keys`` (dict); declaring both empty keeps
    # ``from_pretrained``'s post-load tied-weight bookkeeping happy.
    _tied_weights_keys: list = []
    all_tied_weights_keys: dict = {}

    def __init__(self, config: BertGATConfig):
        super().__init__(config)

        # Instantiate the BERT trunk EMPTY (no weight download here). The
        # outer ``from_pretrained`` populates everything — including these
        # parameters — from the saved state dict. Calling
        # ``AutoModel.from_pretrained`` here would clash with the meta-
        # device context the outer loader sets up.
        bert_config = AutoConfig.from_pretrained(config.bert_model_name)
        bert_config.output_attentions = True
        self.bert = AutoModel.from_config(bert_config)
        bert_dim = self.bert.config.hidden_size

        self.dropout = nn.Dropout(config.dropout)
        self.window = config.window
        self.top_k = config.top_k_attn

        self.gat_layers = nn.ModuleList()
        in_dim = bert_dim
        for _ in range(config.gat_layers):
            self.gat_layers.append(
                GATConv(in_dim, config.gat_hidden, heads=config.gat_heads,
                        concat=True, dropout=config.dropout, edge_dim=1)
            )
            in_dim = config.gat_hidden * config.gat_heads

        self.layer_norm = nn.LayerNorm(in_dim)
        self.residual_proj = nn.Linear(bert_dim, in_dim)
        self.classifier = nn.Linear(in_dim, config.num_labels)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        labels: Optional[torch.Tensor] = None,
    ):
        B, L = input_ids.shape
        device = input_ids.device

        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        token_embs = bert_out.last_hidden_state
        last_attn = bert_out.attentions[-1]

        gat_outputs = []
        for b in range(B):
            seq_len = int(attention_mask[b].sum().item())
            attn_b = last_attn[b, :, :seq_len, :seq_len]
            edge_idx, edge_attr = _build_token_graph(
                seq_len, attn_b, self.window, self.top_k, device,
            )

            h_real = token_embs[b, :seq_len]
            h_res = h_real
            for gat in self.gat_layers:
                h_real = self.dropout(h_real)
                h_real = gat(h_real, edge_idx, edge_attr=edge_attr)
                h_real = F.elu(h_real)
            h_real = self.layer_norm(h_real + self.residual_proj(h_res))

            pad_len = L - seq_len
            if pad_len > 0:
                pad = torch.zeros(pad_len, h_real.shape[-1], device=device)
                h_real = torch.cat([h_real, pad], dim=0)
            gat_outputs.append(h_real)

        gat_embs = torch.stack(gat_outputs, dim=0)
        logits = self.classifier(self.dropout(gat_embs))

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss(ignore_index=-100)(
                logits.view(-1, self.config.num_labels), labels.view(-1)
            )
        return {"loss": loss, "logits": logits}

    # ---- Convenience inference helpers -------------------------------------
    @torch.no_grad()
    def predict(
        self,
        text: str,
        tokenizer: AutoTokenizer,
        device: Optional[torch.device] = None,
    ) -> dict:
        device = device or next(self.parameters()).device
        enc = tokenizer(
            text,
            return_tensors="pt",
            return_offsets_mapping=True,
            truncation=True,
            max_length=self.config.max_length,
        )
        input_ids = enc["input_ids"].to(device)
        attention_mask = enc["attention_mask"].to(device)
        offsets = enc["offset_mapping"].squeeze(0).tolist()

        out = self(input_ids, attention_mask)
        preds = out["logits"].squeeze(0).argmax(dim=-1).cpu().tolist()

        # preds and offsets are aligned 1:1 by index; iterate them
        # together (zip-style) so that special tokens — whose offset is
        # (0, 0) — and their matching prediction are skipped as a pair.
        spans: List[dict] = []
        cur_lbl, cur_start, cur_end = None, None, None
        for pred_id, (tok_s, tok_e) in zip(preds, offsets):
            if tok_s == tok_e:
                continue
            pred_lbl = ID2LABEL[pred_id]
            if pred_lbl.startswith("B-"):
                if cur_lbl:
                    spans.append({"start": cur_start, "end": cur_end, "label": cur_lbl})
                cur_lbl, cur_start, cur_end = pred_lbl[2:], tok_s, tok_e
            elif pred_lbl.startswith("I-") and cur_lbl == pred_lbl[2:]:
                cur_end = tok_e
            else:
                if cur_lbl:
                    spans.append({"start": cur_start, "end": cur_end, "label": cur_lbl})
                cur_lbl, cur_start, cur_end = None, None, None
        if cur_lbl:
            spans.append({"start": cur_start, "end": cur_end, "label": cur_lbl})

        for sp in spans:
            sp["value"] = text[sp["start"]:sp["end"]]

        redacted = text
        for sp in sorted(spans, key=lambda s: s["start"], reverse=True):
            redacted = redacted[:sp["start"]] + f"[{sp['label']}]" + redacted[sp["end"]:]
        return {"original": text, "redacted": redacted, "spans": spans}


# Hooks so AutoModel can find this class via the config's auto_map.
BertGATConfig.register_for_auto_class("AutoConfig")
BertGATForTokenClassification.register_for_auto_class("AutoModel")
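Because `forward()` accepts `labels` and returns a loss, the uploaded weights can be fine-tuned further. A minimal, hedged single-step sketch — the labels here are placeholders (tag id 0, i.e. `O`, everywhere, with `-100` on special tokens so the loss ignores them); real training needs a proper BIO-labelled dataset:

```python
import torch
from transformers import AutoModel, AutoTokenizer

REPO = "your-username/pii-redactor-bert-gat"  # <-- replace, as in the README
tokenizer = AutoTokenizer.from_pretrained(REPO, trust_remote_code=True)
model = AutoModel.from_pretrained(REPO, trust_remote_code=True)
model.train()

enc = tokenizer(["My SSN is 123-45-6789."], return_tensors="pt", padding=True)
labels = torch.zeros_like(enc["input_ids"])  # placeholder "O" labels
labels[0, 0] = -100    # [CLS]
labels[0, -1] = -100   # [SEP]

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
out = model(enc["input_ids"], enc["attention_mask"], labels=labels)
out["loss"].backward()
optimizer.step()
print(float(out["loss"]))
```
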
tokenizer.json ADDED
The diff for this file is too large to render.
 
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
{
  "backend": "tokenizers",
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "is_local": true,
  "mask_token": "[MASK]",
  "model_max_length": 512,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}