Add files using upload-large-folder tool

Browse files

Files changed (8) hide show

.claude/settings.local.json +11 -0
.gitattributes +1 -0
README.md +80 -0
config.json +120 -0
model.safetensors +3 -0
tokenizer.json +3 -0
tokenizer_config.json +11 -0
viterbi_calibration.json +14 -0

.claude/settings.local.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "permissions": {
+    "allow": [
+      "Bash(hf --help)",
+      "Bash(hf auth:*)",
+      "Bash(hf repos:*)",
+      "Bash(hf upload:*)",
+      "Bash(hf upload-large-folder:*)"
+    ]
+  }
+}

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,80 @@

+---
+license: apache-2.0
+pipeline_tag: token-classification
+library_name: burn
+tags:
+- rust
+- burn
+- privacy
+- PII
+- NER
+- token-classification
+- openai
+base_model: openai/privacy-filter
+---
+# OpenAI Privacy Filter — Rust/Burn Weights
+Safetensors weights for [openai/privacy-filter](https://huggingface.co/openai/privacy-filter), packaged for inference with [privacy-filter-rs](https://github.com/eugenehp/privacy-filter-rs) (pure-Rust, Burn ML framework).
+## Contents
+| File | Size | Description |
+|---|---|---|
+| `model.safetensors` | 2.8 GB | Model weights (bfloat16) |
+| `config.json` | 3 KB | HuggingFace model configuration |
+| `tokenizer.json` | 27.9 MB | BPE tokenizer (o200k_base) |
+| `tokenizer_config.json` | 234 B | Tokenizer metadata |
+| `viterbi_calibration.json` | 372 B | Viterbi decoder operating points |
+## Model Details
+- **Architecture**: Bidirectional transformer encoder with Sparse MoE
+- **Parameters**: 1.5B total, ~50M active per token (top-4 of 128 experts)
+- **Hidden size**: 640, **Layers**: 8, **Heads**: 14 Q / 2 KV (GQA)
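+- **Context**: 128,000 tokens (YaRN RoPE; sliding window of 257 tokens = 128 on each side of the current token plus the token itself, i.e. `sliding_window: 128` in `config.json`)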
+- **Output**: 33 BIOES token classes over 8 privacy categories
+- **Dtype**: bfloat16 (converted to f32 at load time by the Rust runtime)
+## Privacy Categories
+1. `account_number`
+2. `private_address`
+3. `private_date`
+4. `private_email`
+5. `private_person`
+6. `private_phone`
+7. `private_url`
+8. `secret`
+## Usage with privacy-filter-rs
+```bash
+# Clone the Rust project
+git clone https://github.com/eugenehp/privacy-filter-rs
+cd privacy-filter-rs
+# Download weights into ./data (this repo; requires git-lfs, since model.safetensors and tokenizer.json are stored via LFS)
+git clone https://huggingface.co/eugenehp/privacy-filter-rs data
+# Run inference
+cargo run --release -- -m data "My name is Alice Smith"
+```
+```rust
+use privacy_filter_rs::{PrivacyFilterInference, backend::{B, Device}};
+use std::path::Path;
+let device = <Device as Default>::default();
+let engine = PrivacyFilterInference::<B>::load(Path::new("data"), device)?;
+let spans = engine.predict("My name is Alice Smith")?;
+for s in &spans {
+    println!("{}: {} (score: {:.4})", s.entity_group, s.word, s.score);
+}
+// private_person:  Alice Smith (score: 1.0000)
+```
+## License
+Apache 2.0 — same as the upstream [openai/privacy-filter](https://huggingface.co/openai/privacy-filter) model.

config.json ADDED Viewed

	@@ -0,0 +1,120 @@

+{
+  "architectures": [
+    "OpenAIPrivacyFilterForTokenClassification"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "classifier_dropout": 0.0,
+  "default_n_ctx": 128000,
+  "dtype": "bfloat16",
+  "eos_token_id": 199999,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 640,
+  "id2label": {
+    "0": "O",
+    "1": "B-account_number",
+    "2": "I-account_number",
+    "3": "E-account_number",
+    "4": "S-account_number",
+    "5": "B-private_address",
+    "6": "I-private_address",
+    "7": "E-private_address",
+    "8": "S-private_address",
+    "9": "B-private_date",
+    "10": "I-private_date",
+    "11": "E-private_date",
+    "12": "S-private_date",
+    "13": "B-private_email",
+    "14": "I-private_email",
+    "15": "E-private_email",
+    "16": "S-private_email",
+    "17": "B-private_person",
+    "18": "I-private_person",
+    "19": "E-private_person",
+    "20": "S-private_person",
+    "21": "B-private_phone",
+    "22": "I-private_phone",
+    "23": "E-private_phone",
+    "24": "S-private_phone",
+    "25": "B-private_url",
+    "26": "I-private_url",
+    "27": "E-private_url",
+    "28": "S-private_url",
+    "29": "B-secret",
+    "30": "I-secret",
+    "31": "E-secret",
+    "32": "S-secret"
+  },
+  "initial_context_length": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 640,
+  "label2id": {
+    "B-account_number": 1,
+    "B-private_address": 5,
+    "B-private_date": 9,
+    "B-private_email": 13,
+    "B-private_person": 17,
+    "B-private_phone": 21,
+    "B-private_url": 25,
+    "B-secret": 29,
+    "E-account_number": 3,
+    "E-private_address": 7,
+    "E-private_date": 11,
+    "E-private_email": 15,
+    "E-private_person": 19,
+    "E-private_phone": 23,
+    "E-private_url": 27,
+    "E-secret": 31,
+    "I-account_number": 2,
+    "I-private_address": 6,
+    "I-private_date": 10,
+    "I-private_email": 14,
+    "I-private_person": 18,
+    "I-private_phone": 22,
+    "I-private_url": 26,
+    "I-secret": 30,
+    "O": 0,
+    "S-account_number": 4,
+    "S-private_address": 8,
+    "S-private_date": 12,
+    "S-private_email": 16,
+    "S-private_person": 20,
+    "S-private_phone": 24,
+    "S-private_url": 28,
+    "S-secret": 32
+  },
+  "max_position_embeddings": 131072,
+  "model_type": "openai_privacy_filter",
+  "num_attention_heads": 14,
+  "num_experts_per_tok": 4,
+  "num_hidden_layers": 8,
+  "num_key_value_heads": 2,
+  "num_local_experts": 128,
+  "output_router_logits": false,
+  "pad_token_id": 199999,
+  "rms_norm_eps": 1e-05,
+  "rope_parameters": {
+    "beta_fast": 32.0,
+    "beta_slow": 1.0,
+    "factor": 32.0,
+    "original_max_position_embeddings": 4096,
+    "rope_theta": 150000.0,
+    "rope_type": "yarn",
+    "truncate": false
+  },
+  "router_aux_loss_coef": 0.001,
+  "sliding_window": 128,
+  "tie_word_embeddings": false,
+  "transformers_version": "5.6.0.dev0",
+  "use_cache": true,
+  "vocab_size": 200064,
+  "transformers.js_config": {
+    "use_external_data_format": {
+      "model.onnx": 3,
+      "model_fp16.onnx": 2,
+      "model": 1
+    }
+  }
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:06f66b87650b988b04e218285f9fe3df6a4943416b6ffa8171f07bc56cf12a9d
+size 2798989498

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3
+size 27868174

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "backend": "tokenizers",
+  "eos_token": "<|endoftext|>",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 128000,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "TokenizersBackend"
+}

viterbi_calibration.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "operating_points": {
+    "default": {
+      "biases": {
+        "transition_bias_background_stay": 0.0,
+        "transition_bias_background_to_start": 0.0,
+        "transition_bias_end_to_background": 0.0,
+        "transition_bias_end_to_start": 0.0,
+        "transition_bias_inside_to_continue": 0.0,
+        "transition_bias_inside_to_end": 0.0
+      }
+    }
+  }
+}