eugenehp committed on
Commit
d2620e1
·
verified ·
1 Parent(s): 5ef3241

Add files using upload-large-folder tool

Browse files
.claude/settings.local.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(hf --help)",
5
+ "Bash(hf auth:*)",
6
+ "Bash(hf repos:*)",
7
+ "Bash(hf upload:*)",
8
+ "Bash(hf upload-large-folder:*)"
9
+ ]
10
+ }
11
+ }
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ pipeline_tag: token-classification
4
+ library_name: burn
5
+ tags:
6
+ - rust
7
+ - burn
8
+ - privacy
9
+ - PII
10
+ - NER
11
+ - token-classification
12
+ - openai
13
+ base_model: openai/privacy-filter
14
+ ---
15
+
16
+ # OpenAI Privacy Filter — Rust/Burn Weights
17
+
18
+ Safetensors weights for [openai/privacy-filter](https://huggingface.co/openai/privacy-filter), packaged for inference with [privacy-filter-rs](https://github.com/eugenehp/privacy-filter-rs) (pure-Rust, Burn ML framework).
19
+
20
+ ## Contents
21
+
22
+ | File | Size | Description |
23
+ |---|---|---|
24
+ | `model.safetensors` | 2.6 GB | Model weights (bfloat16) |
25
+ | `config.json` | 3 KB | HuggingFace model configuration |
26
+ | `tokenizer.json` | 27 MB | BPE tokenizer (o200k_base) |
27
+ | `tokenizer_config.json` | 234 B | Tokenizer metadata |
28
+ | `viterbi_calibration.json` | 372 B | Viterbi decoder operating points |
29
+
30
+ ## Model Details
31
+
32
+ - **Architecture**: Bidirectional transformer encoder with Sparse MoE
33
+ - **Parameters**: 1.5B total, ~50M active per token (top-4 of 128 experts)
34
+ - **Hidden size**: 640, **Layers**: 8, **Heads**: 14 Q / 2 KV (GQA)
35
+ - **Context**: 128,000 tokens (YaRN RoPE, sliding window 257)
36
+ - **Output**: 33 BIOES token classes over 8 privacy categories
37
+ - **Dtype**: bfloat16 (converted to f32 at load time by the Rust runtime)
38
+
39
+ ## Privacy Categories
40
+
41
+ 1. `account_number`
42
+ 2. `private_address`
43
+ 3. `private_date`
44
+ 4. `private_email`
45
+ 5. `private_person`
46
+ 6. `private_phone`
47
+ 7. `private_url`
48
+ 8. `secret`
49
+
50
+ ## Usage with privacy-filter-rs
51
+
52
+ ```bash
53
+ # Clone the Rust project
54
+ git clone https://github.com/eugenehp/privacy-filter-rs
55
+ cd privacy-filter-rs
56
+
57
+ # Download weights into ./data (this repo)
58
+ # git clone https://huggingface.co/eugenehp/privacy-filter-rs data
59
+
60
+ # Run inference
61
+ cargo run --release -- -m data "My name is Alice Smith"
62
+ ```
63
+
64
+ ```rust
65
+ use privacy_filter_rs::{PrivacyFilterInference, backend::{B, Device}};
66
+ use std::path::Path;
67
+
68
+ let device = <Device as Default>::default();
69
+ let engine = PrivacyFilterInference::<B>::load(Path::new("data"), device)?;
70
+
71
+ let spans = engine.predict("My name is Alice Smith")?;
72
+ for s in &spans {
73
+ println!("{}: {} (score: {:.4})", s.entity_group, s.word, s.score);
74
+ }
75
+ // private_person: Alice Smith (score: 1.0000)
76
+ ```
77
+
78
+ ## License
79
+
80
+ Apache 2.0 — same as the upstream [openai/privacy-filter](https://huggingface.co/openai/privacy-filter) model.
config.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "OpenAIPrivacyFilterForTokenClassification"
4
+ ],
5
+ "attention_bias": true,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": 0.0,
9
+ "default_n_ctx": 128000,
10
+ "dtype": "bfloat16",
11
+ "eos_token_id": 199999,
12
+ "head_dim": 64,
13
+ "hidden_act": "silu",
14
+ "hidden_size": 640,
15
+ "id2label": {
16
+ "0": "O",
17
+ "1": "B-account_number",
18
+ "2": "I-account_number",
19
+ "3": "E-account_number",
20
+ "4": "S-account_number",
21
+ "5": "B-private_address",
22
+ "6": "I-private_address",
23
+ "7": "E-private_address",
24
+ "8": "S-private_address",
25
+ "9": "B-private_date",
26
+ "10": "I-private_date",
27
+ "11": "E-private_date",
28
+ "12": "S-private_date",
29
+ "13": "B-private_email",
30
+ "14": "I-private_email",
31
+ "15": "E-private_email",
32
+ "16": "S-private_email",
33
+ "17": "B-private_person",
34
+ "18": "I-private_person",
35
+ "19": "E-private_person",
36
+ "20": "S-private_person",
37
+ "21": "B-private_phone",
38
+ "22": "I-private_phone",
39
+ "23": "E-private_phone",
40
+ "24": "S-private_phone",
41
+ "25": "B-private_url",
42
+ "26": "I-private_url",
43
+ "27": "E-private_url",
44
+ "28": "S-private_url",
45
+ "29": "B-secret",
46
+ "30": "I-secret",
47
+ "31": "E-secret",
48
+ "32": "S-secret"
49
+ },
50
+ "initial_context_length": 4096,
51
+ "initializer_range": 0.02,
52
+ "intermediate_size": 640,
53
+ "label2id": {
54
+ "B-account_number": 1,
55
+ "B-private_address": 5,
56
+ "B-private_date": 9,
57
+ "B-private_email": 13,
58
+ "B-private_person": 17,
59
+ "B-private_phone": 21,
60
+ "B-private_url": 25,
61
+ "B-secret": 29,
62
+ "E-account_number": 3,
63
+ "E-private_address": 7,
64
+ "E-private_date": 11,
65
+ "E-private_email": 15,
66
+ "E-private_person": 19,
67
+ "E-private_phone": 23,
68
+ "E-private_url": 27,
69
+ "E-secret": 31,
70
+ "I-account_number": 2,
71
+ "I-private_address": 6,
72
+ "I-private_date": 10,
73
+ "I-private_email": 14,
74
+ "I-private_person": 18,
75
+ "I-private_phone": 22,
76
+ "I-private_url": 26,
77
+ "I-secret": 30,
78
+ "O": 0,
79
+ "S-account_number": 4,
80
+ "S-private_address": 8,
81
+ "S-private_date": 12,
82
+ "S-private_email": 16,
83
+ "S-private_person": 20,
84
+ "S-private_phone": 24,
85
+ "S-private_url": 28,
86
+ "S-secret": 32
87
+ },
88
+ "max_position_embeddings": 131072,
89
+ "model_type": "openai_privacy_filter",
90
+ "num_attention_heads": 14,
91
+ "num_experts_per_tok": 4,
92
+ "num_hidden_layers": 8,
93
+ "num_key_value_heads": 2,
94
+ "num_local_experts": 128,
95
+ "output_router_logits": false,
96
+ "pad_token_id": 199999,
97
+ "rms_norm_eps": 1e-05,
98
+ "rope_parameters": {
99
+ "beta_fast": 32.0,
100
+ "beta_slow": 1.0,
101
+ "factor": 32.0,
102
+ "original_max_position_embeddings": 4096,
103
+ "rope_theta": 150000.0,
104
+ "rope_type": "yarn",
105
+ "truncate": false
106
+ },
107
+ "router_aux_loss_coef": 0.001,
108
+ "sliding_window": 128,
109
+ "tie_word_embeddings": false,
110
+ "transformers_version": "5.6.0.dev0",
111
+ "use_cache": true,
112
+ "vocab_size": 200064,
113
+ "transformers.js_config": {
114
+ "use_external_data_format": {
115
+ "model.onnx": 3,
116
+ "model_fp16.onnx": 2,
117
+ "model": 1
118
+ }
119
+ }
120
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06f66b87650b988b04e218285f9fe3df6a4943416b6ffa8171f07bc56cf12a9d
3
+ size 2798989498
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3
3
+ size 27868174
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "eos_token": "<|endoftext|>",
4
+ "model_input_names": [
5
+ "input_ids",
6
+ "attention_mask"
7
+ ],
8
+ "model_max_length": 128000,
9
+ "pad_token": "<|endoftext|>",
10
+ "tokenizer_class": "TokenizersBackend"
11
+ }
viterbi_calibration.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "operating_points": {
3
+ "default": {
4
+ "biases": {
5
+ "transition_bias_background_stay": 0.0,
6
+ "transition_bias_background_to_start": 0.0,
7
+ "transition_bias_end_to_background": 0.0,
8
+ "transition_bias_end_to_start": 0.0,
9
+ "transition_bias_inside_to_continue": 0.0,
10
+ "transition_bias_inside_to_end": 0.0
11
+ }
12
+ }
13
+ }
14
+ }