ferxalb committed on
Commit
bed1aab
·
verified ·
1 Parent(s): 16ecc29

Upload matex-privacy-sentinel-v0.1 trained on Modal A10

Browse files
Files changed (6) hide show
  1. README.md +32 -0
  2. USAGE.txt +7 -0
  3. config.json +154 -0
  4. custom_label_space.json +30 -0
  5. finetune_summary.json +85 -0
  6. model.safetensors +3 -0
README.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ base_model: openai/privacy-filter
4
+ license: apache-2.0
5
+ tags:
6
+ - token-classification
7
+ - privacy
8
+ - pii
9
+ - secrets
10
+ - code-security
11
+ - matex
12
+ datasets:
13
+ - enosislabs/matex-privacy-sentinel-dataset
14
+ ---
15
+
16
+ # MaTE X Privacy Sentinel v0.1
17
+
18
+ A fine-tuned checkpoint of OpenAI Privacy Filter (`openai/privacy-filter`) for local redaction of private data and secrets in MaTE X.
19
+
20
+ ## Dataset
21
+
22
+ `enosislabs/matex-privacy-sentinel-dataset`
23
+
24
+ ## Usage
25
+
26
+ ```bash
27
+ opf --checkpoint . "DATABASE_URL=postgres://demo_user:demo_pass@db.local/matex"
28
+ ```
29
+
30
+ ## Limitation
31
+
32
+ This model is a privacy/security aid, not a compliance guarantee. Run your own canary evaluation before using it in production.
USAGE.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Fine-tuned checkpoint generated by `opf train`.
2
+
3
+ Run local inference:
4
+ opf --checkpoint /path/to/matex-privacy-sentinel-v0.1 --device cuda "Alice was born on 1990-01-02."
5
+
6
+ Run eval:
7
+ opf eval /path/to/eval.jsonl --checkpoint /path/to/matex-privacy-sentinel-v0.1 --device cuda
config.json ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bidirectional_context": true,
3
+ "bidirectional_left_context": 128,
4
+ "bidirectional_right_context": 128,
5
+ "category_version": "matex_privacy_sentinel_v0_1",
6
+ "default_n_ctx": 128000,
7
+ "encoding": "o200k_base",
8
+ "experts_per_token": 4,
9
+ "head_dim": 64,
10
+ "hidden_size": 640,
11
+ "inference_contract_version": 1,
12
+ "initial_context_length": 4096,
13
+ "intermediate_size": 640,
14
+ "max_position_embeddings": 131072,
15
+ "model_type": "privacy_filter",
16
+ "ner_class_names": [
17
+ "O",
18
+ "B-account_number",
19
+ "I-account_number",
20
+ "E-account_number",
21
+ "S-account_number",
22
+ "B-private_address",
23
+ "I-private_address",
24
+ "E-private_address",
25
+ "S-private_address",
26
+ "B-private_email",
27
+ "I-private_email",
28
+ "E-private_email",
29
+ "S-private_email",
30
+ "B-private_person",
31
+ "I-private_person",
32
+ "E-private_person",
33
+ "S-private_person",
34
+ "B-private_phone",
35
+ "I-private_phone",
36
+ "E-private_phone",
37
+ "S-private_phone",
38
+ "B-private_url",
39
+ "I-private_url",
40
+ "E-private_url",
41
+ "S-private_url",
42
+ "B-private_date",
43
+ "I-private_date",
44
+ "E-private_date",
45
+ "S-private_date",
46
+ "B-secret",
47
+ "I-secret",
48
+ "E-secret",
49
+ "S-secret",
50
+ "B-api_key",
51
+ "I-api_key",
52
+ "E-api_key",
53
+ "S-api_key",
54
+ "B-auth_token",
55
+ "I-auth_token",
56
+ "E-auth_token",
57
+ "S-auth_token",
58
+ "B-session_cookie",
59
+ "I-session_cookie",
60
+ "E-session_cookie",
61
+ "S-session_cookie",
62
+ "B-database_uri",
63
+ "I-database_uri",
64
+ "E-database_uri",
65
+ "S-database_uri",
66
+ "B-cloud_credential",
67
+ "I-cloud_credential",
68
+ "E-cloud_credential",
69
+ "S-cloud_credential",
70
+ "B-env_value",
71
+ "I-env_value",
72
+ "E-env_value",
73
+ "S-env_value",
74
+ "B-private_file_path",
75
+ "I-private_file_path",
76
+ "E-private_file_path",
77
+ "S-private_file_path",
78
+ "B-internal_url",
79
+ "I-internal_url",
80
+ "E-internal_url",
81
+ "S-internal_url",
82
+ "B-workspace_identity",
83
+ "I-workspace_identity",
84
+ "E-workspace_identity",
85
+ "S-workspace_identity",
86
+ "B-customer_data",
87
+ "I-customer_data",
88
+ "E-customer_data",
89
+ "S-customer_data",
90
+ "B-prompt_sensitive",
91
+ "I-prompt_sensitive",
92
+ "E-prompt_sensitive",
93
+ "S-prompt_sensitive",
94
+ "B-agent_memory_sensitive",
95
+ "I-agent_memory_sensitive",
96
+ "E-agent_memory_sensitive",
97
+ "S-agent_memory_sensitive",
98
+ "B-stacktrace_sensitive",
99
+ "I-stacktrace_sensitive",
100
+ "E-stacktrace_sensitive",
101
+ "S-stacktrace_sensitive",
102
+ "B-repo_secret",
103
+ "I-repo_secret",
104
+ "E-repo_secret",
105
+ "S-repo_secret",
106
+ "B-payment_token",
107
+ "I-payment_token",
108
+ "E-payment_token",
109
+ "S-payment_token",
110
+ "B-personal_document_id",
111
+ "I-personal_document_id",
112
+ "E-personal_document_id",
113
+ "S-personal_document_id"
114
+ ],
115
+ "num_attention_heads": 14,
116
+ "num_experts": 128,
117
+ "num_hidden_layers": 8,
118
+ "num_key_value_heads": 2,
119
+ "num_labels": 97,
120
+ "param_dtype": "bfloat16",
121
+ "rope_ntk_alpha": 1.0,
122
+ "rope_ntk_beta": 32.0,
123
+ "rope_scaling_factor": 32.0,
124
+ "rope_theta": 150000,
125
+ "sliding_window": 257,
126
+ "span_class_names": [
127
+ "O",
128
+ "account_number",
129
+ "private_address",
130
+ "private_email",
131
+ "private_person",
132
+ "private_phone",
133
+ "private_url",
134
+ "private_date",
135
+ "secret",
136
+ "api_key",
137
+ "auth_token",
138
+ "session_cookie",
139
+ "database_uri",
140
+ "cloud_credential",
141
+ "env_value",
142
+ "private_file_path",
143
+ "internal_url",
144
+ "workspace_identity",
145
+ "customer_data",
146
+ "prompt_sensitive",
147
+ "agent_memory_sensitive",
148
+ "stacktrace_sensitive",
149
+ "repo_secret",
150
+ "payment_token",
151
+ "personal_document_id"
152
+ ],
153
+ "vocab_size": 200064
154
+ }
custom_label_space.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "category_version": "matex_privacy_sentinel_v0_1",
3
+ "span_class_names": [
4
+ "O",
5
+ "account_number",
6
+ "private_address",
7
+ "private_email",
8
+ "private_person",
9
+ "private_phone",
10
+ "private_url",
11
+ "private_date",
12
+ "secret",
13
+ "api_key",
14
+ "auth_token",
15
+ "session_cookie",
16
+ "database_uri",
17
+ "cloud_credential",
18
+ "env_value",
19
+ "private_file_path",
20
+ "internal_url",
21
+ "workspace_identity",
22
+ "customer_data",
23
+ "prompt_sensitive",
24
+ "agent_memory_sensitive",
25
+ "stacktrace_sensitive",
26
+ "repo_secret",
27
+ "payment_token",
28
+ "personal_document_id"
29
+ ]
30
+ }
finetune_summary.json ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "artifacts": {
3
+ "config_json": "/__modal/volumes/vo-XIgHS4JYPE8ddRlAcjQTwm/matex-privacy-sentinel-v0.1/config.json",
4
+ "model_safetensors": "/__modal/volumes/vo-XIgHS4JYPE8ddRlAcjQTwm/matex-privacy-sentinel-v0.1/model.safetensors",
5
+ "summary_json": "/__modal/volumes/vo-XIgHS4JYPE8ddRlAcjQTwm/matex-privacy-sentinel-v0.1/finetune_summary.json"
6
+ },
7
+ "base_checkpoint": "/root/.opf/privacy_filter",
8
+ "batch_size": 4,
9
+ "best_epoch": 1,
10
+ "best_metric": 7.897648645950944e-05,
11
+ "best_metric_name": "validation_loss",
12
+ "checkpoint_category_version": "v2",
13
+ "device": "cuda",
14
+ "elapsed_s": 223.644200789,
15
+ "encoding": "o200k_base",
16
+ "epoch_metrics": [
17
+ {
18
+ "elapsed_s": 213.92298557200002,
19
+ "epoch": 1,
20
+ "optimizer_steps": 1688,
21
+ "train_batches": 1688,
22
+ "train_loss": 0.07091577861013018,
23
+ "train_token_accuracy": 0.9923756289082941,
24
+ "train_tokens": 684122,
25
+ "validation_batches": 94,
26
+ "validation_loss": 7.897648645950944e-05,
27
+ "validation_token_accuracy": 1.0,
28
+ "validation_tokens": 36432
29
+ }
30
+ ],
31
+ "epochs": 1,
32
+ "generated_at_unix": 1777695794.6015186,
33
+ "grad_accum_steps": 1,
34
+ "label_space_json_path": "/__modal/volumes/vo-1GyZvIAPD7wnYD3s0C2QXS/dataset/configs/custom_label_space.json",
35
+ "label_space_source": "label-space-json",
36
+ "learning_rate": 1e-05,
37
+ "max_grad_norm": 1.0,
38
+ "num_output_labels": 97,
39
+ "num_train_examples": 6750,
40
+ "num_train_windows": 6750,
41
+ "num_validation_examples": 375,
42
+ "num_validation_windows": 375,
43
+ "output_checkpoint_dir": "/__modal/volumes/vo-XIgHS4JYPE8ddRlAcjQTwm/matex-privacy-sentinel-v0.1",
44
+ "output_head_reinitialized": true,
45
+ "output_head_rows_copied": 97,
46
+ "output_head_rows_copied_exact": 33,
47
+ "output_head_rows_copied_fallback": 64,
48
+ "resolved_category_version": "matex_privacy_sentinel_v0_1",
49
+ "resolved_n_ctx": 128000,
50
+ "schema_version": 1,
51
+ "serialized_param_dtype": "bfloat16",
52
+ "span_class_names": [
53
+ "O",
54
+ "account_number",
55
+ "private_address",
56
+ "private_email",
57
+ "private_person",
58
+ "private_phone",
59
+ "private_url",
60
+ "private_date",
61
+ "secret",
62
+ "api_key",
63
+ "auth_token",
64
+ "session_cookie",
65
+ "database_uri",
66
+ "cloud_credential",
67
+ "env_value",
68
+ "private_file_path",
69
+ "internal_url",
70
+ "workspace_identity",
71
+ "customer_data",
72
+ "prompt_sensitive",
73
+ "agent_memory_sensitive",
74
+ "stacktrace_sensitive",
75
+ "repo_secret",
76
+ "payment_token",
77
+ "personal_document_id"
78
+ ],
79
+ "train_dataset": "/data/dataset/compiled/train.jsonl",
80
+ "train_dataset_variant": "full",
81
+ "validation_dataset": "/data/dataset/compiled/validation.jsonl",
82
+ "validation_dataset_variant": "full",
83
+ "validation_split": null,
84
+ "weight_decay": 0.01
85
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcde9d8b17eeabef5f101d86ec33421e3add1d18089fe7a6b686606af6ed2b2c
3
+ size 2799065896