apararti committed on
Commit
a1650a1
·
verified ·
1 Parent(s): 1cba10c

Upload Russian privacy-filter fine-tune

Browse files
Files changed (5) hide show
  1. README.md +97 -0
  2. USAGE.txt +7 -0
  3. config.json +74 -0
  4. finetune_summary.json +69 -0
  5. model.safetensors +3 -0
README.md ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: openai/privacy-filter
3
+ pipeline_tag: token-classification
4
+ language:
5
+ - ru
6
+ tags:
7
+ - privacy
8
+ - pii
9
+ - token-classification
10
+ - russian
11
+ - opf
12
+ model-index:
13
+ - name: privacy-filter-ru
14
+ results:
15
+ - task:
16
+ type: token-classification
17
+ name: Token Classification
18
+ dataset:
19
+ name: ru_realistic_eval_v1
20
+ type: local
21
+ metrics:
22
+ - type: f1
23
+ value: 0.9916
24
+ name: Raw span F1
25
+ - task:
26
+ type: token-classification
27
+ name: Token Classification
28
+ dataset:
29
+ name: ru_raw_hard_v3_eval
30
+ type: local
31
+ metrics:
32
+ - type: f1
33
+ value: 1.0
34
+ name: Raw span F1
35
+ ---
36
+
37
+ # privacy-filter-ru
38
+
39
+ Russian PII fine-tune of [`openai/privacy-filter`](https://huggingface.co/openai/privacy-filter).
40
+
41
+ This checkpoint is the raw-model production candidate from the local `raw_hardening_v3` run. It is intended to run without deterministic post-processing.
42
+
43
+ ## Labels
44
+
45
+ - `private_person`
46
+ - `private_phone`
47
+ - `private_email`
48
+ - `private_address`
49
+ - `private_date`
50
+ - `private_url`
51
+ - `account_number`
52
+ - `secret`
53
+
54
+ ## Training
55
+
56
+ - Base checkpoint: `checkpoints/production_candidate_ru_v2`
57
+ - Original base model: `openai/privacy-filter`
58
+ - Epochs: 1
59
+ - Learning rate: `1e-6`
60
+ - Batch size: 1
61
+ - Gradient accumulation steps: 16
62
+ - Serialization dtype: `bfloat16`
63
+ - Train examples: 17,000
64
+ - Validation examples: 2,000
65
+
66
+ The v3 training mix targeted raw-model behavior that previously depended on a deterministic runtime layer: phone/account/secret label separation and person/date boundary cleanup.
67
+
68
+ ## Raw Evaluation
69
+
70
+ No deterministic post-processing was used for these metrics.
71
+
72
+ | Eval | v2 raw span F1 | v3 raw span F1 | v2 mismatch rows | v3 mismatch rows |
73
+ | --- | ---: | ---: | ---: | ---: |
74
+ | synthetic test | 1.0000 | 1.0000 | 0 | 0 |
75
+ | ru_realistic_eval_v1 | 0.8787 | 0.9916 | 158 | 11 |
76
+ | ru_phone_account_confusion_v1 | 1.0000 | 1.0000 | 0 | 0 |
77
+ | ru_date_negative_v1 | 1.0000 | 1.0000 | 0 | 0 |
78
+ | ru_raw_hard_v3_eval | 0.8350 | 1.0000 | 297 | 0 |
79
+ | ru_person_hard_eval | 0.8074 | 0.8074 | 183 | 183 |
80
+ | alexen2 | 0.8644 | 0.8547 | 228 | 241 |
81
+ | Rubai heldout | 0.8054 | 0.8036 | 3,131 | 3,136 |
82
+
83
+ ## Usage
84
+
85
+ ```bash
86
+ opf --checkpoint apararti/privacy-filter-ru --device cuda "Мой номер 8 999 863 37 84, зовут Андрей Макаров."
87
+ ```
88
+
89
+ For a local checkout:
90
+
91
+ ```bash
92
+ opf --checkpoint ./privacy-filter-ru --device cuda "Перезвоните Наталье Никитиной на 8 903 914 81 88."
93
+ ```
94
+
95
+ ## Notes
96
+
97
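+ This is a fine-tuned checkpoint, not the original OpenAI model. It is optimized for Russian PII filtering and should be validated on domain-specific shadow traffic before production rollout. Note that, per the Raw Evaluation table, v3 does not improve on `ru_person_hard_eval` (0.8074 for both v2 and v3) and scores slightly lower than v2 on `alexen2` (0.8644 → 0.8547) and Rubai heldout (0.8054 → 0.8036).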
USAGE.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Fine-tuned checkpoint generated by `opf train`.
2
+
3
+ Run local inference:
4
+ opf --checkpoint ./privacy-filter-ru --device cuda "Alice was born on 1990-01-02."
5
+
6
+ Run eval:
7
+ opf eval /path/to/eval.jsonl --checkpoint ./privacy-filter-ru --device cuda
config.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bidirectional_context": true,
3
+ "bidirectional_left_context": 128,
4
+ "bidirectional_right_context": 128,
5
+ "category_version": "v2",
6
+ "default_n_ctx": 128000,
7
+ "encoding": "o200k_base",
8
+ "experts_per_token": 4,
9
+ "head_dim": 64,
10
+ "hidden_size": 640,
11
+ "inference_contract_version": 1,
12
+ "initial_context_length": 4096,
13
+ "intermediate_size": 640,
14
+ "max_position_embeddings": 131072,
15
+ "model_type": "privacy_filter",
16
+ "ner_class_names": [
17
+ "O",
18
+ "B-account_number",
19
+ "I-account_number",
20
+ "E-account_number",
21
+ "S-account_number",
22
+ "B-private_address",
23
+ "I-private_address",
24
+ "E-private_address",
25
+ "S-private_address",
26
+ "B-private_date",
27
+ "I-private_date",
28
+ "E-private_date",
29
+ "S-private_date",
30
+ "B-private_email",
31
+ "I-private_email",
32
+ "E-private_email",
33
+ "S-private_email",
34
+ "B-private_person",
35
+ "I-private_person",
36
+ "E-private_person",
37
+ "S-private_person",
38
+ "B-private_phone",
39
+ "I-private_phone",
40
+ "E-private_phone",
41
+ "S-private_phone",
42
+ "B-private_url",
43
+ "I-private_url",
44
+ "E-private_url",
45
+ "S-private_url",
46
+ "B-secret",
47
+ "I-secret",
48
+ "E-secret",
49
+ "S-secret"
50
+ ],
51
+ "num_attention_heads": 14,
52
+ "num_experts": 128,
53
+ "num_hidden_layers": 8,
54
+ "num_key_value_heads": 2,
55
+ "num_labels": 33,
56
+ "param_dtype": "bfloat16",
57
+ "rope_ntk_alpha": 1.0,
58
+ "rope_ntk_beta": 32.0,
59
+ "rope_scaling_factor": 32.0,
60
+ "rope_theta": 150000,
61
+ "sliding_window": 257,
62
+ "span_class_names": [
63
+ "O",
64
+ "account_number",
65
+ "private_address",
66
+ "private_date",
67
+ "private_email",
68
+ "private_person",
69
+ "private_phone",
70
+ "private_url",
71
+ "secret"
72
+ ],
73
+ "vocab_size": 200064
74
+ }
finetune_summary.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "artifacts": {
3
+ "config_json": "/home/apararti/projects/pets/openai-masks/privacy-filter-ru/checkpoints/raw_hardening_v3/config.json",
4
+ "model_safetensors": "/home/apararti/projects/pets/openai-masks/privacy-filter-ru/checkpoints/raw_hardening_v3/model.safetensors",
5
+ "summary_json": "/home/apararti/projects/pets/openai-masks/privacy-filter-ru/checkpoints/raw_hardening_v3/finetune_summary.json"
6
+ },
7
+ "base_checkpoint": "checkpoints/production_candidate_ru_v2",
8
+ "batch_size": 1,
9
+ "best_epoch": 1,
10
+ "best_metric": 0.01566733591659785,
11
+ "best_metric_name": "validation_loss",
12
+ "checkpoint_category_version": "v2",
13
+ "device": "cuda",
14
+ "elapsed_s": 438.167345628,
15
+ "encoding": "o200k_base",
16
+ "epoch_metrics": [
17
+ {
18
+ "elapsed_s": 436.7827820130001,
19
+ "epoch": 1,
20
+ "optimizer_steps": 1063,
21
+ "train_batches": 17000,
22
+ "train_loss": 0.019168339122784792,
23
+ "train_token_accuracy": 0.9947700011454941,
24
+ "train_tokens": 305545,
25
+ "validation_batches": 2000,
26
+ "validation_loss": 0.01566733591659785,
27
+ "validation_token_accuracy": 0.9952379626277534,
28
+ "validation_tokens": 35909
29
+ }
30
+ ],
31
+ "epochs": 1,
32
+ "generated_at_unix": 1777207898.9233346,
33
+ "grad_accum_steps": 16,
34
+ "label_space_json_path": null,
35
+ "label_space_source": "checkpoint",
36
+ "learning_rate": 1e-06,
37
+ "max_grad_norm": 1.0,
38
+ "num_output_labels": 33,
39
+ "num_train_examples": 17000,
40
+ "num_train_windows": 17000,
41
+ "num_validation_examples": 2000,
42
+ "num_validation_windows": 2000,
43
+ "output_checkpoint_dir": "/home/apararti/projects/pets/openai-masks/privacy-filter-ru/checkpoints/raw_hardening_v3",
44
+ "output_head_reinitialized": false,
45
+ "output_head_rows_copied": 0,
46
+ "output_head_rows_copied_exact": 0,
47
+ "output_head_rows_copied_fallback": 0,
48
+ "resolved_category_version": "v2",
49
+ "resolved_n_ctx": 512,
50
+ "schema_version": 1,
51
+ "serialized_param_dtype": "bfloat16",
52
+ "span_class_names": [
53
+ "O",
54
+ "account_number",
55
+ "private_address",
56
+ "private_date",
57
+ "private_email",
58
+ "private_person",
59
+ "private_phone",
60
+ "private_url",
61
+ "secret"
62
+ ],
63
+ "train_dataset": "data/processed/raw_hardening_v3_train.jsonl",
64
+ "train_dataset_variant": "full",
65
+ "validation_dataset": "data/processed/raw_hardening_v3_val.jsonl",
66
+ "validation_dataset_variant": "full",
67
+ "validation_split": null,
68
+ "weight_decay": 0.01
69
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dad75869b6130d1cefcaae0efc77eb1885524a7fc2aa47b5a96528d2c0a1904c
3
+ size 2798983976