Upload Russian privacy-filter fine-tune
Browse files- README.md +97 -0
- USAGE.txt +7 -0
- config.json +74 -0
- finetune_summary.json +69 -0
- model.safetensors +3 -0
README.md
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: openai/privacy-filter
|
| 3 |
+
pipeline_tag: token-classification
|
| 4 |
+
language:
|
| 5 |
+
- ru
|
| 6 |
+
tags:
|
| 7 |
+
- privacy
|
| 8 |
+
- pii
|
| 9 |
+
- token-classification
|
| 10 |
+
- russian
|
| 11 |
+
- opf
|
| 12 |
+
model-index:
|
| 13 |
+
- name: privacy-filter-ru
|
| 14 |
+
results:
|
| 15 |
+
- task:
|
| 16 |
+
type: token-classification
|
| 17 |
+
name: Token Classification
|
| 18 |
+
dataset:
|
| 19 |
+
name: ru_realistic_eval_v1
|
| 20 |
+
type: local
|
| 21 |
+
metrics:
|
| 22 |
+
- type: f1
|
| 23 |
+
value: 0.9916
|
| 24 |
+
name: Raw span F1
|
| 25 |
+
- task:
|
| 26 |
+
type: token-classification
|
| 27 |
+
name: Token Classification
|
| 28 |
+
dataset:
|
| 29 |
+
name: ru_raw_hard_v3_eval
|
| 30 |
+
type: local
|
| 31 |
+
metrics:
|
| 32 |
+
- type: f1
|
| 33 |
+
value: 1.0
|
| 34 |
+
name: Raw span F1
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
# privacy-filter-ru
|
| 38 |
+
|
| 39 |
+
Russian PII fine-tune of [`openai/privacy-filter`](https://huggingface.co/openai/privacy-filter).
|
| 40 |
+
|
| 41 |
+
This checkpoint is the raw-model production candidate from the local `raw_hardening_v3` run. It is intended to run without deterministic post-processing.
|
| 42 |
+
|
| 43 |
+
## Labels
|
| 44 |
+
|
| 45 |
+
- `private_person`
|
| 46 |
+
- `private_phone`
|
| 47 |
+
- `private_email`
|
| 48 |
+
- `private_address`
|
| 49 |
+
- `private_date`
|
| 50 |
+
- `private_url`
|
| 51 |
+
- `account_number`
|
| 52 |
+
- `secret`
|
| 53 |
+
|
| 54 |
+
## Training
|
| 55 |
+
|
| 56 |
+
- Base checkpoint: `checkpoints/production_candidate_ru_v2` (a path local to the training environment; that checkpoint is not included in this repository)
|
| 57 |
+
- Original base model: `openai/privacy-filter`
|
| 58 |
+
- Epochs: 1
|
| 59 |
+
- Learning rate: `1e-6`
|
| 60 |
+
- Batch size: 1
|
| 61 |
+
- Gradient accumulation steps: 16
|
| 62 |
+
- Serialization dtype: `bfloat16`
|
| 63 |
+
- Train examples: 17,000
|
| 64 |
+
- Validation examples: 2,000
|
| 65 |
+
|
| 66 |
+
The v3 training mix targeted raw-model behavior that previously depended on a deterministic runtime layer: phone/account/secret label separation and person/date boundary cleanup.
|
| 67 |
+
|
| 68 |
+
## Raw Evaluation
|
| 69 |
+
|
| 70 |
+
No deterministic post-processing was used for these metrics.
|
| 71 |
+
|
| 72 |
+
| Eval | v2 raw span F1 | v3 raw span F1 | v2 mismatch rows | v3 mismatch rows |
|
| 73 |
+
| --- | ---: | ---: | ---: | ---: |
|
| 74 |
+
| synthetic test | 1.0000 | 1.0000 | 0 | 0 |
|
| 75 |
+
| ru_realistic_eval_v1 | 0.8787 | 0.9916 | 158 | 11 |
|
| 76 |
+
| ru_phone_account_confusion_v1 | 1.0000 | 1.0000 | 0 | 0 |
|
| 77 |
+
| ru_date_negative_v1 | 1.0000 | 1.0000 | 0 | 0 |
|
| 78 |
+
| ru_raw_hard_v3_eval | 0.8350 | 1.0000 | 297 | 0 |
|
| 79 |
+
| ru_person_hard_eval | 0.8074 | 0.8074 | 183 | 183 |
|
| 80 |
+
| alexen2 | 0.8644 | 0.8547 | 228 | 241 |
|
| 81 |
+
| Rubai heldout | 0.8054 | 0.8036 | 3,131 | 3,136 |
|
| 82 |
+
|
| 83 |
+
## Usage
|
| 84 |
+
|
| 85 |
+
```bash
|
| 86 |
+
opf --checkpoint apararti/privacy-filter-ru --device cuda "Мой номер 8 999 863 37 84, зовут Андрей Макаров."
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
For a local checkout:
|
| 90 |
+
|
| 91 |
+
```bash
|
| 92 |
+
opf --checkpoint ./privacy-filter-ru --device cuda "Перезвоните Наталье Никитиной на 8 903 914 81 88."
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
## Notes
|
| 96 |
+
|
| 97 |
+
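This is a fine-tuned checkpoint, not the original OpenAI model. It is optimized for Russian PII filtering. Note that, per the raw evaluation table above, v3 does not improve on `ru_person_hard_eval` and scores slightly below v2 on `alexen2` and `Rubai heldout`, so it should be validated on domain-specific shadow traffic before production rollout.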
|
USAGE.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Finetuned checkpoint generated by `opf train`.
|
| 2 |
+
|
| 3 |
+
Run local inference:
|
| 4 |
+
opf --checkpoint ./privacy-filter-ru --device cuda "Alice was born on 1990-01-02."
|
| 5 |
+
|
| 6 |
+
Run eval:
|
| 7 |
+
opf eval /path/to/eval.jsonl --checkpoint ./privacy-filter-ru --device cuda
|
config.json
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bidirectional_context": true,
|
| 3 |
+
"bidirectional_left_context": 128,
|
| 4 |
+
"bidirectional_right_context": 128,
|
| 5 |
+
"category_version": "v2",
|
| 6 |
+
"default_n_ctx": 128000,
|
| 7 |
+
"encoding": "o200k_base",
|
| 8 |
+
"experts_per_token": 4,
|
| 9 |
+
"head_dim": 64,
|
| 10 |
+
"hidden_size": 640,
|
| 11 |
+
"inference_contract_version": 1,
|
| 12 |
+
"initial_context_length": 4096,
|
| 13 |
+
"intermediate_size": 640,
|
| 14 |
+
"max_position_embeddings": 131072,
|
| 15 |
+
"model_type": "privacy_filter",
|
| 16 |
+
"ner_class_names": [
|
| 17 |
+
"O",
|
| 18 |
+
"B-account_number",
|
| 19 |
+
"I-account_number",
|
| 20 |
+
"E-account_number",
|
| 21 |
+
"S-account_number",
|
| 22 |
+
"B-private_address",
|
| 23 |
+
"I-private_address",
|
| 24 |
+
"E-private_address",
|
| 25 |
+
"S-private_address",
|
| 26 |
+
"B-private_date",
|
| 27 |
+
"I-private_date",
|
| 28 |
+
"E-private_date",
|
| 29 |
+
"S-private_date",
|
| 30 |
+
"B-private_email",
|
| 31 |
+
"I-private_email",
|
| 32 |
+
"E-private_email",
|
| 33 |
+
"S-private_email",
|
| 34 |
+
"B-private_person",
|
| 35 |
+
"I-private_person",
|
| 36 |
+
"E-private_person",
|
| 37 |
+
"S-private_person",
|
| 38 |
+
"B-private_phone",
|
| 39 |
+
"I-private_phone",
|
| 40 |
+
"E-private_phone",
|
| 41 |
+
"S-private_phone",
|
| 42 |
+
"B-private_url",
|
| 43 |
+
"I-private_url",
|
| 44 |
+
"E-private_url",
|
| 45 |
+
"S-private_url",
|
| 46 |
+
"B-secret",
|
| 47 |
+
"I-secret",
|
| 48 |
+
"E-secret",
|
| 49 |
+
"S-secret"
|
| 50 |
+
],
|
| 51 |
+
"num_attention_heads": 14,
|
| 52 |
+
"num_experts": 128,
|
| 53 |
+
"num_hidden_layers": 8,
|
| 54 |
+
"num_key_value_heads": 2,
|
| 55 |
+
"num_labels": 33,
|
| 56 |
+
"param_dtype": "bfloat16",
|
| 57 |
+
"rope_ntk_alpha": 1.0,
|
| 58 |
+
"rope_ntk_beta": 32.0,
|
| 59 |
+
"rope_scaling_factor": 32.0,
|
| 60 |
+
"rope_theta": 150000,
|
| 61 |
+
"sliding_window": 257,
|
| 62 |
+
"span_class_names": [
|
| 63 |
+
"O",
|
| 64 |
+
"account_number",
|
| 65 |
+
"private_address",
|
| 66 |
+
"private_date",
|
| 67 |
+
"private_email",
|
| 68 |
+
"private_person",
|
| 69 |
+
"private_phone",
|
| 70 |
+
"private_url",
|
| 71 |
+
"secret"
|
| 72 |
+
],
|
| 73 |
+
"vocab_size": 200064
|
| 74 |
+
}
|
finetune_summary.json
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"artifacts": {
|
| 3 |
+
"config_json": "/home/apararti/projects/pets/openai-masks/privacy-filter-ru/checkpoints/raw_hardening_v3/config.json",
|
| 4 |
+
"model_safetensors": "/home/apararti/projects/pets/openai-masks/privacy-filter-ru/checkpoints/raw_hardening_v3/model.safetensors",
|
| 5 |
+
"summary_json": "/home/apararti/projects/pets/openai-masks/privacy-filter-ru/checkpoints/raw_hardening_v3/finetune_summary.json"
|
| 6 |
+
},
|
| 7 |
+
"base_checkpoint": "checkpoints/production_candidate_ru_v2",
|
| 8 |
+
"batch_size": 1,
|
| 9 |
+
"best_epoch": 1,
|
| 10 |
+
"best_metric": 0.01566733591659785,
|
| 11 |
+
"best_metric_name": "validation_loss",
|
| 12 |
+
"checkpoint_category_version": "v2",
|
| 13 |
+
"device": "cuda",
|
| 14 |
+
"elapsed_s": 438.167345628,
|
| 15 |
+
"encoding": "o200k_base",
|
| 16 |
+
"epoch_metrics": [
|
| 17 |
+
{
|
| 18 |
+
"elapsed_s": 436.7827820130001,
|
| 19 |
+
"epoch": 1,
|
| 20 |
+
"optimizer_steps": 1063,
|
| 21 |
+
"train_batches": 17000,
|
| 22 |
+
"train_loss": 0.019168339122784792,
|
| 23 |
+
"train_token_accuracy": 0.9947700011454941,
|
| 24 |
+
"train_tokens": 305545,
|
| 25 |
+
"validation_batches": 2000,
|
| 26 |
+
"validation_loss": 0.01566733591659785,
|
| 27 |
+
"validation_token_accuracy": 0.9952379626277534,
|
| 28 |
+
"validation_tokens": 35909
|
| 29 |
+
}
|
| 30 |
+
],
|
| 31 |
+
"epochs": 1,
|
| 32 |
+
"generated_at_unix": 1777207898.9233346,
|
| 33 |
+
"grad_accum_steps": 16,
|
| 34 |
+
"label_space_json_path": null,
|
| 35 |
+
"label_space_source": "checkpoint",
|
| 36 |
+
"learning_rate": 1e-06,
|
| 37 |
+
"max_grad_norm": 1.0,
|
| 38 |
+
"num_output_labels": 33,
|
| 39 |
+
"num_train_examples": 17000,
|
| 40 |
+
"num_train_windows": 17000,
|
| 41 |
+
"num_validation_examples": 2000,
|
| 42 |
+
"num_validation_windows": 2000,
|
| 43 |
+
"output_checkpoint_dir": "/home/apararti/projects/pets/openai-masks/privacy-filter-ru/checkpoints/raw_hardening_v3",
|
| 44 |
+
"output_head_reinitialized": false,
|
| 45 |
+
"output_head_rows_copied": 0,
|
| 46 |
+
"output_head_rows_copied_exact": 0,
|
| 47 |
+
"output_head_rows_copied_fallback": 0,
|
| 48 |
+
"resolved_category_version": "v2",
|
| 49 |
+
"resolved_n_ctx": 512,
|
| 50 |
+
"schema_version": 1,
|
| 51 |
+
"serialized_param_dtype": "bfloat16",
|
| 52 |
+
"span_class_names": [
|
| 53 |
+
"O",
|
| 54 |
+
"account_number",
|
| 55 |
+
"private_address",
|
| 56 |
+
"private_date",
|
| 57 |
+
"private_email",
|
| 58 |
+
"private_person",
|
| 59 |
+
"private_phone",
|
| 60 |
+
"private_url",
|
| 61 |
+
"secret"
|
| 62 |
+
],
|
| 63 |
+
"train_dataset": "data/processed/raw_hardening_v3_train.jsonl",
|
| 64 |
+
"train_dataset_variant": "full",
|
| 65 |
+
"validation_dataset": "data/processed/raw_hardening_v3_val.jsonl",
|
| 66 |
+
"validation_dataset_variant": "full",
|
| 67 |
+
"validation_split": null,
|
| 68 |
+
"weight_decay": 0.01
|
| 69 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dad75869b6130d1cefcaae0efc77eb1885524a7fc2aa47b5a96528d2c0a1904c
|
| 3 |
+
size 2798983976
|