Upload Russian privacy-filter fine-tune

Browse files

Files changed (5) hide show

README.md +97 -0
USAGE.txt +7 -0
config.json +74 -0
finetune_summary.json +69 -0
model.safetensors +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,97 @@

+---
+base_model: openai/privacy-filter
+pipeline_tag: token-classification
+language:
+- ru
+tags:
+- privacy
+- pii
+- token-classification
+- russian
+- opf
+model-index:
+- name: privacy-filter-ru
+  results:
+  - task:
+      type: token-classification
+      name: Token Classification
+    dataset:
+      name: ru_realistic_eval_v1
+      type: local
+    metrics:
+    - type: f1
+      value: 0.9916
+      name: Raw span F1
+  - task:
+      type: token-classification
+      name: Token Classification
+    dataset:
+      name: ru_raw_hard_v3_eval
+      type: local
+    metrics:
+    - type: f1
+      value: 1.0
+      name: Raw span F1
+---
+# privacy-filter-ru
+Russian PII fine-tune of [`openai/privacy-filter`](https://huggingface.co/openai/privacy-filter).
+This checkpoint is the raw-model production candidate from the local `raw_hardening_v3` run. It is intended to run without deterministic post-processing.
+## Labels
+- `private_person`
+- `private_phone`
+- `private_email`
+- `private_address`
+- `private_date`
+- `private_url`
+- `account_number`
+- `secret`
+## Training
+- Base checkpoint: `checkpoints/production_candidate_ru_v2`
+- Original base model: `openai/privacy-filter`
+- Epochs: 1
+- Learning rate: `1e-6`
+- Batch size: 1
+- Gradient accumulation steps: 16
+- Serialization dtype: `bfloat16`
+- Train examples: 17,000
+- Validation examples: 2,000
+The v3 training mix targeted raw-model behavior that previously depended on a deterministic runtime layer: phone/account/secret label separation and person/date boundary cleanup.
+## Raw Evaluation
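+No deterministic post-processing was used for these metrics. Relative to v2, v3 improves `ru_realistic_eval_v1` and `ru_raw_hard_v3_eval`, is unchanged on `ru_person_hard_eval`, and is slightly lower on `alexen2` and `Rubai heldout`.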
+| Eval | v2 raw span F1 | v3 raw span F1 | v2 mismatch rows | v3 mismatch rows |
+| --- | ---: | ---: | ---: | ---: |
+| synthetic test | 1.0000 | 1.0000 | 0 | 0 |
+| ru_realistic_eval_v1 | 0.8787 | 0.9916 | 158 | 11 |
+| ru_phone_account_confusion_v1 | 1.0000 | 1.0000 | 0 | 0 |
+| ru_date_negative_v1 | 1.0000 | 1.0000 | 0 | 0 |
+| ru_raw_hard_v3_eval | 0.8350 | 1.0000 | 297 | 0 |
+| ru_person_hard_eval | 0.8074 | 0.8074 | 183 | 183 |
+| alexen2 | 0.8644 | 0.8547 | 228 | 241 |
+| Rubai heldout | 0.8054 | 0.8036 | 3,131 | 3,136 |
+## Usage
+```bash
+opf --checkpoint apararti/privacy-filter-ru --device cuda "Мой номер 8 999 863 37 84, зовут Андрей Макаров."
+```
+For a local checkout:
+```bash
+opf --checkpoint ./privacy-filter-ru --device cuda "Перезвоните Наталье Никитиной на 8 903 914 81 88."
+```
+## Notes
+This is a fine-tuned checkpoint, not the original OpenAI model. It is optimized for Russian PII filtering and should be validated on domain-specific shadow traffic before production rollout.

USAGE.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+Fine-tuned checkpoint generated by `opf train`.
+Run local inference (replace `/path/to/privacy-filter-ru` with the directory containing this checkpoint):
+  opf --checkpoint /path/to/privacy-filter-ru --device cuda "Alice was born on 1990-01-02."
+Run eval:
+  opf eval /path/to/eval.jsonl --checkpoint /path/to/privacy-filter-ru --device cuda

config.json ADDED Viewed

	@@ -0,0 +1,74 @@

+{
+  "bidirectional_context": true,
+  "bidirectional_left_context": 128,
+  "bidirectional_right_context": 128,
+  "category_version": "v2",
+  "default_n_ctx": 128000,
+  "encoding": "o200k_base",
+  "experts_per_token": 4,
+  "head_dim": 64,
+  "hidden_size": 640,
+  "inference_contract_version": 1,
+  "initial_context_length": 4096,
+  "intermediate_size": 640,
+  "max_position_embeddings": 131072,
+  "model_type": "privacy_filter",
+  "ner_class_names": [
+    "O",
+    "B-account_number",
+    "I-account_number",
+    "E-account_number",
+    "S-account_number",
+    "B-private_address",
+    "I-private_address",
+    "E-private_address",
+    "S-private_address",
+    "B-private_date",
+    "I-private_date",
+    "E-private_date",
+    "S-private_date",
+    "B-private_email",
+    "I-private_email",
+    "E-private_email",
+    "S-private_email",
+    "B-private_person",
+    "I-private_person",
+    "E-private_person",
+    "S-private_person",
+    "B-private_phone",
+    "I-private_phone",
+    "E-private_phone",
+    "S-private_phone",
+    "B-private_url",
+    "I-private_url",
+    "E-private_url",
+    "S-private_url",
+    "B-secret",
+    "I-secret",
+    "E-secret",
+    "S-secret"
+  ],
+  "num_attention_heads": 14,
+  "num_experts": 128,
+  "num_hidden_layers": 8,
+  "num_key_value_heads": 2,
+  "num_labels": 33,
+  "param_dtype": "bfloat16",
+  "rope_ntk_alpha": 1.0,
+  "rope_ntk_beta": 32.0,
+  "rope_scaling_factor": 32.0,
+  "rope_theta": 150000,
+  "sliding_window": 257,
+  "span_class_names": [
+    "O",
+    "account_number",
+    "private_address",
+    "private_date",
+    "private_email",
+    "private_person",
+    "private_phone",
+    "private_url",
+    "secret"
+  ],
+  "vocab_size": 200064
+}

finetune_summary.json ADDED Viewed

	@@ -0,0 +1,69 @@

+{
+  "artifacts": {
+    "config_json": "/home/apararti/projects/pets/openai-masks/privacy-filter-ru/checkpoints/raw_hardening_v3/config.json",
+    "model_safetensors": "/home/apararti/projects/pets/openai-masks/privacy-filter-ru/checkpoints/raw_hardening_v3/model.safetensors",
+    "summary_json": "/home/apararti/projects/pets/openai-masks/privacy-filter-ru/checkpoints/raw_hardening_v3/finetune_summary.json"
+  },
+  "base_checkpoint": "checkpoints/production_candidate_ru_v2",
+  "batch_size": 1,
+  "best_epoch": 1,
+  "best_metric": 0.01566733591659785,
+  "best_metric_name": "validation_loss",
+  "checkpoint_category_version": "v2",
+  "device": "cuda",
+  "elapsed_s": 438.167345628,
+  "encoding": "o200k_base",
+  "epoch_metrics": [
+    {
+      "elapsed_s": 436.7827820130001,
+      "epoch": 1,
+      "optimizer_steps": 1063,
+      "train_batches": 17000,
+      "train_loss": 0.019168339122784792,
+      "train_token_accuracy": 0.9947700011454941,
+      "train_tokens": 305545,
+      "validation_batches": 2000,
+      "validation_loss": 0.01566733591659785,
+      "validation_token_accuracy": 0.9952379626277534,
+      "validation_tokens": 35909
+    }
+  ],
+  "epochs": 1,
+  "generated_at_unix": 1777207898.9233346,
+  "grad_accum_steps": 16,
+  "label_space_json_path": null,
+  "label_space_source": "checkpoint",
+  "learning_rate": 1e-06,
+  "max_grad_norm": 1.0,
+  "num_output_labels": 33,
+  "num_train_examples": 17000,
+  "num_train_windows": 17000,
+  "num_validation_examples": 2000,
+  "num_validation_windows": 2000,
+  "output_checkpoint_dir": "/home/apararti/projects/pets/openai-masks/privacy-filter-ru/checkpoints/raw_hardening_v3",
+  "output_head_reinitialized": false,
+  "output_head_rows_copied": 0,
+  "output_head_rows_copied_exact": 0,
+  "output_head_rows_copied_fallback": 0,
+  "resolved_category_version": "v2",
+  "resolved_n_ctx": 512,
+  "schema_version": 1,
+  "serialized_param_dtype": "bfloat16",
+  "span_class_names": [
+    "O",
+    "account_number",
+    "private_address",
+    "private_date",
+    "private_email",
+    "private_person",
+    "private_phone",
+    "private_url",
+    "secret"
+  ],
+  "train_dataset": "data/processed/raw_hardening_v3_train.jsonl",
+  "train_dataset_variant": "full",
+  "validation_dataset": "data/processed/raw_hardening_v3_val.jsonl",
+  "validation_dataset_variant": "full",
+  "validation_split": null,
+  "weight_decay": 0.01
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dad75869b6130d1cefcaae0efc77eb1885524a7fc2aa47b5a96528d2c0a1904c
+size 2798983976