File size: 2,258 Bytes
d76d3b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
{
  "model_id": "mindbomber/aana",
  "architecture": "Alignment-Aware Neural Architecture",
  "system_model": "S = (f_theta, E_phi, R, Pi_psi, G)",
  "results": [
    {
      "benchmark": "PIIMB",
      "submission": "https://huggingface.co/datasets/piimb/pii-masking-benchmark-results/discussions/3",
      "model_card": "https://huggingface.co/mindbomber/aana-presidio-piimb-policy-v1",
      "dataset": "piimb/pii-masking-benchmark",
      "dataset_revision": "df8299e90ff053fa6fd1d3678f6693a454f4ecc0",
      "subset": "sentences",
      "base_detector": "microsoft/presidio-analyzer",
      "base_average_masking_f2": 0.4492985573,
      "aana_average_masking_f2": 0.5629171363,
      "delta_average_masking_f2": 0.113618579,
      "base_average_recall": 0.4008557794,
      "aana_average_recall": 0.5159532273,
      "delta_average_recall": 0.1150974479,
      "scope": "official PIIMB submission showing AANA verifier/correction gain over the same specialist detector"
    },
    {
      "benchmark": "PIIMB",
      "submission": "https://huggingface.co/datasets/piimb/pii-masking-benchmark-results/discussions/2",
      "model_card": "https://huggingface.co/mindbomber/aana-piimb-policy-baseline",
      "dataset": "piimb/pii-masking-benchmark",
      "dataset_revision": "df8299e90ff053fa6fd1d3678f6693a454f4ecc0",
      "subset": "sentences",
      "aana_average_masking_f2": 0.5195345497,
      "scope": "official PIIMB submission for a zero-parameter deterministic policy baseline"
    },
    {
      "benchmark": "TruthfulQA",
      "dataset": "truthfulqa/truthful_qa",
      "configuration": "multiple_choice",
      "split": "validation",
      "sample_size": 100,
      "base_generator": "openai/gpt-4o-mini",
      "mc1_accuracy": 0.85,
      "scope": "local AANA-gated run and public artifact publication, not an official leaderboard submission"
    }
  ],
  "claim_limits": [
    "AANA is a runtime architecture, not a standalone neural-weight checkpoint.",
    "Current public results do not claim state-of-the-art performance.",
    "Current public results do not guarantee hallucination removal, PII removal, or production safety.",
    "Production readiness requires external deployment evidence beyond local benchmark results."
  ]
}