leakage_diagnostic.json · xpertsystems/cyb009-baseline-classifier at main

File size: 10,885 Bytes

e520bf1

{
  "purpose": "CYB009 sample has the most pervasive structural leakage of any SKU in the XpertSystems catalog. Eight oracle paths were discovered, and six of the README's headline targets are unlearnable on the sample after honest leak removal. The primary baseline that ships with this repo (vulnerability_class 8-class) is the only README-suggested target that learns honestly - and it is the WEAKEST baseline in the catalog by design (acc 0.24 vs majority 0.18). The headline finding for CYB009 is this diagnostic, not the classifier.",
  "primary_target": "vulnerability_class (8-class)",
  "split": "StratifiedShuffleSplit, 70/15/15 nested",
  "oracle_paths_documented": {
    "P1_cvss_temporal_ratio": {
      "target": "exploit_maturity_final",
      "leak_column": "cvss_temporal_score_final",
      "mechanism": "CVSS v3.1 computes Temporal Score from Base Score using an Exploit Code Maturity multiplier (0.91 unproven, 0.94 PoC, 0.97 functional, 1.00 high/weaponised). The cvss_temporal/cvss_base ratio in the sample clusters per maturity tier at roughly 0.88 times these multipliers (observed medians 0.80, 0.83, 0.85, 0.88), with only the upper tail of each tier reaching into the next, making it a near-deterministic oracle for the target.",
      "observed_ratios_by_tier": {
        "functional": {
          "min": 0.8516,
          "median": 0.8537,
          "max": 0.8843,
          "std": 0.0113
        },
        "proof_of_concept": {
          "min": 0.8255,
          "median": 0.8274,
          "max": 0.8567,
          "std": 0.0114
        },
        "unproven": {
          "min": 0.7991,
          "median": 0.801,
          "max": 0.8302,
          "std": 0.011
        },
        "weaponised": {
          "min": 0.878,
          "median": 0.88,
          "max": 0.9116,
          "std": 0.0115
        }
      },
      "impact": "With cvss_temporal_score_final included, XGBoost achieves test accuracy 0.74 (mF1 0.72, AUC 0.91). With it excluded, accuracy collapses to 0.31 (mF1 0.31, AUC 0.57) - below the majority baseline of 0.36. The target is structurally unlearnable on the sample after honest leak removal."
    },
    "P2_time_to_exploit_sentinel": {
      "target": "exploitation_occurred_flag (and zero_day_flag)",
      "leak_column": "time_to_exploit_days",
      "mechanism": "Sentinel-coded post-hoc field: -1 when no exploitation occurred; non-negative (0-95 days) when exploitation occurred. Perfect oracle.",
      "evidence": {
        "time_to_exploit_minus1_AND_flag_0": 2435,
        "time_to_exploit_positive_AND_flag_1": 197,
        "time_to_exploit_positive_AND_flag_0": 0
      },
      "impact": "Perfect oracle for exploitation_occurred_flag and zero_day_flag."
    },
    "P3_time_to_remediate_sentinel": {
      "target": "remediation_success_flag, sla_compliance_flag",
      "leak_column": "time_to_remediate_days",
      "mechanism": "Sentinel-coded post-hoc field: 120 (the timeline horizon) when not remediated; lower values (3-113) when remediated. Perfect oracle.",
      "evidence": {
        "remediation_flag_0_time_mean": 120.0,
        "remediation_flag_0_time_min": 120,
        "remediation_flag_1_time_mean": 41.77892756349953,
        "remediation_flag_1_time_max": 113
      },
      "impact": "Perfect oracle for remediation_success_flag and near-perfect for sla_compliance_flag."
    },
    "P4_severity_class_cvss_boundaries": {
      "target": "severity_class",
      "leak_column": "cvss_base_score",
      "mechanism": "severity_class is computed as a CVSS v3.1 boundary function of cvss_base_score (nominal bands: low=0.1-3.9, medium=4.0-6.9, high=7.0-8.9, critical=9.0-10.0; the observed per-class ranges in the sample meet at approximately the 4.0, 7.0 and 9.0 cut points). Including cvss_base_score makes severity prediction trivial; excluding it leaves only weak signal (acc 0.55 vs majority 0.51, i.e. barely above baseline).",
      "observed_cvss_ranges_per_severity": {
        "critical": {
          "min": 9.0,
          "max": 10.0
        },
        "high": {
          "min": 7.0,
          "max": 9.0
        },
        "low": {
          "min": 1.77,
          "max": 4.0
        },
        "medium": {
          "min": 4.02,
          "max": 7.0
        }
      },
      "impact": "100% mechanical encoding. severity_class is not a useful ML target on this dataset."
    },
    "P5_lifecycle_to_remediation": {
      "target": "remediation_status (per-timestep)",
      "leak_column": "lifecycle_phase",
      "mechanism": "The 12-phase lifecycle state machine has multiple phases that deterministically pin remediation_status. ~83% of per-timestep rows have lifecycle_phase that determines remediation_status exactly.",
      "deterministic_phase_mappings": {
        "accepted_risk": {
          "maps_to": "in_remediation",
          "purity": 1.0,
          "n_rows": 16
        },
        "discovery": {
          "maps_to": "undetected",
          "purity": 1.0,
          "n_rows": 327
        },
        "false_positive_closed": {
          "maps_to": "in_remediation",
          "purity": 0.9944,
          "n_rows": 1421
        },
        "organisational_triage": {
          "maps_to": "triaged",
          "purity": 1.0,
          "n_rows": 18
        },
        "patch_release": {
          "maps_to": "undetected",
          "purity": 1.0,
          "n_rows": 33
        },
        "remediation_deployment": {
          "maps_to": "in_remediation",
          "purity": 1.0,
          "n_rows": 4362
        },
        "residual_risk_review": {
          "maps_to": "remediated",
          "purity": 1.0,
          "n_rows": 8921
        }
      },
      "impact": "Per-timestep targets remediation_status, patch_status, and lifecycle_phase form a tightly-coupled state machine; any two pin the third. All three appear as 0.95-0.98 accuracy in naive evaluation but are mechanically determined."
    },
    "P6_patch_to_remediation": {
      "target": "remediation_status (per-timestep)",
      "leak_column": "patch_status",
      "mechanism": "Of 6 patch_status values, at least 5 map near-deterministically to a single remediation_status value. `patch_status=deployed` -> 100% `remediated`; `patch_validated`/`vendor_notified`/`patch_in_development`/`patch_released` -> ~99% `in_remediation`.",
      "deterministic_status_mappings": {
        "deployed": {
          "maps_to": "remediated",
          "purity": 1.0,
          "n_rows": 8958
        },
        "patch_validated": {
          "maps_to": "in_remediation",
          "purity": 0.9941,
          "n_rows": 5293
        }
      },
      "impact": "patch_status alone is a near-oracle for remediation_status."
    },
    "P7_risk_score_composite": {
      "target": "all binary flag fields (indirect)",
      "leak_column": "risk_score_composite",
      "mechanism": "risk_score_composite is computed in the generator from cvss_base_score, epss_score_final, and the flag fields. Including it in features would launder flag information into the model via this composite.",
      "evidence": "Generator-side composite; correlation with all flag fields > 0.3.",
      "impact": "Precautionary drop. Affects all binary flag targets."
    },
    "P8_patch_lag_days": {
      "target": "remediation_success_flag (suspected)",
      "leak_column": "patch_lag_days",
      "mechanism": "Likely same sentinel-coding structure as time_to_remediate_days (120 sentinel for unpatched; lower values when patched). Dropped as precaution; not separately validated.",
      "impact": "Precautionary drop."
    }
  },
  "unlearnable_targets": [
    {
      "target": "exploitation_occurred_flag",
      "n_positives": 203,
      "majority_baseline": 0.9230477634571645,
      "honest_accuracy": 0.8569023569023568,
      "honest_roc_auc": 0.6534304796599878,
      "verdict": "below_majority"
    },
    {
      "target": "zero_day_flag",
      "n_positives": 76,
      "majority_baseline": 0.9711902956785443,
      "honest_accuracy": 0.9486531986531986,
      "honest_roc_auc": 0.6040141676505313,
      "verdict": "below_majority"
    },
    {
      "target": "cisa_kev_flag",
      "n_positives": 14,
      "majority_baseline": 0.9946929492039424,
      "honest_accuracy": 0.9924242424242425,
      "honest_roc_auc": 0.6125211505922166,
      "verdict": "below_majority"
    },
    {
      "target": "supply_chain_propagation_flag",
      "n_positives": 20,
      "majority_baseline": 0.9924184988627748,
      "honest_accuracy": 0.9915824915824917,
      "honest_roc_auc": 0.7950240316652529,
      "verdict": "below_majority"
    },
    {
      "target": "false_positive_flag",
      "n_positives": 205,
      "majority_baseline": 0.922289613343442,
      "honest_accuracy": 0.8661616161616162,
      "honest_roc_auc": 0.5172779496243923,
      "verdict": "below_majority"
    },
    {
      "target": "exploit_maturity_final (after cvss_temporal_score_final dropped)",
      "n_classes": 4,
      "majority_baseline": 0.35898407884761185,
      "honest_accuracy": 0.30639730639730645,
      "honest_roc_auc": 0.5731243306339614,
      "verdict": "below_majority"
    }
  ],
  "unlearnable_summary": "Six of the README's headline use cases are unlearnable on the sample after honest leak removal: exploitation_occurred_flag, zero_day_flag, cisa_kev_flag, supply_chain_propagation_flag, false_positive_flag, and exploit_maturity_final (the original primary candidate target before the cvss_temporal_score_final leakage was discovered). Only vulnerability_class learns honestly, and it gives the weakest baseline in the catalog (acc 0.24 vs majority 0.18).",
  "recommendations_to_dataset_author": [
    "Remove the deterministic CVSS v3.1 exploit-code-maturity multiplier from cvss_temporal_score_final calculation, or add per-vulnerability noise so the cvss_temporal/cvss_base ratio overlaps substantially across maturity tiers. As shipped, the per-tier ratio distributions overlap only in their tails, so the ratio identifies the tier for most rows.",
    "Replace -1 / 120 / etc. sentinel values in time_to_exploit_days, time_to_remediate_days, and patch_lag_days with probabilistic censoring that doesn't perfectly identify the outcome class. For example, use the latest observed time on partially-complete trajectories rather than a fixed sentinel.",
    "Decouple the lifecycle_phase -> remediation_status -> patch_status state machine. Real telemetry has noisy intermediate states (e.g. a vuln can move to patch_released without immediately being remediated). The current sample has 5+ pure deterministic edges in this graph.",
    "Add per-vulnerability-class feature signatures. The 8 classes differ in cvss_base_score means (5.4-8.3) but per-class feature distributions overlap heavily. Add class-specific EPSS distributions, asset-affinity, and disclosure-timeline patterns to make class prediction tractable from features.",
    "Increase positive-class counts for rare-event binaries in the sample: 14 cisa_kev positives, 20 supply_chain positives, and 76 zero_day positives are below the threshold for reliable minority-class ML evaluation at n=2638. Either upsample these in the sample or document them as full-product-only signals."
  ]
}