cyb006-baseline-classifier / leakage_diagnostic.json
pradeep-xpert's picture
Initial release: XGBoost + MLP for user-risk-tier classification, plus structural-leakage diagnostic on threat-actor detection
e6a6835 verified
{
"purpose": "Document why threat_actor_capability_tier (the README's stated headline use case) was NOT shipped as the primary baseline. Every oracle feature group is independently sufficient for 100% test accuracy on threat-actor binary detection; even with all 6 groups dropped, accuracy stays >97%. This is a structural property of the sample's generator (non-overlapping anomaly distributions between threat and legitimate sessions), not a methodology failure. Real-world identity telemetry has substantial overlap; this sample dataset does not reproduce it.",
"target": "threat_actor_capability_tier != 'none' (binary)",
"split": "GroupShuffleSplit by user_id, 70/15/15 nested",
"non_overlapping_distributions": {
"velocity_anomaly_score": {
"actor_range": [
0.5213,
0.8181
],
"non_actor_range": [
0.0,
0.2469
],
"actor_mean": 0.651,
"non_actor_mean": 0.053
},
"session_timestamp_utc": {
"actor_range": [
6417,
1440062
],
"non_actor_range": [
1445187,
18000137
],
"note": "Actor sessions and non-actor sessions occupy disjoint time windows"
},
"credential_attempt_count": {
"actor_range": [
1,
59
],
"non_actor_range": [
1,
2
],
"actor_mean": 12.9,
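"non_actor_mean": 1.07,
"note": "Ranges overlap at 1-2 attempts, so this feature is not strictly disjoint; the separation comes from the means and from the actor upper tail (up to 59 attempts)."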
},
"login_outcome": {
"actor_only_values": [
"failure_account_locked",
"account_takeover_confirmed",
"session_hijacked",
"success_anomalous"
],
"non_actor_only_values": [
"success_normal"
],
"note": "success_normal is 4306 non-actor / 0 actor rows; failure_account_locked is 0 non-actor / 186 actor rows."
}
},
"ablation_experiments": [
{
"config": "full features (all oracles intact)",
"n_features": 166,
"accuracy": 1.0,
"roc_auc": 1.0
},
{
"config": "cumulative drop through behavioural_oracles",
"dropped_so_far": [
"velocity_anomaly_score",
"credential_attempt_count",
"session_timestamp_utc"
],
"n_features": 163,
"accuracy": 0.9991111111111112,
"roc_auc": 1.0
},
{
"config": "cumulative drop through outcome_oracle",
"dropped_so_far": [
"velocity_anomaly_score",
"credential_attempt_count",
"session_timestamp_utc",
"login_outcome"
],
"n_features": 154,
"accuracy": 0.9982222222222222,
"roc_auc": 0.9999714285714285
},
{
"config": "cumulative drop through geo_oracle",
"dropped_so_far": [
"velocity_anomaly_score",
"credential_attempt_count",
"session_timestamp_utc",
"login_outcome",
"geo_country_code"
],
"n_features": 138,
"accuracy": 0.9986666666666667,
"roc_auc": 0.9999619047619047
},
{
"config": "cumulative drop through device_oracle",
"dropped_so_far": [
"velocity_anomaly_score",
"credential_attempt_count",
"session_timestamp_utc",
"login_outcome",
"geo_country_code",
"device_trust_level"
],
"n_features": 133,
"accuracy": 0.9982222222222222,
"roc_auc": 0.9999047619047619
},
{
"config": "cumulative drop through user_risk_oracle",
"dropped_so_far": [
"velocity_anomaly_score",
"credential_attempt_count",
"session_timestamp_utc",
"login_outcome",
"geo_country_code",
"device_trust_level",
"user_risk_tier"
],
"n_features": 130,
"accuracy": 0.9977777777777778,
"roc_auc": 0.9996095238095238
},
{
"config": "cumulative drop through anomaly_signal",
"dropped_so_far": [
"velocity_anomaly_score",
"credential_attempt_count",
"session_timestamp_utc",
"login_outcome",
"geo_country_code",
"device_trust_level",
"user_risk_tier",
"geo_anomaly_score"
],
"n_features": 129,
"accuracy": 0.9706666666666667,
"roc_auc": 0.9896857142857143
}
],
"conclusion": "Even with all six oracle feature groups removed (37 columns dropped, 166 to 129 features), the residual feature set still yields 97% test accuracy and AUC 0.99 on threat-actor binary detection. The leakage is not localised \u2014 it is distributed across the entire feature space because the generator produces threat-actor sessions that are anomalous on every dimension simultaneously without overlap. A buyer planning to train a real detection model on this dataset should know that the sample's headline detection task is not a representative ML problem.",
"recommendation_to_dataset_author": "Increase distributional overlap between threat-actor and legitimate session populations across all anomaly indicators: velocity score, credential attempt count, geo anomaly score, geo country code frequency, device trust level, login outcome class. Also interleave threat-actor and legitimate sessions in time, since session_timestamp_utc currently places the two populations in disjoint windows. Real-world detection systems typically operate at AUC 0.7-0.9, not 1.0; the sample should reflect that operating regime."
}