Initial release: XGBoost + MLP for user-risk-tier classification, plus structural-leakage diagnostic on threat-actor detection
e6a6835 verified | { | |
| "purpose": "Document why threat_actor_capability_tier (the README's stated headline use case) was NOT shipped as the primary baseline. Every oracle feature group is independently sufficient for 100% test accuracy on threat-actor binary detection; even with all 6 groups dropped, accuracy stays >97%. This is a structural property of the sample's generator (non-overlapping anomaly distributions between threat and legitimate sessions), not a methodology failure. Real-world identity telemetry has substantial overlap; this sample dataset does not reproduce it.", | |
| "target": "threat_actor_capability_tier != 'none' (binary)", | |
| "split": "GroupShuffleSplit by user_id, 70/15/15 nested", | |
| "non_overlapping_distributions": { | |
| "velocity_anomaly_score": { | |
| "actor_range": [ | |
| 0.5213, | |
| 0.8181 | |
| ], | |
| "non_actor_range": [ | |
| 0.0, | |
| 0.2469 | |
| ], | |
| "actor_mean": 0.651, | |
| "non_actor_mean": 0.053 | |
| }, | |
| "session_timestamp_utc": { | |
| "actor_range": [ | |
| 6417, | |
| 1440062 | |
| ], | |
| "non_actor_range": [ | |
| 1445187, | |
| 18000137 | |
| ], | |
| "note": "Actor sessions and non-actor sessions occupy disjoint time windows" | |
| }, | |
| "credential_attempt_count": { | |
| "actor_range": [ | |
| 1, | |
| 59 | |
| ], | |
| "non_actor_range": [ | |
| 1, | |
| 2 | |
| ], | |
| "actor_mean": 12.9, | |
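| "non_actor_mean": 1.07, | |
| "note": "Ranges overlap at 1-2 attempts; the separation is in the tail: non-actor sessions never exceed 2 attempts, while actor sessions reach 59." | |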
| }, | |
| "login_outcome": { | |
| "actor_only_values": [ | |
| "failure_account_locked", | |
| "account_takeover_confirmed", | |
| "session_hijacked", | |
| "success_anomalous" | |
| ], | |
| "non_actor_only_values": [ | |
| "success_normal" | |
| ], | |
| "note": "success_normal is 4306 non-actor / 0 actor rows; failure_account_locked is 0 non-actor / 186 actor rows." | |
| } | |
| }, | |
| "ablation_experiments": [ | |
| { | |
| "config": "full features (all oracles intact)", | |
| "n_features": 166, | |
| "accuracy": 1.0, | |
| "roc_auc": 1.0 | |
| }, | |
| { | |
| "config": "cumulative drop through behavioural_oracles", | |
| "dropped_so_far": [ | |
| "velocity_anomaly_score", | |
| "credential_attempt_count", | |
| "session_timestamp_utc" | |
| ], | |
| "n_features": 163, | |
| "accuracy": 0.9991111111111112, | |
| "roc_auc": 1.0 | |
| }, | |
| { | |
| "config": "cumulative drop through outcome_oracle", | |
| "dropped_so_far": [ | |
| "velocity_anomaly_score", | |
| "credential_attempt_count", | |
| "session_timestamp_utc", | |
| "login_outcome" | |
| ], | |
| "n_features": 154, | |
| "accuracy": 0.9982222222222222, | |
| "roc_auc": 0.9999714285714285 | |
| }, | |
| { | |
| "config": "cumulative drop through geo_oracle", | |
| "dropped_so_far": [ | |
| "velocity_anomaly_score", | |
| "credential_attempt_count", | |
| "session_timestamp_utc", | |
| "login_outcome", | |
| "geo_country_code" | |
| ], | |
| "n_features": 138, | |
| "accuracy": 0.9986666666666667, | |
| "roc_auc": 0.9999619047619047 | |
| }, | |
| { | |
| "config": "cumulative drop through device_oracle", | |
| "dropped_so_far": [ | |
| "velocity_anomaly_score", | |
| "credential_attempt_count", | |
| "session_timestamp_utc", | |
| "login_outcome", | |
| "geo_country_code", | |
| "device_trust_level" | |
| ], | |
| "n_features": 133, | |
| "accuracy": 0.9982222222222222, | |
| "roc_auc": 0.9999047619047619 | |
| }, | |
| { | |
| "config": "cumulative drop through user_risk_oracle", | |
| "dropped_so_far": [ | |
| "velocity_anomaly_score", | |
| "credential_attempt_count", | |
| "session_timestamp_utc", | |
| "login_outcome", | |
| "geo_country_code", | |
| "device_trust_level", | |
| "user_risk_tier" | |
| ], | |
| "n_features": 130, | |
| "accuracy": 0.9977777777777778, | |
| "roc_auc": 0.9996095238095238 | |
| }, | |
| { | |
| "config": "cumulative drop through anomaly_signal", | |
| "dropped_so_far": [ | |
| "velocity_anomaly_score", | |
| "credential_attempt_count", | |
| "session_timestamp_utc", | |
| "login_outcome", | |
| "geo_country_code", | |
| "device_trust_level", | |
| "user_risk_tier", | |
| "geo_anomaly_score" | |
| ], | |
| "n_features": 129, | |
| "accuracy": 0.9706666666666667, | |
| "roc_auc": 0.9896857142857143 | |
| } | |
| ], | |
| "conclusion": "Even with all six oracle feature groups removed (37 feature columns dropped, from 166 down to 129), the residual feature set still yields 97% test accuracy and AUC 0.99 on threat-actor binary detection. The leakage is not localised \u2014 it is distributed across the entire feature space because the generator produces threat-actor sessions that are anomalous on every dimension simultaneously without overlap. A buyer planning to train a real detection model on this dataset should know that the sample's headline detection task is not a representative ML problem.", | |
| "recommendation_to_dataset_author": "Increase distributional overlap between threat-actor and legitimate session populations across all anomaly indicators: velocity score, credential attempt count, session timestamp window, geo anomaly score, geo country code frequency, device trust level, user risk tier, login outcome class. Real-world detection systems operate at AUC 0.7-0.9, not 1.0; the sample should reflect that operating regime." | |
| } |