{
"purpose": "Document why threat_actor_capability_tier (the README's stated headline use case) was NOT shipped as the primary baseline. With all features intact the model reaches 100% test accuracy on threat-actor binary detection, and the oracle feature groups are mutually redundant: accuracy stays above 99.7% while the first five groups are cumulatively dropped, and even with all 6 groups dropped it stays >97%. This is a structural property of the sample's generator (largely non-overlapping anomaly distributions between threat and legitimate sessions), not a methodology failure. Real-world identity telemetry has substantial overlap; this sample dataset does not reproduce it.",
"target": "threat_actor_capability_tier != 'none' (binary)",
"split": "GroupShuffleSplit by user_id, 70/15/15 nested",
"non_overlapping_distributions": {
"velocity_anomaly_score": {
"actor_range": [
0.5213,
0.8181
],
"non_actor_range": [
0.0,
0.2469
],
"actor_mean": 0.651,
"non_actor_mean": 0.053
},
"session_timestamp_utc": {
"actor_range": [
6417,
1440062
],
"non_actor_range": [
1445187,
18000137
],
"note": "Actor sessions and non-actor sessions occupy disjoint time windows"
},
"credential_attempt_count": {
"actor_range": [
1,
59
],
"non_actor_range": [
1,
2
],
"actor_mean": 12.9,
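"non_actor_mean": 1.07,
"note": "Ranges overlap at 1-2 attempts; the classes separate on the mean and the upper tail (non-actor sessions never exceed 2 attempts), not on strictly disjoint ranges."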
},
"login_outcome": {
"actor_only_values": [
"failure_account_locked",
"account_takeover_confirmed",
"session_hijacked",
"success_anomalous"
],
"non_actor_only_values": [
"success_normal"
],
"note": "success_normal is 4306 non-actor / 0 actor rows; failure_account_locked is 0 non-actor / 186 actor rows."
}
},
"ablation_experiments": [
{
"config": "full features (all oracles intact)",
"n_features": 166,
"accuracy": 1.0,
"roc_auc": 1.0
},
{
"config": "cumulative drop through behavioural_oracles",
"dropped_so_far": [
"velocity_anomaly_score",
"credential_attempt_count",
"session_timestamp_utc"
],
"n_features": 163,
"accuracy": 0.9991111111111112,
"roc_auc": 1.0
},
{
"config": "cumulative drop through outcome_oracle",
"dropped_so_far": [
"velocity_anomaly_score",
"credential_attempt_count",
"session_timestamp_utc",
"login_outcome"
],
"n_features": 154,
"accuracy": 0.9982222222222222,
"roc_auc": 0.9999714285714285
},
{
"config": "cumulative drop through geo_oracle",
"dropped_so_far": [
"velocity_anomaly_score",
"credential_attempt_count",
"session_timestamp_utc",
"login_outcome",
"geo_country_code"
],
"n_features": 138,
"accuracy": 0.9986666666666667,
"roc_auc": 0.9999619047619047
},
{
"config": "cumulative drop through device_oracle",
"dropped_so_far": [
"velocity_anomaly_score",
"credential_attempt_count",
"session_timestamp_utc",
"login_outcome",
"geo_country_code",
"device_trust_level"
],
"n_features": 133,
"accuracy": 0.9982222222222222,
"roc_auc": 0.9999047619047619
},
{
"config": "cumulative drop through user_risk_oracle",
"dropped_so_far": [
"velocity_anomaly_score",
"credential_attempt_count",
"session_timestamp_utc",
"login_outcome",
"geo_country_code",
"device_trust_level",
"user_risk_tier"
],
"n_features": 130,
"accuracy": 0.9977777777777778,
"roc_auc": 0.9996095238095238
},
{
"config": "cumulative drop through anomaly_signal",
"dropped_so_far": [
"velocity_anomaly_score",
"credential_attempt_count",
"session_timestamp_utc",
"login_outcome",
"geo_country_code",
"device_trust_level",
"user_risk_tier",
"geo_anomaly_score"
],
"n_features": 129,
"accuracy": 0.9706666666666667,
"roc_auc": 0.9896857142857143
}
],
"conclusion": "Even with all six oracle feature groups removed (37 encoded columns dropped, from 166 to 129 features), the residual feature set still yields 97% test accuracy and AUC 0.99 on threat-actor binary detection. The leakage is not localised \u2014 it is distributed across the entire feature space because the generator produces threat-actor sessions that are anomalous on every dimension simultaneously, with little or no overlap. A buyer planning to train a real detection model on this dataset should know that the sample's headline detection task is not a representative ML problem.",
"recommendation_to_dataset_author": "Increase distributional overlap between threat-actor and legitimate session populations across all anomaly indicators: velocity score, credential attempt count, session timestamp window, geo anomaly score, geo country code frequency, device trust level, user risk tier, login outcome class. Real-world detection systems typically operate at AUC 0.7-0.9, not 1.0; the sample should reflect that operating regime."
}