cyb006-baseline-classifier / leakage_diagnostic.json

Initial release: XGBoost + MLP for user-risk-tier classification, plus structural-leakage diagnostic on threat-actor detection

e6a6835 verified 2 days ago

raw

history blame contribute delete

5.16 kB

	{
	"purpose": "Document why threat_actor_capability_tier (the README's stated headline use case) was NOT shipped as the primary baseline. Every oracle feature group is independently sufficient for 100% test accuracy on threat-actor binary detection; even with all 6 groups dropped, accuracy stays >97%. This is a structural property of the sample's generator (non-overlapping anomaly distributions between threat and legitimate sessions), not a methodology failure. Real-world identity telemetry has substantial overlap; this sample dataset does not reproduce it.",
	"target": "threat_actor_capability_tier != 'none' (binary)",
	"split": "GroupShuffleSplit by user_id, 70/15/15 nested",
	"non_overlapping_distributions": {
	"velocity_anomaly_score": {
	"actor_range": [
	0.5213,
	0.8181
	],
	"non_actor_range": [
	0.0,
	0.2469
	],
	"actor_mean": 0.651,
	"non_actor_mean": 0.053
	},
	"session_timestamp_utc": {
	"actor_range": [
	6417,
	1440062
	],
	"non_actor_range": [
	1445187,
	18000137
	],
	"note": "Actor sessions and non-actor sessions occupy disjoint time windows"
	},
	"credential_attempt_count": {
	"actor_range": [
	1,
	59
	],
	"non_actor_range": [
	1,
	2
	],
	"actor_mean": 12.9,
	"non_actor_mean": 1.07
	},
	"login_outcome": {
	"actor_only_values": [
	"failure_account_locked",
	"account_takeover_confirmed",
	"session_hijacked",
	"success_anomalous"
	],
	"non_actor_only_values": [
	"success_normal"
	],
	"note": "success_normal is 4306 non-actor / 0 actor rows; failure_account_locked is 0 non-actor / 186 actor rows."
	}
	},
	"ablation_experiments": [
	{
	"config": "full features (all oracles intact)",
	"n_features": 166,
	"accuracy": 1.0,
	"roc_auc": 1.0
	},
	{
	"config": "cumulative drop through behavioural_oracles",
	"dropped_so_far": [
	"velocity_anomaly_score",
	"credential_attempt_count",
	"session_timestamp_utc"
	],
	"n_features": 163,
	"accuracy": 0.9991111111111112,
	"roc_auc": 1.0
	},
	{
	"config": "cumulative drop through outcome_oracle",
	"dropped_so_far": [
	"velocity_anomaly_score",
	"credential_attempt_count",
	"session_timestamp_utc",
	"login_outcome"
	],
	"n_features": 154,
	"accuracy": 0.9982222222222222,
	"roc_auc": 0.9999714285714285
	},
	{
	"config": "cumulative drop through geo_oracle",
	"dropped_so_far": [
	"velocity_anomaly_score",
	"credential_attempt_count",
	"session_timestamp_utc",
	"login_outcome",
	"geo_country_code"
	],
	"n_features": 138,
	"accuracy": 0.9986666666666667,
	"roc_auc": 0.9999619047619047
	},
	{
	"config": "cumulative drop through device_oracle",
	"dropped_so_far": [
	"velocity_anomaly_score",
	"credential_attempt_count",
	"session_timestamp_utc",
	"login_outcome",
	"geo_country_code",
	"device_trust_level"
	],
	"n_features": 133,
	"accuracy": 0.9982222222222222,
	"roc_auc": 0.9999047619047619
	},
	{
	"config": "cumulative drop through user_risk_oracle",
	"dropped_so_far": [
	"velocity_anomaly_score",
	"credential_attempt_count",
	"session_timestamp_utc",
	"login_outcome",
	"geo_country_code",
	"device_trust_level",
	"user_risk_tier"
	],
	"n_features": 130,
	"accuracy": 0.9977777777777778,
	"roc_auc": 0.9996095238095238
	},
	{
	"config": "cumulative drop through anomaly_signal",
	"dropped_so_far": [
	"velocity_anomaly_score",
	"credential_attempt_count",
	"session_timestamp_utc",
	"login_outcome",
	"geo_country_code",
	"device_trust_level",
	"user_risk_tier",
	"geo_anomaly_score"
	],
	"n_features": 129,
	"accuracy": 0.9706666666666667,
	"roc_auc": 0.9896857142857143
	}
	],
	"conclusion": "Even with all six oracle feature groups removed (40+ columns dropped), the residual feature set still yields 97% test accuracy and AUC 0.99 on threat-actor binary detection. The leakage is not localised \u2014 it is distributed across the entire feature space because the generator produces threat-actor sessions that are anomalous on every dimension simultaneously without overlap. A buyer planning to train a real detection model on this dataset should know that the sample's headline detection task is not a representative ML problem.",
	"recommendation_to_dataset_author": "Increase distributional overlap between threat-actor and legitimate session populations across all anomaly indicators: velocity score, credential attempt count, geo anomaly score, geo country code frequency, device trust level, login outcome class. Real-world detection systems operate at AUC 0.7-0.9, not 1.0; the sample should reflect that operating regime."
	}