cyb008-baseline-classifier / leakage_diagnostic.json

Initial release: XGBoost + MLP for SOC alert triage outcome classification, with structural-leakage and unlearnable-target diagnostic

001717c verified 2 days ago

raw

history blame contribute delete

4.11 kB

	{
	"purpose": "Document the three structural oracle columns dropped from the primary feature pipeline, and the unlearnable-target finding for mitre_tactic. CYB008 is calibrated against 12 SOC-operations benchmarks but encodes the resolution_outcome label structurally into alert_lifecycle_phase, automation_resolved, and escalation_flag. Real SOC telemetry has substantial overlap between these signals; the sample does not.",
	"primary_target": "resolution_outcome (5-class)",
	"split": "StratifiedShuffleSplit, 70/15/15 nested",
	"oracle_structural_findings": {
	"alert_lifecycle_phase": {
	"deterministic_mapping": {
	"auto_closed": "100% -> auto_resolved_soar",
	"escalated": "100% -> true_positive_escalated",
	"suppressed_duplicate": "100% -> duplicate_merged",
	"resolved": "splits ~62/38 false_positive_closed / true_positive_remediated"
	},
	"note": "3 of 4 lifecycle phases are perfect class oracles. Drop required to evaluate honest learning."
	},
	"automation_resolved": {
	"deterministic_mapping": {
	"1": "100% -> auto_resolved_soar",
	"0": "0 cases of auto_resolved_soar"
	},
	"note": "Exact 1:1 oracle with auto_resolved_soar outcome class."
	},
	"escalation_flag": {
	"deterministic_mapping": {
	"1 (n=1875)": "1319 true_positive_escalated + 556 auto_resolved_soar",
	"0 (n=7325)": "0 cases of true_positive_escalated"
	},
	"note": "Near-perfect oracle for true_positive_escalated outcome."
	}
	},
	"ablation_experiments": [
	{
	"config": "full features (all oracles intact)",
	"n_features": 53,
	"accuracy": 1.0,
	"roc_auc": 1.0
	},
	{
	"config": "cumulative drop through alert_lifecycle_phase",
	"dropped_so_far": [
	"alert_lifecycle_phase"
	],
	"n_features": 49,
	"accuracy": 1.0,
	"roc_auc": 1.0
	},
	{
	"config": "cumulative drop through automation_resolved",
	"dropped_so_far": [
	"alert_lifecycle_phase",
	"automation_resolved"
	],
	"n_features": 48,
	"accuracy": 0.8388888888888889,
	"roc_auc": 0.9726930344746759
	},
	{
	"config": "cumulative drop through escalation_flag",
	"dropped_so_far": [
	"alert_lifecycle_phase",
	"automation_resolved",
	"escalation_flag"
	],
	"n_features": 47,
	"accuracy": 0.7898550724637681,
	"roc_auc": 0.9562439021017856
	}
	],
	"conclusion": "With all three oracle columns dropped, test accuracy is 0.79 (vs 1.00 with oracles intact, and 0.33 majority baseline). The honest model still ROC-AUC 0.96 on a 5-class task - real learning, real signal, no mechanical leakage. The published baseline trains with the three oracle columns excluded.",
	"mitre_tactic_unlearnable": {
	"purpose": "The CYB008 README's first suggested use case is 'MITRE ATT&CK tactic classification from alert features'. We test this on the sample dataset and find it is NOT LEARNABLE - features do not distinguish tactics, the model performs below majority baseline.",
	"task": "mitre_tactic 12-class (with mitre_technique_id excluded - it would be a perfect ATT&CK oracle)",
	"majority_baseline_accuracy": 0.14097826086956522,
	"xgboost_accuracy_mean_3seeds": 0.07971014492753624,
	"interpretation": "Per-tactic feature distributions are nearly identical (raw_score 0.37-0.39, enriched_score similar, fatigue 0.64 across all 12 tactics). Without mitre_technique_id (which is a 100% ATT&CK-by-design oracle), alert_source is the only discriminating signal, and it has cross-tactic purity of 0.14 - close to random. Real SOC telemetry has stronger source-to-tactic associations and per-tactic feature distributions; the sample does not reproduce these.",
	"recommendation_to_dataset_author": "To make tactic classification a viable benchmark, the generator should produce stronger per-tactic feature signatures (differentiated raw_score / enriched_score distributions per tactic, source-tactic affinity > 0.3 purity, characteristic MTTD / MTTR per tactic)."
	}
	}