cyb008-baseline-classifier / leakage_diagnostic.json
pradeep-xpert's picture
Initial release: XGBoost + MLP for SOC alert triage outcome classification, with structural-leakage and unlearnable-target diagnostic
001717c verified
{
"purpose": "Document the three structural oracle columns dropped from the primary feature pipeline, and the unlearnable-target finding for mitre_tactic. CYB008 is calibrated against 12 SOC-operations benchmarks but encodes the resolution_outcome label structurally into alert_lifecycle_phase, automation_resolved, and escalation_flag. Real SOC telemetry has substantial overlap between these signals; the sample does not.",
"primary_target": "resolution_outcome (5-class)",
"split": "StratifiedShuffleSplit, 70/15/15 nested",
"oracle_structural_findings": {
"alert_lifecycle_phase": {
"deterministic_mapping": {
"auto_closed": "100% -> auto_resolved_soar",
"escalated": "100% -> true_positive_escalated",
"suppressed_duplicate": "100% -> duplicate_merged",
"resolved": "splits ~62/38 false_positive_closed / true_positive_remediated"
},
"note": "3 of 4 lifecycle phases are perfect class oracles. Drop required to evaluate honest learning."
},
"automation_resolved": {
"deterministic_mapping": {
"1": "100% -> auto_resolved_soar",
"0": "0 cases of auto_resolved_soar"
},
"note": "Exact 1:1 oracle with auto_resolved_soar outcome class."
},
"escalation_flag": {
"deterministic_mapping": {
"1 (n=1875)": "1319 true_positive_escalated + 556 auto_resolved_soar",
"0 (n=7325)": "0 cases of true_positive_escalated"
},
"note": "Near-perfect oracle for true_positive_escalated outcome."
}
},
"ablation_experiments": [
{
"config": "full features (all oracles intact)",
"n_features": 53,
"accuracy": 1.0,
"roc_auc": 1.0
},
{
"config": "cumulative drop through alert_lifecycle_phase",
"dropped_so_far": [
"alert_lifecycle_phase"
],
"n_features": 49,
"accuracy": 1.0,
"roc_auc": 1.0
},
{
"config": "cumulative drop through automation_resolved",
"dropped_so_far": [
"alert_lifecycle_phase",
"automation_resolved"
],
"n_features": 48,
"accuracy": 0.8388888888888889,
"roc_auc": 0.9726930344746759
},
{
"config": "cumulative drop through escalation_flag",
"dropped_so_far": [
"alert_lifecycle_phase",
"automation_resolved",
"escalation_flag"
],
"n_features": 47,
"accuracy": 0.7898550724637681,
"roc_auc": 0.9562439021017856
}
],
"conclusion": "With all three oracle columns dropped, test accuracy is 0.79 (vs 1.00 with oracles intact, and 0.33 majority baseline). The honest model still ROC-AUC 0.96 on a 5-class task - real learning, real signal, no mechanical leakage. The published baseline trains with the three oracle columns excluded.",
"mitre_tactic_unlearnable": {
"purpose": "The CYB008 README's first suggested use case is 'MITRE ATT&CK tactic classification from alert features'. We test this on the sample dataset and find it is NOT LEARNABLE - features do not distinguish tactics, the model performs below majority baseline.",
"task": "mitre_tactic 12-class (with mitre_technique_id excluded - it would be a perfect ATT&CK oracle)",
"majority_baseline_accuracy": 0.14097826086956522,
"xgboost_accuracy_mean_3seeds": 0.07971014492753624,
"interpretation": "Per-tactic feature distributions are nearly identical (raw_score 0.37-0.39, enriched_score similar, fatigue 0.64 across all 12 tactics). Without mitre_technique_id (which is a 100% ATT&CK-by-design oracle), alert_source is the only discriminating signal, and it has cross-tactic purity of 0.14 - close to random. Real SOC telemetry has stronger source-to-tactic associations and per-tactic feature distributions; the sample does not reproduce these.",
"recommendation_to_dataset_author": "To make tactic classification a viable benchmark, the generator should produce stronger per-tactic feature signatures (differentiated raw_score / enriched_score distributions per tactic, source-tactic affinity > 0.3 purity, characteristic MTTD / MTTR per tactic)."
}
}