File size: 4,113 Bytes
001717c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
{
  "purpose": "Document the three structural oracle columns dropped from the primary feature pipeline, and the unlearnable-target finding for mitre_tactic. CYB008 is calibrated against 12 SOC-operations benchmarks but encodes the resolution_outcome label structurally into alert_lifecycle_phase, automation_resolved, and escalation_flag. Real SOC telemetry has substantial overlap between these signals; the sample does not.",
  "primary_target": "resolution_outcome (5-class)",
  "split": "StratifiedShuffleSplit, 70/15/15 nested",
  "oracle_structural_findings": {
    "alert_lifecycle_phase": {
      "deterministic_mapping": {
        "auto_closed": "100% -> auto_resolved_soar",
        "escalated": "100% -> true_positive_escalated",
        "suppressed_duplicate": "100% -> duplicate_merged",
        "resolved": "splits ~62/38 false_positive_closed / true_positive_remediated"
      },
      "note": "3 of 4 lifecycle phases are perfect class oracles. Drop required to evaluate honest learning."
    },
    "automation_resolved": {
      "deterministic_mapping": {
        "1": "100% -> auto_resolved_soar",
        "0": "0 cases of auto_resolved_soar"
      },
      "note": "Exact 1:1 oracle with auto_resolved_soar outcome class."
    },
    "escalation_flag": {
      "deterministic_mapping": {
        "1 (n=1875)": "1319 true_positive_escalated + 556 auto_resolved_soar",
        "0 (n=7325)": "0 cases of true_positive_escalated"
      },
      "note": "Near-perfect oracle for true_positive_escalated outcome."
    }
  },
  "ablation_experiments": [
    {
      "config": "full features (all oracles intact)",
      "n_features": 53,
      "accuracy": 1.0,
      "roc_auc": 1.0
    },
    {
      "config": "cumulative drop through alert_lifecycle_phase",
      "dropped_so_far": [
        "alert_lifecycle_phase"
      ],
      "n_features": 49,
      "accuracy": 1.0,
      "roc_auc": 1.0
    },
    {
      "config": "cumulative drop through automation_resolved",
      "dropped_so_far": [
        "alert_lifecycle_phase",
        "automation_resolved"
      ],
      "n_features": 48,
      "accuracy": 0.8388888888888889,
      "roc_auc": 0.9726930344746759
    },
    {
      "config": "cumulative drop through escalation_flag",
      "dropped_so_far": [
        "alert_lifecycle_phase",
        "automation_resolved",
        "escalation_flag"
      ],
      "n_features": 47,
      "accuracy": 0.7898550724637681,
      "roc_auc": 0.9562439021017856
    }
  ],
  "conclusion": "With all three oracle columns dropped, test accuracy is 0.79 (vs 1.00 with oracles intact, and 0.33 majority baseline). The honest model still ROC-AUC 0.96 on a 5-class task - real learning, real signal, no mechanical leakage. The published baseline trains with the three oracle columns excluded.",
  "mitre_tactic_unlearnable": {
    "purpose": "The CYB008 README's first suggested use case is 'MITRE ATT&CK tactic classification from alert features'. We test this on the sample dataset and find it is NOT LEARNABLE - features do not distinguish tactics, the model performs below majority baseline.",
    "task": "mitre_tactic 12-class (with mitre_technique_id excluded - it would be a perfect ATT&CK oracle)",
    "majority_baseline_accuracy": 0.14097826086956522,
    "xgboost_accuracy_mean_3seeds": 0.07971014492753624,
    "interpretation": "Per-tactic feature distributions are nearly identical (raw_score 0.37-0.39, enriched_score similar, fatigue 0.64 across all 12 tactics). Without mitre_technique_id (which is a 100% ATT&CK-by-design oracle), alert_source is the only discriminating signal, and it has cross-tactic purity of 0.14 - close to random. Real SOC telemetry has stronger source-to-tactic associations and per-tactic feature distributions; the sample does not reproduce these.",
    "recommendation_to_dataset_author": "To make tactic classification a viable benchmark, the generator should produce stronger per-tactic feature signatures (differentiated raw_score / enriched_score distributions per tactic, source-tactic affinity > 0.3 purity, characteristic MTTD / MTTR per tactic)."
  }
}