{ "version": "1.0.0", "dataset": "xpertsystems/cyb008-sample", "task": "5-class resolution_outcome classification (SOC alert triage)", "baselines": { "always_predict_majority_accuracy": 0.32608695652173914, "majority_class": "false_positive_closed", "random_guess_accuracy": 0.2 }, "split": { "strategy": "stratified (StratifiedShuffleSplit, nested 70/15/15)", "rationale": "CYB008 has no natural row-level group key: 25 analysts (group-aware split would yield ~4 test analysts), 5 SOCs (would yield 1 test SOC), 589 incidents but only 9% of alerts have a non-null incident_id. Alerts are essentially independent given features, so stratified random split is the right choice (same approach as CYB001 for network flow classification).", "alerts_train": 6440, "alerts_val": 1380, "alerts_test": 1380, "seed": 42 }, "n_features": 53, "label_classes": [ "auto_resolved_soar", "duplicate_merged", "false_positive_closed", "true_positive_remediated", "true_positive_escalated" ], "class_distribution_train": { "false_positive_closed": 2097, "auto_resolved_soar": 1849, "true_positive_remediated": 1294, "true_positive_escalated": 923, "duplicate_merged": 277 }, "class_distribution_test": { "false_positive_closed": 450, "auto_resolved_soar": 396, "true_positive_remediated": 277, "true_positive_escalated": 198, "duplicate_merged": 59 }, "oracle_excluded_features": [ "alert_lifecycle_phase (deterministically maps to 3 of 5 outcomes)", "automation_resolved (1:1 with auto_resolved_soar)", "escalation_flag (near 1:1 with true_positive_escalated)" ], "high_cardinality_excluded_features": [ "mitre_technique_id (36 unique values; perfect oracle for mitre_tactic but unrelated to this target)", "detection_rule_id (656 unique values; one-hot explosion)" ], "leakage_audit_note": "See leakage_diagnostic.json for the full audit of structural oracles and the separate unlearnable-target finding for mitre_tactic. \nThe model is trained with all three oracle columns excluded; full-features experiments showed 100% test accuracy, confirming the structural leakage.", "models": { "xgboost": { "architecture": "Gradient-boosted decision trees, multi:softprob, 5 classes", "framework": "xgboost", "test_metrics": { "model": "xgboost", "accuracy": 0.7659420289855072, "macro_f1": 0.7429876131468711, "weighted_f1": 0.7669168766123218, "per_class_f1": { "auto_resolved_soar": 0.7572383073496659, "duplicate_merged": 0.7936507936507936, "false_positive_closed": 0.9038461538461539, "true_positive_remediated": 0.7012987012987013, "true_positive_escalated": 0.5589041095890411 }, "confusion_matrix": { "labels": [ "auto_resolved_soar", "duplicate_merged", "false_positive_closed", "true_positive_remediated", "true_positive_escalated" ], "matrix": [ [ 340, 17, 6, 16, 17 ], [ 9, 50, 0, 0, 0 ], [ 74, 0, 376, 0, 0 ], [ 40, 0, 0, 189, 48 ], [ 39, 0, 0, 57, 102 ] ] }, "macro_roc_auc_ovr": 0.9522005654044479 } }, "mlp": { "architecture": "PyTorch MLP, 53 -> 128 -> 64 -> 5, BatchNorm1d + ReLU + Dropout, weighted cross-entropy loss", "framework": "pytorch", "test_metrics": { "model": "mlp", "accuracy": 0.7673913043478261, "macro_f1": 0.7510024599009764, "weighted_f1": 0.769556192579193, "per_class_f1": { "auto_resolved_soar": 0.7505773672055427, "duplicate_merged": 0.8251748251748252, "false_positive_closed": 0.910411622276029, "true_positive_remediated": 0.6981818181818182, "true_positive_escalated": 0.5706666666666667 }, "confusion_matrix": { "labels": [ "auto_resolved_soar", "duplicate_merged", "false_positive_closed", "true_positive_remediated", "true_positive_escalated" ], "matrix": [ [ 325, 25, 0, 23, 23 ], [ 0, 59, 0, 0, 0 ], [ 74, 0, 376, 0, 0 ], [ 38, 0, 0, 192, 47 ], [ 33, 0, 0, 58, 107 ] ] }, "macro_roc_auc_ovr": 0.9552409409036638 } } } }