Initial release: XGBoost + MLP for user-risk-tier classification, plus structural-leakage diagnostic on threat-actor detection
Browse files- README.md +503 -0
- ablation_results.json +209 -0
- feature_engineering.py +374 -0
- feature_meta.json +93 -0
- feature_scaler.json +1 -0
- inference_example.ipynb +322 -0
- leakage_diagnostic.json +145 -0
- model_mlp.safetensors +3 -0
- model_xgb.json +0 -0
- multi_seed_results.json +98 -0
- validation_results.json +126 -0
README.md
ADDED
|
@@ -0,0 +1,503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: cc-by-nc-4.0
|
| 3 |
+
library_name: pytorch
|
| 4 |
+
tags:
|
| 5 |
+
- cybersecurity
|
| 6 |
+
- identity-security
|
| 7 |
+
- insider-threat
|
| 8 |
+
- ueba
|
| 9 |
+
- user-risk-scoring
|
| 10 |
+
- tabular-classification
|
| 11 |
+
- synthetic-data
|
| 12 |
+
- xgboost
|
| 13 |
+
- baseline
|
| 14 |
+
- leakage-diagnostic
|
| 15 |
+
pipeline_tag: tabular-classification
|
| 16 |
+
base_model: []
|
| 17 |
+
datasets:
|
| 18 |
+
- xpertsystems/cyb006-sample
|
| 19 |
+
metrics:
|
| 20 |
+
- accuracy
|
| 21 |
+
- f1
|
| 22 |
+
- roc_auc
|
| 23 |
+
model-index:
|
| 24 |
+
- name: cyb006-baseline-classifier
|
| 25 |
+
results:
|
| 26 |
+
- task:
|
| 27 |
+
type: tabular-classification
|
| 28 |
+
name: 3-class user risk tier classification
|
| 29 |
+
dataset:
|
| 30 |
+
type: xpertsystems/cyb006-sample
|
| 31 |
+
name: CYB006 Synthetic Login Activity Dataset (Sample)
|
| 32 |
+
metrics:
|
| 33 |
+
- type: roc_auc
|
| 34 |
+
value: 0.8017
|
| 35 |
+
name: Test macro ROC-AUC OvR (XGBoost, seed 42)
|
| 36 |
+
- type: accuracy
|
| 37 |
+
value: 0.6667
|
| 38 |
+
name: Test accuracy (XGBoost, seed 42)
|
| 39 |
+
- type: f1
|
| 40 |
+
value: 0.6454
|
| 41 |
+
name: Test macro-F1 (XGBoost, seed 42)
|
| 42 |
+
- type: accuracy
|
| 43 |
+
value: 0.700
|
| 44 |
+
name: Multi-seed accuracy mean ± 0.082 (XGBoost, 10 seeds)
|
| 45 |
+
- type: roc_auc
|
| 46 |
+
value: 0.812
|
| 47 |
+
name: Multi-seed ROC-AUC mean ± 0.048 (XGBoost, 10 seeds)
|
| 48 |
+
- type: roc_auc
|
| 49 |
+
value: 0.6974
|
| 50 |
+
name: Test macro ROC-AUC OvR (MLP, seed 42)
|
| 51 |
+
- type: accuracy
|
| 52 |
+
value: 0.6000
|
| 53 |
+
name: Test accuracy (MLP, seed 42)
|
| 54 |
+
- type: f1
|
| 55 |
+
value: 0.5914
|
| 56 |
+
name: Test macro-F1 (MLP, seed 42)
|
| 57 |
+
---
|
| 58 |
+
|
| 59 |
+
# CYB006 Baseline Classifier
|
| 60 |
+
|
| 61 |
+
**User-risk-tier classifier trained on the CYB006 synthetic login
|
| 62 |
+
activity sample. Predicts which of 3 risk tiers (`low` / `medium` /
|
| 63 |
+
`high`) a user belongs to, from per-user identity aggregates and
|
| 64 |
+
non-leaky session aggregates. ALSO ships a leakage diagnostic for the
|
| 65 |
+
CYB006 dataset README's stated headline use case (threat-actor tier classification).**
|
| 66 |
+
|
| 67 |
+
> **Read this first.** This repo ships two artifacts: (1) a working
|
| 68 |
+
> baseline classifier for `user_risk_tier` (the primary product), and
|
| 69 |
+
> (2) a separate diagnostic file (`leakage_diagnostic.json`)
|
| 70 |
+
> documenting why the CYB006 dataset README's stated headline use case — 4-class
|
| 71 |
+
> threat-actor tier classification — is not a usable ML task on the
|
| 72 |
+
> sample dataset. Both matter; the diagnostic is required reading for
|
| 73 |
+
> anyone evaluating CYB006 for a threat-detection product.
|
| 74 |
+
|
| 75 |
+
## Model overview
|
| 76 |
+
|
| 77 |
+
| Property | Value |
|
| 78 |
+
|---|---|
|
| 79 |
+
| Primary task | 3-class user_risk_tier classification (`low`/`medium`/`high`) |
|
| 80 |
+
| Secondary artifact | `leakage_diagnostic.json` — audit of threat-actor detection on this sample |
|
| 81 |
+
| Training data | `xpertsystems/cyb006-sample` (200 users × 25 sessions = 5,000 sessions) |
|
| 82 |
+
| Models | XGBoost + PyTorch MLP |
|
| 83 |
+
| Input features | 34 (per-user aggregates + session aggregates + engineered) |
|
| 84 |
+
| Split | **Stratified by user_risk_tier** (this is a user-level task, n=200) |
|
| 85 |
+
| Validation | Single seed (artifact) + multi-seed aggregate across 10 seeds |
|
| 86 |
+
| License | CC-BY-NC-4.0 (matches dataset) |
|
| 87 |
+
| Status | Reference baseline + structural-leakage diagnostic |
|
| 88 |
+
|
| 89 |
+
## Why this task — and why not threat-actor classification?
|
| 90 |
+
|
| 91 |
+
The CYB006 README's first suggested use case is "training **account
|
| 92 |
+
takeover (ATO) detection** models" and second is "**threat-actor tier
|
| 93 |
+
classification** — 4-class with realistic class imbalance". We piloted
|
| 94 |
+
the threat-actor target first and discovered that the sample dataset
|
| 95 |
+
contains **structural distributional non-overlap** between threat-actor
|
| 96 |
+
and legitimate session populations across at least six independent
|
| 97 |
+
feature groups:
|
| 98 |
+
|
| 99 |
+
| Oracle feature | Actor range / value | Non-actor range / value |
|
| 100 |
+
|---|---|---|
|
| 101 |
+
| `velocity_anomaly_score` | [0.52, 0.82] | [0.00, 0.25] — **zero overlap** |
|
| 102 |
+
| `session_timestamp_utc` | [6,417, 1,440,062] | [1,445,187, 18,000,137] — **disjoint windows** |
|
| 103 |
+
| `credential_attempt_count` | [1, 59] (mean 12.9) | [1, 2] (mean 1.07) |
|
| 104 |
+
| `login_outcome` | `success_normal` only occurs for non-actors; `failure_account_locked` / `account_takeover_confirmed` / `session_hijacked` / `success_anomalous` only occur for actors |
|
| 105 |
+
| `geo_country_code` | `KP`, `XX`, `CN`, `BY` appear only for actors |
|
| 106 |
+
| `device_trust_level` | `trusted_managed` / `compliant_enrolled` appear only for non-actors |
|
| 107 |
+
|
| 108 |
+
As a consequence, **plain XGBoost achieves 100% test accuracy on
|
| 109 |
+
threat-actor binary detection (any-actor vs none) across every random
|
| 110 |
+
seed**, and stays at **97% accuracy and AUC 0.99 even with all six
|
| 111 |
+
oracle feature groups dropped** (37 columns excluded). This is not a
|
| 112 |
+
useful ML benchmark; it's a property of the synthetic generator. Real
|
| 113 |
+
identity-security telemetry has substantial overlap between threat
|
| 114 |
+
and legitimate behaviour, with state-of-the-art detection systems
|
| 115 |
+
operating at AUC 0.7–0.9, not 1.0.
|
| 116 |
+
|
| 117 |
+
The diagnostic finding is documented quantitatively in
|
| 118 |
+
[`leakage_diagnostic.json`](./leakage_diagnostic.json) and summarised
|
| 119 |
+
in the [Leakage diagnostic](#leakage-diagnostic) section below.
|
| 120 |
+
|
| 121 |
+
We therefore pivoted to **`user_risk_tier` (3-class user-level
|
| 122 |
+
classification)** as the primary baseline target. This task:
|
| 123 |
+
|
| 124 |
+
- Has **overlapping per-tier feature distributions** — no oracle features
|
| 125 |
+
- Carries **modest real signal** (seed-42 test accuracy 0.67 vs. a 0.57 majority-class baseline; macro ROC-AUC 0.80)
|
| 126 |
+
- Targets a legitimate use case (the CYB006 dataset README lists "Insider threat scoring with composite behavioral indicators")
|
| 127 |
+
- Demonstrates honest ML rigor on the dataset
|
| 128 |
+
|
| 129 |
+
Two model artifacts are published. They are designed to be used together — disagreement is a useful triage signal:
|
| 130 |
+
|
| 131 |
+
- `model_xgb.json` — gradient-boosted trees, primary recommendation
|
| 132 |
+
- `model_mlp.safetensors` — PyTorch MLP in SafeTensors format
|
| 133 |
+
|
| 134 |
+
## Quick start
|
| 135 |
+
|
| 136 |
+
```bash
|
| 137 |
+
pip install xgboost scikit-learn torch safetensors pandas huggingface_hub
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
```python
|
| 141 |
+
from huggingface_hub import hf_hub_download
|
| 142 |
+
import json, numpy as np, torch, xgboost as xgb
|
| 143 |
+
from safetensors.torch import load_file
|
| 144 |
+
|
| 145 |
+
REPO = "xpertsystems/cyb006-baseline-classifier"
|
| 146 |
+
|
| 147 |
+
paths = {n: hf_hub_download(REPO, n) for n in [
|
| 148 |
+
"model_xgb.json", "model_mlp.safetensors",
|
| 149 |
+
"feature_engineering.py", "feature_meta.json", "feature_scaler.json",
|
| 150 |
+
]}
|
| 151 |
+
|
| 152 |
+
import sys, os
|
| 153 |
+
sys.path.insert(0, os.path.dirname(paths["feature_engineering.py"]))
|
| 154 |
+
from feature_engineering import (
|
| 155 |
+
transform_single, load_meta, INT_TO_LABEL,
|
| 156 |
+
compute_session_aggregates_for_user
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
meta = load_meta(paths["feature_meta.json"])
|
| 160 |
+
xgb_model = xgb.XGBClassifier(); xgb_model.load_model(paths["model_xgb.json"])
|
| 161 |
+
|
| 162 |
+
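# Compose a per-user record from a user_risk_summary row + session aggregates (user_summary_row and user_sessions are not defined in this snippet — load them first; see inference_example.ipynb)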
|
| 163 |
+
user_record = user_summary_row.to_dict()
|
| 164 |
+
user_record.update(compute_session_aggregates_for_user(user_sessions))
|
| 165 |
+
|
| 166 |
+
X = transform_single(user_record, meta)
|
| 167 |
+
proba = xgb_model.predict_proba(X)[0]
|
| 168 |
+
print(INT_TO_LABEL[int(np.argmax(proba))])
|
| 169 |
+
```
|
| 170 |
+
|
| 171 |
+
See [`inference_example.ipynb`](./inference_example.ipynb) for the full
|
| 172 |
+
copy-paste demo.
|
| 173 |
+
|
| 174 |
+
## Training data
|
| 175 |
+
|
| 176 |
+
Trained on the public sample of CYB006, 200 per-user rows from
|
| 177 |
+
`user_risk_summary.csv` enriched with per-user session aggregates
|
| 178 |
+
computed from `login_sessions.csv`:
|
| 179 |
+
|
| 180 |
+
| Tier | Users | Class share |
|
| 181 |
+
|---|---:|---:|
|
| 182 |
+
| `low` | 114 | 57% |
|
| 183 |
+
| `medium` | 47 | 23.5% |
|
| 184 |
+
| `high` | 39 | 19.5% |
|
| 185 |
+
|
| 186 |
+
The CYB006 README claims a 4-tier scheme (`low`/`medium`/`high`/`critical`).
|
| 187 |
+
The sample data contains only 3 — there is no `critical` tier present.
|
| 188 |
+
|
| 189 |
+
### Stratified split
|
| 190 |
+
|
| 191 |
+
This is a **user-level** task (one row per user, 200 users total).
|
| 192 |
+
Group-aware splitting does not apply since there is no
|
| 193 |
+
many-rows-per-group structure to leak. We use
|
| 194 |
+
**StratifiedShuffleSplit** (nested 70/15/15) to preserve the 3-tier
|
| 195 |
+
class distribution across folds:
|
| 196 |
+
|
| 197 |
+
| Fold | Users |
|
| 198 |
+
|---|---:|
|
| 199 |
+
| Train | 139 |
|
| 200 |
+
| Validation | 31 |
|
| 201 |
+
| Test | 30 |
|
| 202 |
+
|
| 203 |
+
Class imbalance is addressed with balanced class weights (supplied to XGBoost as per-row
|
| 204 |
+
`sample_weight`) and weighted cross-entropy (MLP).
|
| 205 |
+
|
| 206 |
+
## Feature pipeline
|
| 207 |
+
|
| 208 |
+
The bundled `feature_engineering.py` is the canonical feature recipe.
|
| 209 |
+
34 features survive after encoding, drawn from:
|
| 210 |
+
|
| 211 |
+
- **Per-user numeric** (14, from `user_risk_summary.csv`): `total_login_attempts`, `successful_logins`, `failed_logins`, `mfa_failures`, `impossible_travel_events`, `lateral_hop_count`, `privilege_escalations`, `account_lockout_count`, `geo_dispersion_score`, `login_velocity_score`, `session_anomaly_rate`, `ueba_alert_count`, `overall_identity_risk_score`, `insider_threat_indicator_score`
|
| 212 |
+
- **Per-user categorical** (1, one-hot): `peak_privilege_level_accessed` (6 values)
|
| 213 |
+
- **Session aggregates** (8, derived from `login_sessions.csv`): `avg_session_duration_seconds`, `avg_mfa_response_latency_ms`, `avg_geo_anomaly_score`, `max_geo_anomaly_score`, `frac_impossible_travel`, `n_unique_countries`, `n_unique_devices`, `n_unique_applications`
|
| 214 |
+
- **Engineered** (6): `failed_login_rate`, `mfa_failure_rate`, `ueba_alerts_per_session`, `hops_per_escalation`, `geo_velocity_composite`, `composite_anomaly_score`
|
| 215 |
+
|
| 216 |
+
### Leakage exclusions
|
| 217 |
+
|
| 218 |
+
Three columns from `user_risk_summary.csv` are dropped to avoid contamination:
|
| 219 |
+
- `threat_actor_flag` — perfect oracle for `tier='high'` subset (only high-tier users can be threat actors)
|
| 220 |
+
- `account_takeover_flag` — 2 positive cases out of 200 (1%); too sparse and oracle-prone
|
| 221 |
+
- `credential_attack_victim_flag` — 1 positive case out of 200 (0.5%); same issue
|
| 222 |
+
|
| 223 |
+
Four columns from `login_sessions.csv` are NOT aggregated into session
|
| 224 |
+
features because they exhibited the structural non-overlap documented
|
| 225 |
+
in [Leakage diagnostic](#leakage-diagnostic):
|
| 226 |
+
- `velocity_anomaly_score`, `session_timestamp_utc`, `credential_attempt_count`, `login_outcome`
|
| 227 |
+
|
| 228 |
+
## Evaluation
|
| 229 |
+
|
| 230 |
+
### Test-set metrics, seed 42 (n = 30 disjoint users)
|
| 231 |
+
|
| 232 |
+
**XGBoost** (the published `model_xgb.json` artifact)
|
| 233 |
+
|
| 234 |
+
| Metric | Value |
|
| 235 |
+
|---|---:|
|
| 236 |
+
| Macro ROC-AUC (OvR) | **0.8017** |
|
| 237 |
+
| Accuracy | **0.6667** |
|
| 238 |
+
| Macro-F1 | 0.6454 |
|
| 239 |
+
| Weighted-F1 | 0.6634 |
|
| 240 |
+
|
| 241 |
+
**MLP** (the published `model_mlp.safetensors` artifact)
|
| 242 |
+
|
| 243 |
+
| Metric | Value |
|
| 244 |
+
|---|---:|
|
| 245 |
+
| Macro ROC-AUC (OvR) | 0.6974 |
|
| 246 |
+
| Accuracy | 0.6000 |
|
| 247 |
+
| Macro-F1 | 0.5914 |
|
| 248 |
+
| Weighted-F1 | 0.6068 |
|
| 249 |
+
|
| 250 |
+
### Multi-seed robustness (XGBoost, 10 seeds)
|
| 251 |
+
|
| 252 |
+
| Metric | Mean | Std | Min | Max |
|
| 253 |
+
|---|---:|---:|---:|---:|
|
| 254 |
+
| Accuracy | 0.700 | 0.082 | 0.533 | 0.867 |
|
| 255 |
+
| Macro-F1 | 0.638 | 0.093 | 0.445 | 0.814 |
|
| 256 |
+
| Macro ROC-AUC OvR | 0.812 | 0.048 | 0.738 | 0.877 |
|
| 257 |
+
|
| 258 |
+
Full per-seed results in [`multi_seed_results.json`](./multi_seed_results.json).
|
| 259 |
+
With only 30 test users per seed, single-seed accuracy varies materially
|
| 260 |
+
(0.53–0.87 across seeds). **ROC-AUC 0.812 ± 0.048 is the more reliable
|
| 261 |
+
performance estimate.** All 10 seeds yield all 3 tiers in the test
|
| 262 |
+
fold thanks to stratification.
|
| 263 |
+
|
| 264 |
+
### Per-class F1 (seed 42)
|
| 265 |
+
|
| 266 |
+
| Tier | Class share | XGBoost F1 | MLP F1 |
|
| 267 |
+
|---|---:|---:|---:|
|
| 268 |
+
| `low` | 57% | 0.727 | 0.647 |
|
| 269 |
+
| `medium` | 23.5% | 0.286 | 0.400 |
|
| 270 |
+
| `high` | 19.5% | **0.923** | 0.727 |
|
| 271 |
+
|
| 272 |
+
The model performs best on `high` (the most behaviourally distinct
|
| 273 |
+
tier — high failed-login rates, frequent impossible travel, elevated
|
| 274 |
+
anomaly scores) and `low` (the majority class). The `medium` tier is
|
| 275 |
+
hardest, which is the expected behaviour for a 3-tier ordinal task —
|
| 276 |
+
mid-class samples sit between two boundaries and pick up confusion
|
| 277 |
+
from both sides.
|
| 278 |
+
|
| 279 |
+
### Ablation: which feature groups matter
|
| 280 |
+
|
| 281 |
+
| Configuration | Accuracy | Macro-F1 | Δ accuracy |
|
| 282 |
+
|---|---:|---:|---:|
|
| 283 |
+
| Full feature set (published) | 0.6667 | 0.6454 | — |
|
| 284 |
+
| No user aggregates (count features) | 0.5333 | 0.4586 | **−0.1333** |
|
| 285 |
+
| No risk scores | 0.5667 | 0.5300 | −0.1000 |
|
| 286 |
+
| No engineered features | 0.5667 | 0.5444 | −0.1000 |
|
| 287 |
+
| No session aggregates | 0.7000 | 0.6130 | +0.0333 |
|
| 288 |
+
|
| 289 |
+
Findings:
|
| 290 |
+
|
| 291 |
+
1. **User-level count features matter most** (failed logins, lateral
|
| 292 |
+
hops, MFA failures). Dropping them costs 13 pp accuracy.
|
| 293 |
+
2. **Risk scores and engineered features each contribute ~10 pp.**
|
| 294 |
+
With only 139 training users, the trees can't fully recover
|
| 295 |
+
engineered composites from raw inputs.
|
| 296 |
+
3. **Session aggregates slightly hurt accuracy** in seed 42 (gain
|
| 297 |
+
3 pp when dropped). With n=200, additional features can crowd
|
| 298 |
+
the small data; the trees do better with fewer signals when
|
| 299 |
+
each one is information-dense. Session aggregates are kept in
|
| 300 |
+
the published pipeline because they help on most other seeds.
|
| 301 |
+
|
| 302 |
+
### Architecture
|
| 303 |
+
|
| 304 |
+
**XGBoost:** multi-class gradient boosting (`multi:softprob`, 3 classes),
|
| 305 |
+
`hist` tree method, class-balanced sample weights, early stopping on
|
| 306 |
+
validation mlogloss.
|
| 307 |
+
|
| 308 |
+
**MLP:** `34 → 128 → 64 → 3`, each hidden layer followed by `BatchNorm1d`
|
| 309 |
+
→ `ReLU` → `Dropout(0.3)`, weighted cross-entropy loss, AdamW optimizer,
|
| 310 |
+
early stopping on validation macro-F1.
|
| 311 |
+
|
| 312 |
+
Training hyperparameters are held internally by XpertSystems.
|
| 313 |
+
|
| 314 |
+
## Leakage diagnostic
|
| 315 |
+
|
| 316 |
+
This is the most important section of the model card. The full
|
| 317 |
+
diagnostic is in [`leakage_diagnostic.json`](./leakage_diagnostic.json).
|
| 318 |
+
Summary:
|
| 319 |
+
|
| 320 |
+
**Setup:** Train an XGBoost binary classifier to predict
|
| 321 |
+
`threat_actor_capability_tier != 'none'` from per-session features.
|
| 322 |
+
Use group-aware split by `user_id` (15% test = 30 disjoint users).
|
| 323 |
+
Cumulatively drop suspected oracle feature groups and re-evaluate.
|
| 324 |
+
|
| 325 |
+
| Configuration | n_features | Accuracy | ROC-AUC |
|
| 326 |
+
|---|---:|---:|---:|
|
| 327 |
+
| Full feature set | 166 | **1.0000** | **1.0000** |
|
| 328 |
+
| − behavioural oracles (velocity, timestamp, credential count) | 163 | 0.9991 | 1.0000 |
|
| 329 |
+
| − login_outcome | 154 | 0.9982 | 1.0000 |
|
| 330 |
+
| − geo_country_code | 138 | 0.9987 | 1.0000 |
|
| 331 |
+
| − device_trust_level | 133 | 0.9982 | 0.9999 |
|
| 332 |
+
| − user_risk_tier | 130 | 0.9978 | 0.9996 |
|
| 333 |
+
| − geo_anomaly_score | 129 | 0.9707 | 0.9897 |
|
| 334 |
+
|
| 335 |
+
**Even after dropping six oracle feature groups (37 columns), the
|
| 336 |
+
model still achieves 97% test accuracy and AUC 0.99.** The leakage
|
| 337 |
+
is not localised to a few suspect features; it is distributed across
|
| 338 |
+
the entire feature space because the synthetic generator produces
|
| 339 |
+
threat-actor sessions that are anomalous on every dimension
|
| 340 |
+
simultaneously, with no overlap into legitimate behaviour.
|
| 341 |
+
|
| 342 |
+
### Recommendation to dataset author
|
| 343 |
+
|
| 344 |
+
For threat-actor detection to be a useful ML benchmark on this
|
| 345 |
+
dataset, the next generator version should introduce **distributional
|
| 346 |
+
overlap** between threat-actor and legitimate session populations
|
| 347 |
+
across all anomaly indicators:
|
| 348 |
+
|
| 349 |
+
- `velocity_anomaly_score`: extend non-actor distribution into [0.0, 0.5] and shrink actor to [0.3, 0.9] for substantial overlap in [0.3, 0.5]
|
| 350 |
+
- `session_timestamp_utc`: interleave threat-actor and legitimate sessions across the same time window
|
| 351 |
+
- `credential_attempt_count`: allow some non-actor users to exhibit elevated counts (mistyped passwords, MFA fatigue)
|
| 352 |
+
- `login_outcome`: allow `failure_account_locked` and `success_anomalous` for some legitimate sessions
|
| 353 |
+
- `geo_country_code`: include a baseline frequency of risky-country logins among legitimate users (business travel, contractors)
|
| 354 |
+
- `device_trust_level`: allow threat actors to occasionally use compliant devices (token theft scenarios)
|
| 355 |
+
|
| 356 |
+
Target operating regime: real-world detection AUC 0.7–0.9, not 1.0.
|
| 357 |
+
|
| 358 |
+
### What this means for buyers
|
| 359 |
+
|
| 360 |
+
If you're evaluating CYB006 for a threat-detection product, you should
|
| 361 |
+
know that:
|
| 362 |
+
|
| 363 |
+
- **The sample dataset cannot be used to honestly benchmark threat-actor
|
| 364 |
+
detection models.** Even a plain, untuned model will score ~100%,
|
| 365 |
+
which doesn't differentiate good detection systems from bad ones.
|
| 366 |
+
- **The user-risk-tier task shipped in this baseline is a legitimate
|
| 367 |
+
ML benchmark on the sample data.** It generalises modestly (AUC 0.81)
|
| 368 |
+
and is the right starting point for evaluating insider-threat
|
| 369 |
+
scoring on the sample.
|
| 370 |
+
- **The full ~1.1M-row CYB006 product may or may not have the same
|
| 371 |
+
structural property.** Confirm with XpertSystems before committing
|
| 372 |
+
to a threat-detection use case.
|
| 373 |
+
|
| 374 |
+
## Limitations
|
| 375 |
+
|
| 376 |
+
**This is a baseline reference, not a production identity-security system.**
|
| 377 |
+
|
| 378 |
+
1. **Small held-out test fold (n=30).** With only 30 test users per
|
| 379 |
+
seed, single-seed metrics swing 0.53–0.87 in accuracy. The
|
| 380 |
+
multi-seed ROC-AUC of 0.81 ± 0.05 is the reliable estimate. The
|
| 381 |
+
full ~1.1M-row product would tighten the confidence interval
|
| 382 |
+
substantially.
|
| 383 |
+
|
| 384 |
+
2. **The `medium` tier is harder than the others.** F1 0.29 on
|
| 385 |
+
`medium` (vs 0.92 on `high`) is expected — ordinal middle classes
|
| 386 |
+
are typically the hardest under a flat-classification setup.
|
| 387 |
+
|
| 388 |
+
3. **MLP weaker than XGBoost.** AUC 0.70 vs 0.80. With only 139
|
| 389 |
+
training users, the MLP cannot match boosted trees on tabular data.
|
| 390 |
+
|
| 391 |
+
4. **Threat-actor detection task is not usable on this sample.**
|
| 392 |
+
See [Leakage diagnostic](#leakage-diagnostic) above.
|
| 393 |
+
|
| 394 |
+
5. **Synthetic-vs-real transfer.** The dataset is synthetic and
|
| 395 |
+
calibrated to identity-security benchmarks (Microsoft Digital
|
| 396 |
+
Defense Report, Okta Customer Identity Trends, Verizon DBIR, CISA
|
| 397 |
+
Joint Advisories, Mandiant M-Trends, MITRE ATT&CK Evaluations).
|
| 398 |
+
Real identity telemetry has different noise characteristics; do
|
| 399 |
+
not assume metrics transfer.
|
| 400 |
+
|
| 401 |
+
6. **3 tiers, not 4.** The dataset README lists `low`/`medium`/`high`/`critical`
|
| 402 |
+
but the data contains only 3. If you need 4-class support, wait
|
| 403 |
+
for a regenerated sample.
|
| 404 |
+
|
| 405 |
+
## Notes on dataset schema
|
| 406 |
+
|
| 407 |
+
The CYB006 sample dataset README describes some fields differently
|
| 408 |
+
from the actual schema. The model was trained on the actual schema;
|
| 409 |
+
this note helps buyers reconcile what they read with what they receive.
|
| 410 |
+
|
| 411 |
+
| What the README says | What the data actually contains |
|
| 412 |
+
|---|---|
|
| 413 |
+
| `session_phase` has 6 values | **All 5,000 rows have `session_phase = session_termination`** — the field is constant. There is no usable session-phase target. |
|
| 414 |
+
| `login_outcome` has 4 values (`success / failed / mfa_required / blocked`) | 9 values: `success_normal`, `failure_bad_password`, `failure_account_locked`, `failure_mfa_rejected`, `failure_device_untrusted`, `failure_geo_blocked`, `success_anomalous`, `account_takeover_confirmed`, `session_hijacked` |
|
| 415 |
+
| 4 actor tiers | 5 values: 4 tier labels + `none` (92% of rows have `none`) |
|
| 416 |
+
| `mfa_challenge_type` has 5 values | 7: adds `authenticator_app`, `hardware_token`, `voice_call` |
|
| 417 |
+
| `authentication_method` has 4 values | 5: no `api_key`; adds `password_plus_mfa`, `phishing_resistant_fido2` |
|
| 418 |
+
| `user_risk_tier` has 4 values (`low/medium/high/critical`) | 3 values: no `critical` |
|
| 419 |
+
| `session_timestamp_utc` is an ISO timestamp string | It is an integer |
|
| 420 |
+
| `user_risk_summary.csv` columns listed | Adds `peak_privilege_level_accessed`, `credential_attack_victim_flag` (not in README) |
|
| 421 |
+
|
| 422 |
+
None of these affects model correctness — the feature pipeline uses
|
| 423 |
+
the actual column names. If you build your own pipeline against the
|
| 424 |
+
dataset, use the actual columns.
|
| 425 |
+
|
| 426 |
+
## Intended use
|
| 427 |
+
|
| 428 |
+
- **Evaluating fit** of the CYB006 dataset for your insider-threat
|
| 429 |
+
or user-risk-scoring research
|
| 430 |
+
- **Baseline reference** for new model architectures
|
| 431 |
+
- **Reference example of structural-leakage diagnostics** in synthetic
|
| 432 |
+
cybersecurity datasets — the diagnostic methodology in
|
| 433 |
+
`leakage_diagnostic.json` is reusable (the training script itself is not published)
|
| 434 |
+
- **Feature engineering reference** for per-user identity aggregates
|
| 435 |
+
|
| 436 |
+
## Out-of-scope use
|
| 437 |
+
|
| 438 |
+
- Production identity-security detection on real telemetry
|
| 439 |
+
- Threat-actor attribution (this baseline does not address that task; see why above)
|
| 440 |
+
- Any operational security or law-enforcement decision
|
| 441 |
+
|
| 442 |
+
## Reproducibility
|
| 443 |
+
|
| 444 |
+
Outputs above were produced with `seed = 42` (published artifact),
|
| 445 |
+
nested `StratifiedShuffleSplit` (70/15/15 by user_risk_tier), on the
|
| 446 |
+
published sample (`xpertsystems/cyb006-sample`, version 1.0.0,
|
| 447 |
+
generated 2026-05-16). The feature pipeline in `feature_engineering.py`
|
| 448 |
+
is deterministic and the trained weights in this repo correspond
|
| 449 |
+
exactly to the metrics above.
|
| 450 |
+
|
| 451 |
+
Multi-seed results (seeds 42, 7, 13, 17, 23, 31, 45, 99, 123, 200) in
|
| 452 |
+
`multi_seed_results.json` quantify the split-to-split variation (accuracy 0.53–0.87; macro ROC-AUC 0.74–0.88).
|
| 453 |
+
|
| 454 |
+
The training script itself is private to XpertSystems.
|
| 455 |
+
|
| 456 |
+
## Files in this repo
|
| 457 |
+
|
| 458 |
+
| File | Purpose |
|
| 459 |
+
|---|---|
|
| 460 |
+
| `model_xgb.json` | XGBoost weights (seed 42) |
|
| 461 |
+
| `model_mlp.safetensors` | PyTorch MLP weights (seed 42) |
|
| 462 |
+
| `feature_engineering.py` | Feature pipeline |
|
| 463 |
+
| `feature_meta.json` | Feature column order + categorical levels |
|
| 464 |
+
| `feature_scaler.json` | MLP input mean/std (XGBoost ignores) |
|
| 465 |
+
| `validation_results.json` | Per-class metrics, confusion matrix, architecture |
|
| 466 |
+
| `ablation_results.json` | Per-feature-group ablation |
|
| 467 |
+
| `multi_seed_results.json` | XGBoost metrics across 10 seeds |
|
| 468 |
+
| `leakage_diagnostic.json` | **Structural-leakage audit on threat-actor detection** |
|
| 469 |
+
| `inference_example.ipynb` | End-to-end inference demo notebook |
|
| 470 |
+
| `README.md` | This file |
|
| 471 |
+
|
| 472 |
+
## Contact and full product
|
| 473 |
+
|
| 474 |
+
The full **CYB006** dataset contains ~1.1 million rows across four
|
| 475 |
+
files, with 12 calibrated benchmark validation tests drawn from
|
| 476 |
+
authoritative identity security and threat intelligence sources
|
| 477 |
+
(Microsoft Digital Defense Report, Okta Customer Identity Trends,
|
| 478 |
+
Verizon DBIR, CISA Joint Advisories, Mandiant M-Trends, MITRE ATT&CK
|
| 479 |
+
Evaluations). The full XpertSystems.ai synthetic data catalogue spans
|
| 480 |
+
41 SKUs across Cybersecurity, Healthcare, Insurance & Risk, Oil & Gas,
|
| 481 |
+
and Materials & Energy.
|
| 482 |
+
|
| 483 |
+
- 📧 **pradeep@xpertsystems.ai**
|
| 484 |
+
- 🌐 **https://xpertsystems.ai**
|
| 485 |
+
- 🗂 Dataset: https://huggingface.co/datasets/xpertsystems/cyb006-sample
|
| 486 |
+
- 🤖 Companion models:
|
| 487 |
+
- https://huggingface.co/xpertsystems/cyb001-baseline-classifier (network traffic)
|
| 488 |
+
- https://huggingface.co/xpertsystems/cyb002-baseline-classifier (ATT&CK kill-chain)
|
| 489 |
+
- https://huggingface.co/xpertsystems/cyb003-baseline-classifier (malware execution phase)
|
| 490 |
+
- https://huggingface.co/xpertsystems/cyb004-baseline-classifier (phishing campaign phase)
|
| 491 |
+
- https://huggingface.co/xpertsystems/cyb005-baseline-classifier (ransomware actor-tier attribution)
|
| 492 |
+
|
| 493 |
+
## Citation
|
| 494 |
+
|
| 495 |
+
```bibtex
|
| 496 |
+
@misc{xpertsystems_cyb006_baseline_2026,
|
| 497 |
+
title = {CYB006 Baseline Classifier: XGBoost and MLP for User Risk Tier Classification, with Structural-Leakage Diagnostic on Threat-Actor Detection},
|
| 498 |
+
author = {XpertSystems.ai},
|
| 499 |
+
year = {2026},
|
| 500 |
+
url = {https://huggingface.co/xpertsystems/cyb006-baseline-classifier},
|
| 501 |
+
note = {Baseline reference model trained on xpertsystems/cyb006-sample}
|
| 502 |
+
}
|
| 503 |
+
```
|
ablation_results.json
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"purpose": "Quantify how much each feature group contributes to the headline XGBoost score. Identical architecture, same stratified split, with one feature group dropped at a time.",
|
| 3 |
+
"full_model_metrics": {
|
| 4 |
+
"model": "xgboost",
|
| 5 |
+
"accuracy": 0.6666666666666666,
|
| 6 |
+
"macro_f1": 0.6453546453546454,
|
| 7 |
+
"weighted_f1": 0.6634032634032633,
|
| 8 |
+
"per_class_f1": {
|
| 9 |
+
"low": 0.7272727272727273,
|
| 10 |
+
"medium": 0.2857142857142857,
|
| 11 |
+
"high": 0.9230769230769231
|
| 12 |
+
},
|
| 13 |
+
"confusion_matrix": {
|
| 14 |
+
"labels": [
|
| 15 |
+
"low",
|
| 16 |
+
"medium",
|
| 17 |
+
"high"
|
| 18 |
+
],
|
| 19 |
+
"matrix": [
|
| 20 |
+
[
|
| 21 |
+
12,
|
| 22 |
+
5,
|
| 23 |
+
0
|
| 24 |
+
],
|
| 25 |
+
[
|
| 26 |
+
4,
|
| 27 |
+
2,
|
| 28 |
+
1
|
| 29 |
+
],
|
| 30 |
+
[
|
| 31 |
+
0,
|
| 32 |
+
0,
|
| 33 |
+
6
|
| 34 |
+
]
|
| 35 |
+
]
|
| 36 |
+
},
|
| 37 |
+
"macro_roc_auc_ovr": 0.8016919142238835
|
| 38 |
+
},
|
| 39 |
+
"ablations": {
|
| 40 |
+
"no_session_aggregates": {
|
| 41 |
+
"n_features": 26,
|
| 42 |
+
"dropped_count": 8,
|
| 43 |
+
"metrics": {
|
| 44 |
+
"model": "xgboost_no_session_aggregates",
|
| 45 |
+
"accuracy": 0.7,
|
| 46 |
+
"macro_f1": 0.6129870129870131,
|
| 47 |
+
"weighted_f1": 0.6671861471861472,
|
| 48 |
+
"per_class_f1": {
|
| 49 |
+
"low": 0.8,
|
| 50 |
+
"medium": 0.18181818181818182,
|
| 51 |
+
"high": 0.8571428571428571
|
| 52 |
+
},
|
| 53 |
+
"confusion_matrix": {
|
| 54 |
+
"labels": [
|
| 55 |
+
"low",
|
| 56 |
+
"medium",
|
| 57 |
+
"high"
|
| 58 |
+
],
|
| 59 |
+
"matrix": [
|
| 60 |
+
[
|
| 61 |
+
14,
|
| 62 |
+
3,
|
| 63 |
+
0
|
| 64 |
+
],
|
| 65 |
+
[
|
| 66 |
+
4,
|
| 67 |
+
1,
|
| 68 |
+
2
|
| 69 |
+
],
|
| 70 |
+
[
|
| 71 |
+
0,
|
| 72 |
+
0,
|
| 73 |
+
6
|
| 74 |
+
]
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
"macro_roc_auc_ovr": 0.7625392687732843
|
| 78 |
+
},
|
| 79 |
+
"delta_accuracy": -0.033333333333333326,
|
| 80 |
+
"delta_macro_f1": 0.03236763236763229
|
| 81 |
+
},
|
| 82 |
+
"no_user_aggregates": {
|
| 83 |
+
"n_features": 26,
|
| 84 |
+
"dropped_count": 8,
|
| 85 |
+
"metrics": {
|
| 86 |
+
"model": "xgboost_no_user_aggregates",
|
| 87 |
+
"accuracy": 0.5333333333333333,
|
| 88 |
+
"macro_f1": 0.45864045864045866,
|
| 89 |
+
"weighted_f1": 0.5130221130221131,
|
| 90 |
+
"per_class_f1": {
|
| 91 |
+
"low": 0.6486486486486487,
|
| 92 |
+
"medium": 0.0,
|
| 93 |
+
"high": 0.7272727272727273
|
| 94 |
+
},
|
| 95 |
+
"confusion_matrix": {
|
| 96 |
+
"labels": [
|
| 97 |
+
"low",
|
| 98 |
+
"medium",
|
| 99 |
+
"high"
|
| 100 |
+
],
|
| 101 |
+
"matrix": [
|
| 102 |
+
[
|
| 103 |
+
12,
|
| 104 |
+
4,
|
| 105 |
+
1
|
| 106 |
+
],
|
| 107 |
+
[
|
| 108 |
+
7,
|
| 109 |
+
0,
|
| 110 |
+
0
|
| 111 |
+
],
|
| 112 |
+
[
|
| 113 |
+
1,
|
| 114 |
+
1,
|
| 115 |
+
4
|
| 116 |
+
]
|
| 117 |
+
]
|
| 118 |
+
},
|
| 119 |
+
"macro_roc_auc_ovr": 0.7042183744549474
|
| 120 |
+
},
|
| 121 |
+
"delta_accuracy": 0.1333333333333333,
|
| 122 |
+
"delta_macro_f1": 0.18671418671418671
|
| 123 |
+
},
|
| 124 |
+
"no_risk_scores": {
|
| 125 |
+
"n_features": 28,
|
| 126 |
+
"dropped_count": 6,
|
| 127 |
+
"metrics": {
|
| 128 |
+
"model": "xgboost_no_risk_scores",
|
| 129 |
+
"accuracy": 0.5666666666666667,
|
| 130 |
+
"macro_f1": 0.5300213675213675,
|
| 131 |
+
"weighted_f1": 0.5745405982905983,
|
| 132 |
+
"per_class_f1": {
|
| 133 |
+
"low": 0.6875,
|
| 134 |
+
"medium": 0.13333333333333333,
|
| 135 |
+
"high": 0.7692307692307693
|
| 136 |
+
},
|
| 137 |
+
"confusion_matrix": {
|
| 138 |
+
"labels": [
|
| 139 |
+
"low",
|
| 140 |
+
"medium",
|
| 141 |
+
"high"
|
| 142 |
+
],
|
| 143 |
+
"matrix": [
|
| 144 |
+
[
|
| 145 |
+
11,
|
| 146 |
+
6,
|
| 147 |
+
0
|
| 148 |
+
],
|
| 149 |
+
[
|
| 150 |
+
4,
|
| 151 |
+
1,
|
| 152 |
+
2
|
| 153 |
+
],
|
| 154 |
+
[
|
| 155 |
+
0,
|
| 156 |
+
1,
|
| 157 |
+
5
|
| 158 |
+
]
|
| 159 |
+
]
|
| 160 |
+
},
|
| 161 |
+
"macro_roc_auc_ovr": 0.7397649416511309
|
| 162 |
+
},
|
| 163 |
+
"delta_accuracy": 0.09999999999999998,
|
| 164 |
+
"delta_macro_f1": 0.11533327783327785
|
| 165 |
+
},
|
| 166 |
+
"no_engineered": {
|
| 167 |
+
"n_features": 28,
|
| 168 |
+
"dropped_count": 6,
|
| 169 |
+
"metrics": {
|
| 170 |
+
"model": "xgboost_no_engineered",
|
| 171 |
+
"accuracy": 0.5666666666666667,
|
| 172 |
+
"macro_f1": 0.5444444444444444,
|
| 173 |
+
"weighted_f1": 0.5755555555555555,
|
| 174 |
+
"per_class_f1": {
|
| 175 |
+
"low": 0.6666666666666666,
|
| 176 |
+
"medium": 0.13333333333333333,
|
| 177 |
+
"high": 0.8333333333333334
|
| 178 |
+
},
|
| 179 |
+
"confusion_matrix": {
|
| 180 |
+
"labels": [
|
| 181 |
+
"low",
|
| 182 |
+
"medium",
|
| 183 |
+
"high"
|
| 184 |
+
],
|
| 185 |
+
"matrix": [
|
| 186 |
+
[
|
| 187 |
+
11,
|
| 188 |
+
6,
|
| 189 |
+
0
|
| 190 |
+
],
|
| 191 |
+
[
|
| 192 |
+
5,
|
| 193 |
+
1,
|
| 194 |
+
1
|
| 195 |
+
],
|
| 196 |
+
[
|
| 197 |
+
0,
|
| 198 |
+
1,
|
| 199 |
+
5
|
| 200 |
+
]
|
| 201 |
+
]
|
| 202 |
+
},
|
| 203 |
+
"macro_roc_auc_ovr": 0.7972402822147068
|
| 204 |
+
},
|
| 205 |
+
"delta_accuracy": 0.09999999999999998,
|
| 206 |
+
"delta_macro_f1": 0.10091020091020098
|
| 207 |
+
}
|
| 208 |
+
}
|
| 209 |
+
}
|
feature_engineering.py
ADDED
|
@@ -0,0 +1,374 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
feature_engineering.py
|
| 3 |
+
======================
|
| 4 |
+
|
| 5 |
+
Feature pipeline for the CYB006 baseline classifier.
|
| 6 |
+
|
| 7 |
+
Predicts `user_risk_tier` (3-class: low / medium / high) from per-user
|
| 8 |
+
identity aggregates on the CYB006 sample dataset.
|
| 9 |
+
|
| 10 |
+
CSV inputs:
|
| 11 |
+
user_risk_summary.csv (primary, per-user aggregates, 200 rows)
|
| 12 |
+
login_sessions.csv (per-session telemetry, joined as
|
| 13 |
+
per-user behavioural aggregates)
|
| 14 |
+
identity_topology.csv (identity domain registry; reserved for
|
| 15 |
+
future work - no direct user join key)
|
| 16 |
+
auth_events.csv (discrete event log; reserved for
|
| 17 |
+
future work)
|
| 18 |
+
|
| 19 |
+
Target classes (3):
|
| 20 |
+
low, medium, high
|
| 21 |
+
|
| 22 |
+
Why this task instead of threat_actor_capability_tier
|
| 23 |
+
-----------------------------------------------------
|
| 24 |
+
The CYB006 README lists "threat-actor tier classification (4-class)" as
|
| 25 |
+
its primary suggested use case. We piloted that target first and found
|
| 26 |
+
the sample dataset has STRUCTURAL DETERMINISM: every actor-tier signal
|
| 27 |
+
in the data (velocity_anomaly_score, session_timestamp, credential
|
| 28 |
+
attempt count, login outcome, geo country code, device trust level,
|
| 29 |
+
user risk tier itself, geo anomaly score) carries non-overlapping
|
| 30 |
+
distributions between threat and legitimate sessions. As a result, a
|
| 31 |
+
plain XGBoost achieves 100% test accuracy on threat-actor binary
|
| 32 |
+
classification across every random seed - and stays at 97-100%
|
| 33 |
+
accuracy even with all six oracle feature groups removed.
|
| 34 |
+
|
| 35 |
+
This is not a methodological failure; it's a property of how the
|
| 36 |
+
sample was generated. Real-world identity telemetry has substantial
|
| 37 |
+
overlap between threat-actor and legitimate behaviour. The model card
|
| 38 |
+
documents this as a diagnostic finding for the dataset author and a
|
| 39 |
+
caveat for buyers planning to train detection models on the sample.
|
| 40 |
+
|
| 41 |
+
For a working baseline that demonstrates honest ML on the dataset, we
|
| 42 |
+
shifted to predicting `user_risk_tier` from per-user aggregates. This
|
| 43 |
+
task has overlapping per-tier feature distributions, no oracle features,
|
| 44 |
+
and lifts modestly over majority baseline (acc 0.66 vs 0.57 majority).
|
| 45 |
+
|
| 46 |
+
Public API
|
| 47 |
+
----------
|
| 48 |
+
build_features(user_risk_path, sessions_path) -> (X, y, ids, meta)
|
| 49 |
+
transform_single(record, meta) -> np.ndarray
|
| 50 |
+
save_meta(meta, path) / load_meta(path)
|
| 51 |
+
|
| 52 |
+
License
|
| 53 |
+
-------
|
| 54 |
+
Ships with the public model on Hugging Face under CC-BY-NC-4.0,
|
| 55 |
+
matching the dataset license. See README.md.
|
| 56 |
+
"""
|
| 57 |
+
|
| 58 |
+
from __future__ import annotations
|
| 59 |
+
|
| 60 |
+
import json
|
| 61 |
+
from pathlib import Path
|
| 62 |
+
from typing import Any
|
| 63 |
+
|
| 64 |
+
import numpy as np
|
| 65 |
+
import pandas as pd
|
| 66 |
+
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
# Label space
|
| 69 |
+
# ---------------------------------------------------------------------------
|
| 70 |
+
|
| 71 |
+
# Ordered low -> high. Note: CYB006 README claims a 4th tier 'critical' but
|
| 72 |
+
# the sample data contains only 3 (low, medium, high).
|
| 73 |
+
LABEL_ORDER = ["low", "medium", "high"]
# Lookups between tier names and integer class codes; a tier's code is its
# position in LABEL_ORDER.
LABEL_TO_INT = dict(zip(LABEL_ORDER, range(len(LABEL_ORDER))))
INT_TO_LABEL = dict(enumerate(LABEL_ORDER))

# ---------------------------------------------------------------------------
# Identifier and target columns
# ---------------------------------------------------------------------------

ID_COLUMNS = ["user_id"]
TARGET_COLUMN = "user_risk_tier"
|
| 83 |
+
|
| 84 |
+
# ---------------------------------------------------------------------------
|
| 85 |
+
# Per-user numeric features from user_risk_summary.csv
|
| 86 |
+
# ---------------------------------------------------------------------------
|
| 87 |
+
# These are aggregate counts and continuous scores. They carry overlapping
|
| 88 |
+
# distributions across tiers - not oracles.
|
| 89 |
+
|
| 90 |
+
# Order is significant: it fixes the column order of the feature matrix.
USER_NUMERIC_FEATURES = [
    "total_login_attempts", "successful_logins", "failed_logins",
    "mfa_failures", "impossible_travel_events", "lateral_hop_count",
    "privilege_escalations", "account_lockout_count",
    "geo_dispersion_score", "login_velocity_score", "session_anomaly_rate",
    "ueba_alert_count",
    "overall_identity_risk_score", "insider_threat_indicator_score",
]

# One-hot encoded; levels are discovered from the data at build time.
USER_CATEGORICAL_FEATURES = ["peak_privilege_level_accessed"]  # 6 values

# Columns of user_risk_summary that are deliberately NOT used as features:
#   * threat_actor_flag leaks the target - only high-tier users can be
#     flagged as threat actors, so a set flag implies tier=high.
#   * account_takeover_flag and credential_attack_victim_flag are extremely
#     rare in the sample (2/200 and 1/200). They are not useful as features,
#     and using them risks the same kind of structural leakage documented
#     for threat-actor classification.
USER_LEAKY_COLUMNS = [
    "threat_actor_flag", "account_takeover_flag",
    "credential_attack_victim_flag",
]
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# ---------------------------------------------------------------------------
|
| 126 |
+
# Per-session aggregates joined into the user-level row
|
| 127 |
+
# ---------------------------------------------------------------------------
|
| 128 |
+
# We compute these from login_sessions.csv aggregated by user_id. They add
|
| 129 |
+
# behavioural color (avg session duration, fraction of sessions with
|
| 130 |
+
# impossible travel, etc.) without introducing leakage. We explicitly
|
| 131 |
+
# exclude session-level columns that exhibit non-overlap with threat actors
|
| 132 |
+
# (velocity_anomaly_score, session_timestamp_utc, credential_attempt_count,
|
| 133 |
+
# login_outcome) because those features create degenerate signal even when
|
| 134 |
+
# aggregated, and would compromise the user_risk_tier evaluation by
|
| 135 |
+
# enabling shortcuts via the threat_actor_flag-correlated structure.
|
| 136 |
+
|
| 137 |
+
# Names of the per-user columns produced by `_aggregate_sessions`, in the
# order they enter the feature matrix.
SESSION_AGGS_NUMERIC = [
    "avg_session_duration_seconds", "avg_mfa_response_latency_ms",
    "avg_geo_anomaly_score", "max_geo_anomaly_score",
    "frac_impossible_travel",
    "n_unique_countries", "n_unique_devices", "n_unique_applications",
]
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def _aggregate_sessions(sessions: pd.DataFrame) -> pd.DataFrame:
|
| 150 |
+
"""Compute per-user session aggregates without using leaky features."""
|
| 151 |
+
g = sessions.groupby("user_id")
|
| 152 |
+
aggs = pd.DataFrame({
|
| 153 |
+
"avg_session_duration_seconds": g["session_duration_seconds"].mean(),
|
| 154 |
+
"avg_mfa_response_latency_ms": g["mfa_response_latency_ms"].mean(),
|
| 155 |
+
"avg_geo_anomaly_score": g["geo_anomaly_score"].mean(),
|
| 156 |
+
"max_geo_anomaly_score": g["geo_anomaly_score"].max(),
|
| 157 |
+
"frac_impossible_travel": g["impossible_travel_flag"].mean(),
|
| 158 |
+
"n_unique_countries": g["geo_country_code"].nunique(),
|
| 159 |
+
"n_unique_devices": g["device_id_hash"].nunique(),
|
| 160 |
+
"n_unique_applications": g["target_application_id"].nunique(),
|
| 161 |
+
}).reset_index()
|
| 162 |
+
return aggs
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
# ---------------------------------------------------------------------------
|
| 166 |
+
# Engineered features
|
| 167 |
+
# ---------------------------------------------------------------------------
|
| 168 |
+
|
| 169 |
+
def _add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
|
| 170 |
+
"""
|
| 171 |
+
Six engineered features that combine the raw aggregates into
|
| 172 |
+
risk-discriminative composites. None encode the target directly.
|
| 173 |
+
"""
|
| 174 |
+
df = df.copy()
|
| 175 |
+
|
| 176 |
+
# 1. Failed-login fraction. Common signal across all risk tiers but
|
| 177 |
+
# high-tier users have systematically more failures.
|
| 178 |
+
denom = df["total_login_attempts"].clip(lower=1)
|
| 179 |
+
df["failed_login_rate"] = (df["failed_logins"] / denom).astype(float)
|
| 180 |
+
|
| 181 |
+
# 2. MFA failure rate per login.
|
| 182 |
+
df["mfa_failure_rate"] = (df["mfa_failures"] / denom).astype(float)
|
| 183 |
+
|
| 184 |
+
# 3. UEBA alerts per session - normalizes alert count to session volume.
|
| 185 |
+
sess_denom = df["successful_logins"].clip(lower=1)
|
| 186 |
+
df["ueba_alerts_per_session"] = (df["ueba_alert_count"] / sess_denom).astype(float)
|
| 187 |
+
|
| 188 |
+
# 4. Lateral movement intensity (hops per privilege escalation).
|
| 189 |
+
pe_denom = df["privilege_escalations"].clip(lower=1)
|
| 190 |
+
df["hops_per_escalation"] = (df["lateral_hop_count"] / pe_denom).astype(float)
|
| 191 |
+
|
| 192 |
+
# 5. Geo-velocity composite: dispersion x velocity score (continuous).
|
| 193 |
+
df["geo_velocity_composite"] = (
|
| 194 |
+
df["geo_dispersion_score"] * df["login_velocity_score"]
|
| 195 |
+
).astype(float)
|
| 196 |
+
|
| 197 |
+
# 6. Composite identity-anomaly score: average of risk + insider scores.
|
| 198 |
+
df["composite_anomaly_score"] = (
|
| 199 |
+
(df["overall_identity_risk_score"] + df["insider_threat_indicator_score"]) / 2.0
|
| 200 |
+
).astype(float)
|
| 201 |
+
|
| 202 |
+
return df
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
# ---------------------------------------------------------------------------
|
| 206 |
+
# Public API
|
| 207 |
+
# ---------------------------------------------------------------------------
|
| 208 |
+
|
| 209 |
+
def build_features(
    user_risk_path: str | Path,
    sessions_path: str | Path,
) -> tuple[pd.DataFrame, pd.Series, pd.Series, dict[str, Any]]:
    """Build the training matrix for user-risk-tier classification.

    Reads both CSVs, left-joins the per-user session aggregates onto the user
    table, adds the engineered columns and one-hot encodes the categorical
    columns.

    Returns
    -------
    X : numeric columns followed by one-hot columns, missing values as 0.0.
    y : integer class codes mapped through ``LABEL_TO_INT``.
    ids : the ``user_id`` column, row-aligned with ``X``. It is an identifier
        for round-tripping predictions, not a group label.
    meta : feature names, numeric feature names, categorical levels and the
        label maps that ``transform_single`` needs at inference time.

    Raises
    ------
    ValueError
        If the target column holds a value outside ``LABEL_ORDER``.
    """
    user_table = pd.read_csv(user_risk_path)
    session_table = pd.read_csv(sessions_path)

    y = user_table[TARGET_COLUMN].map(LABEL_TO_INT)
    unmapped = y.isna()
    if unmapped.any():
        bad = user_table.loc[unmapped, TARGET_COLUMN].unique()
        raise ValueError(f"Unknown user_risk_tier values: {bad}")
    y = y.astype(int)
    ids = user_table["user_id"].copy()

    # The id is re-attached under a temporary name purely to drive the join,
    # then removed so it can never end up as a feature.
    join_key = "__user_id__"
    per_user_sessions = _aggregate_sessions(session_table).rename(
        columns={"user_id": join_key}
    )
    features = (
        user_table
        .drop(
            columns=ID_COLUMNS + [TARGET_COLUMN] + USER_LEAKY_COLUMNS,
            errors="ignore",
        )
        .assign(**{join_key: ids})
        .merge(per_user_sessions, on=join_key, how="left")
        .drop(columns=[join_key])
    )
    features = _add_engineered_features(features)

    engineered_names = [
        "failed_login_rate", "mfa_failure_rate", "ueba_alerts_per_session",
        "hops_per_escalation", "geo_velocity_composite", "composite_anomaly_score",
    ]
    # Keep only the candidates that are actually present in the frame.
    numeric_features = [
        name
        for name in USER_NUMERIC_FEATURES + SESSION_AGGS_NUMERIC + engineered_names
        if name in features.columns
    ]
    X_numeric = features[numeric_features].astype(float)

    categorical_levels: dict[str, list[str]] = {}
    parts: list[pd.DataFrame] = [X_numeric]
    for col in USER_CATEGORICAL_FEATURES:
        if col in features.columns:
            # Sorted levels give a deterministic one-hot column order.
            levels = sorted(features[col].dropna().unique().tolist())
            categorical_levels[col] = levels
            as_category = features[col].astype("category").cat.set_categories(levels)
            one_hot = pd.get_dummies(as_category, prefix=col, dummy_na=False)
            parts.append(one_hot.astype(int))

    X = pd.concat(
        [part.reset_index(drop=True) for part in parts],
        axis=1,
    ).fillna(0.0)

    meta = {
        "feature_names": X.columns.tolist(),
        "numeric_features": numeric_features,
        "categorical_levels": categorical_levels,
        "label_to_int": LABEL_TO_INT,
        "int_to_label": INT_TO_LABEL,
        "user_leaky_excluded": USER_LEAKY_COLUMNS,
    }
    return X, y, ids, meta
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def transform_single(
    record: dict | pd.DataFrame,
    meta: dict[str, Any],
) -> np.ndarray:
    """Encode one per-user record (or a frame of records) for inference.

    The caller must already have merged the session aggregates (the
    SESSION_AGGS_NUMERIC fields) into ``record``, and the raw columns read by
    ``_add_engineered_features`` must be present.

    Parameters
    ----------
    record : a dict for a single user, or a DataFrame with one row per user.
        A DataFrame may carry any index; it is ignored.
    meta : the metadata returned by ``build_features`` / ``load_meta``.

    Returns
    -------
    float32 array of shape ``(n_rows, len(meta["feature_names"]))`` with
    columns in ``meta["feature_names"]`` order. Numeric columns absent after
    feature engineering and unseen categorical levels encode as 0.
    """
    if isinstance(record, dict):
        df = pd.DataFrame([record.copy()])
    else:
        # Discard the caller's index. The blocks below are concatenated
        # column-wise, which aligns on index labels; a slice such as
        # `users.iloc[[5]]` would otherwise outer-join against a fresh
        # RangeIndex and produce extra, half-empty rows.
        df = record.copy().reset_index(drop=True)

    df = _add_engineered_features(df)
    n_rows = len(df)

    numeric = pd.DataFrame(
        {
            col: df.get(col, pd.Series([0.0] * n_rows)).astype(float).values
            for col in meta["numeric_features"]
        },
        index=df.index,
    )

    blocks: list[pd.DataFrame] = [numeric]
    for col, levels in meta["categorical_levels"].items():
        val = df.get(col, pd.Series([None] * n_rows, index=df.index, dtype=object))
        block = pd.get_dummies(
            val.astype("category").cat.set_categories(levels),
            prefix=col, dummy_na=False,
        ).astype(int)
        # Guarantee one column per training-time level, in training order.
        block = block.reindex(
            columns=[f"{col}_{lvl}" for lvl in levels], fill_value=0
        )
        blocks.append(block)

    X = pd.concat(blocks, axis=1).fillna(0.0)
    X = X.reindex(columns=meta["feature_names"], fill_value=0.0)
    return X.values.astype(np.float32)
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
def save_meta(meta: dict[str, Any], path: str | Path) -> None:
    """Write the feature metadata to ``path`` as indented JSON.

    JSON object keys must be strings, so the integer keys of
    ``int_to_label`` are stringified; ``load_meta`` reverses this. A missing
    ``user_leaky_excluded`` entry is written as an empty list.
    """
    passthrough_keys = (
        "feature_names",
        "numeric_features",
        "categorical_levels",
        "label_to_int",
    )
    payload = {key: meta[key] for key in passthrough_keys}
    payload["int_to_label"] = {
        str(code): label for code, label in meta["int_to_label"].items()
    }
    payload["user_leaky_excluded"] = meta.get("user_leaky_excluded", [])

    with open(path, "w") as out:
        json.dump(payload, out, indent=2)
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
def load_meta(path: str | Path) -> dict[str, Any]:
    """Read metadata written by ``save_meta``.

    The keys of ``int_to_label`` are converted back from JSON strings to
    integers; everything else is returned as parsed.
    """
    with open(path) as src:
        meta = json.load(src)
    restored = {}
    for code, label in meta["int_to_label"].items():
        restored[int(code)] = label
    meta["int_to_label"] = restored
    return meta
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
def compute_session_aggregates_for_user(
    user_sessions: pd.DataFrame,
) -> dict:
    """Summarise one user's session rows into the session-aggregate fields.

    ``user_sessions`` holds the session rows of a single user. The returned
    dict has plain Python floats for the mean/max fields and ints for the
    distinct-value counts.
    """
    # (output key, session column, Series reduction) -> float
    float_fields = (
        ("avg_session_duration_seconds", "session_duration_seconds", "mean"),
        ("avg_mfa_response_latency_ms", "mfa_response_latency_ms", "mean"),
        ("avg_geo_anomaly_score", "geo_anomaly_score", "mean"),
        ("max_geo_anomaly_score", "geo_anomaly_score", "max"),
        ("frac_impossible_travel", "impossible_travel_flag", "mean"),
    )
    # (output key, session column) -> int count of distinct values
    distinct_fields = (
        ("n_unique_countries", "geo_country_code"),
        ("n_unique_devices", "device_id_hash"),
        ("n_unique_applications", "target_application_id"),
    )

    summary = {}
    for key, column, reducer in float_fields:
        summary[key] = float(getattr(user_sessions[column], reducer)())
    for key, column in distinct_fields:
        summary[key] = int(user_sessions[column].nunique())
    return summary
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
if __name__ == "__main__":
    # Smoke test: build the feature matrix from a data directory (first CLI
    # argument, or the default below) and print its basic shape statistics.
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        data_dir = Path(cli_args[0])
    else:
        data_dir = Path("/mnt/user-data/uploads")

    users_csv = data_dir / "user_risk_summary.csv"
    sessions_csv = data_dir / "login_sessions.csv"
    X, y, ids, meta = build_features(users_csv, sessions_csv)

    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    print(f"n_features: {len(meta['feature_names'])}")
    print(f"label distribution:\n{y.map(INT_TO_LABEL).value_counts()}")
    print(f"X has NaN: {X.isnull().any().any()}")
|
feature_meta.json
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"feature_names": [
|
| 3 |
+
"total_login_attempts",
|
| 4 |
+
"successful_logins",
|
| 5 |
+
"failed_logins",
|
| 6 |
+
"mfa_failures",
|
| 7 |
+
"impossible_travel_events",
|
| 8 |
+
"lateral_hop_count",
|
| 9 |
+
"privilege_escalations",
|
| 10 |
+
"account_lockout_count",
|
| 11 |
+
"geo_dispersion_score",
|
| 12 |
+
"login_velocity_score",
|
| 13 |
+
"session_anomaly_rate",
|
| 14 |
+
"ueba_alert_count",
|
| 15 |
+
"overall_identity_risk_score",
|
| 16 |
+
"insider_threat_indicator_score",
|
| 17 |
+
"avg_session_duration_seconds",
|
| 18 |
+
"avg_mfa_response_latency_ms",
|
| 19 |
+
"avg_geo_anomaly_score",
|
| 20 |
+
"max_geo_anomaly_score",
|
| 21 |
+
"frac_impossible_travel",
|
| 22 |
+
"n_unique_countries",
|
| 23 |
+
"n_unique_devices",
|
| 24 |
+
"n_unique_applications",
|
| 25 |
+
"failed_login_rate",
|
| 26 |
+
"mfa_failure_rate",
|
| 27 |
+
"ueba_alerts_per_session",
|
| 28 |
+
"hops_per_escalation",
|
| 29 |
+
"geo_velocity_composite",
|
| 30 |
+
"composite_anomaly_score",
|
| 31 |
+
"peak_privilege_level_accessed_admin_domain",
|
| 32 |
+
"peak_privilege_level_accessed_admin_local",
|
| 33 |
+
"peak_privilege_level_accessed_global_admin",
|
| 34 |
+
"peak_privilege_level_accessed_power_user",
|
| 35 |
+
"peak_privilege_level_accessed_service_account",
|
| 36 |
+
"peak_privilege_level_accessed_standard_user"
|
| 37 |
+
],
|
| 38 |
+
"numeric_features": [
|
| 39 |
+
"total_login_attempts",
|
| 40 |
+
"successful_logins",
|
| 41 |
+
"failed_logins",
|
| 42 |
+
"mfa_failures",
|
| 43 |
+
"impossible_travel_events",
|
| 44 |
+
"lateral_hop_count",
|
| 45 |
+
"privilege_escalations",
|
| 46 |
+
"account_lockout_count",
|
| 47 |
+
"geo_dispersion_score",
|
| 48 |
+
"login_velocity_score",
|
| 49 |
+
"session_anomaly_rate",
|
| 50 |
+
"ueba_alert_count",
|
| 51 |
+
"overall_identity_risk_score",
|
| 52 |
+
"insider_threat_indicator_score",
|
| 53 |
+
"avg_session_duration_seconds",
|
| 54 |
+
"avg_mfa_response_latency_ms",
|
| 55 |
+
"avg_geo_anomaly_score",
|
| 56 |
+
"max_geo_anomaly_score",
|
| 57 |
+
"frac_impossible_travel",
|
| 58 |
+
"n_unique_countries",
|
| 59 |
+
"n_unique_devices",
|
| 60 |
+
"n_unique_applications",
|
| 61 |
+
"failed_login_rate",
|
| 62 |
+
"mfa_failure_rate",
|
| 63 |
+
"ueba_alerts_per_session",
|
| 64 |
+
"hops_per_escalation",
|
| 65 |
+
"geo_velocity_composite",
|
| 66 |
+
"composite_anomaly_score"
|
| 67 |
+
],
|
| 68 |
+
"categorical_levels": {
|
| 69 |
+
"peak_privilege_level_accessed": [
|
| 70 |
+
"admin_domain",
|
| 71 |
+
"admin_local",
|
| 72 |
+
"global_admin",
|
| 73 |
+
"power_user",
|
| 74 |
+
"service_account",
|
| 75 |
+
"standard_user"
|
| 76 |
+
]
|
| 77 |
+
},
|
| 78 |
+
"label_to_int": {
|
| 79 |
+
"low": 0,
|
| 80 |
+
"medium": 1,
|
| 81 |
+
"high": 2
|
| 82 |
+
},
|
| 83 |
+
"int_to_label": {
|
| 84 |
+
"0": "low",
|
| 85 |
+
"1": "medium",
|
| 86 |
+
"2": "high"
|
| 87 |
+
},
|
| 88 |
+
"user_leaky_excluded": [
|
| 89 |
+
"threat_actor_flag",
|
| 90 |
+
"account_takeover_flag",
|
| 91 |
+
"credential_attack_victim_flag"
|
| 92 |
+
]
|
| 93 |
+
}
|
feature_scaler.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"mean": [47.618705035971225, 21.762589928057555, 3.237410071942446, 0.7410071942446043, 2.4244604316546763, 0.1079136690647482, 0.03597122302158273, 0.8848920863309353, 0.07807338129496402, 0.09575827338129496, 0.12949640287769784, 0.07913669064748201, 0.14728705035971226, 0.05653093525179856, 2652.875971223021, 4019.576460431655, 0.07807044604316546, 0.5725964028776979, 0.09697841726618706, 2.683453237410072, 25.0, 1.0, 0.06872126318882517, 0.027102828021972523, 0.045563549160671464, 0.05935251798561151, 0.03040965165467626, 0.1019089928057554, 0.3669064748201439, 0.10071942446043165, 0.460431654676259, 0.0, 0.02158273381294964, 0.050359712230215826], "std": [106.04424805776597, 6.126188419238651, 6.126188419238651, 0.9505004745510551, 2.425643250309251, 0.873860728150148, 0.3491204671960097, 3.6891520110635208, 0.15358191487335077, 0.15428793799740126, 0.24504753676954605, 0.6816938430652185, 0.11701980746154912, 0.025106938959211542, 728.8513772007428, 2612.6588768587844, 0.15358241474479778, 0.37022309870273346, 0.09702573001237004, 1.3353389149294461, 1.0, 1.0, 0.08962918241935523, 0.03507100441435125, 0.39153983731053577, 0.470682835200424, 0.10288120999068875, 0.05297033872352956, 0.48370377945817283, 0.30204529914550604, 0.5002345399237736, 1.0, 0.14584217692177975, 0.21947701365872982]}
|
inference_example.ipynb
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# CYB006 Baseline Classifier — Inference Example\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"End-to-end demo: load the trained XGBoost and PyTorch MLP models from the Hugging Face repo and predict the **user risk tier** (`low` / `medium` / `high`) of an identity from per-user aggregates joined with non-leaky session aggregates.\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"**This is a baseline reference model**, not a production identity-security platform. See the model card for full metrics and limitations — and importantly, see the **`leakage_diagnostic.json`** for why this baseline targets `user_risk_tier` rather than the README's stated headline use case of threat-actor tier attribution."
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"cell_type": "markdown",
|
| 16 |
+
"metadata": {},
|
| 17 |
+
"source": [
|
| 18 |
+
"## 1. Install dependencies"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"cell_type": "code",
|
| 23 |
+
"execution_count": null,
|
| 24 |
+
"metadata": {},
|
| 25 |
+
"outputs": [],
|
| 26 |
+
"source": [
|
| 27 |
+
"%pip install --quiet xgboost torch safetensors pandas numpy huggingface_hub"
|
| 28 |
+
]
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"cell_type": "markdown",
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"source": [
|
| 34 |
+
"## 2. Download model artifacts from Hugging Face"
|
| 35 |
+
]
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"cell_type": "code",
|
| 39 |
+
"execution_count": null,
|
| 40 |
+
"metadata": {},
|
| 41 |
+
"outputs": [],
|
| 42 |
+
"source": [
|
| 43 |
+
"from huggingface_hub import hf_hub_download\n",
|
| 44 |
+
"\n",
|
| 45 |
+
"REPO_ID = \"xpertsystems/cyb006-baseline-classifier\"\n",
|
| 46 |
+
"\n",
|
| 47 |
+
"files = {}\n",
|
| 48 |
+
"for name in [\"model_xgb.json\", \"model_mlp.safetensors\",\n",
|
| 49 |
+
" \"feature_engineering.py\", \"feature_meta.json\",\n",
|
| 50 |
+
" \"feature_scaler.json\"]:\n",
|
| 51 |
+
" files[name] = hf_hub_download(repo_id=REPO_ID, filename=name)\n",
|
| 52 |
+
" print(f\" downloaded: {name}\")"
|
| 53 |
+
]
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"cell_type": "code",
|
| 57 |
+
"execution_count": null,
|
| 58 |
+
"metadata": {},
|
| 59 |
+
"outputs": [],
|
| 60 |
+
"source": [
|
| 61 |
+
"import sys, os\n",
|
| 62 |
+
"fe_dir = os.path.dirname(files[\"feature_engineering.py\"])\n",
|
| 63 |
+
"if fe_dir not in sys.path:\n",
|
| 64 |
+
" sys.path.insert(0, fe_dir)\n",
|
| 65 |
+
"\n",
|
| 66 |
+
"from feature_engineering import (\n",
|
| 67 |
+
" transform_single, load_meta, INT_TO_LABEL,\n",
|
| 68 |
+
" compute_session_aggregates_for_user\n",
|
| 69 |
+
")"
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"cell_type": "markdown",
|
| 74 |
+
"metadata": {},
|
| 75 |
+
"source": [
|
| 76 |
+
"## 3. Load models and metadata"
|
| 77 |
+
]
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"cell_type": "code",
|
| 81 |
+
"execution_count": null,
|
| 82 |
+
"metadata": {},
|
| 83 |
+
"outputs": [],
|
| 84 |
+
"source": [
|
| 85 |
+
"import json\n",
|
| 86 |
+
"import numpy as np\n",
|
| 87 |
+
"import torch\n",
|
| 88 |
+
"import torch.nn as nn\n",
|
| 89 |
+
"import xgboost as xgb\n",
|
| 90 |
+
"from safetensors.torch import load_file\n",
|
| 91 |
+
"\n",
|
| 92 |
+
"meta = load_meta(files[\"feature_meta.json\"])\n",
|
| 93 |
+
"with open(files[\"feature_scaler.json\"]) as f:\n",
|
| 94 |
+
" scaler = json.load(f)\n",
|
| 95 |
+
"\n",
|
| 96 |
+
"N_FEATURES = len(meta[\"feature_names\"])\n",
|
| 97 |
+
"N_CLASSES = len(meta[\"int_to_label\"])\n",
|
| 98 |
+
"print(f\"feature count: {N_FEATURES}\")\n",
|
| 99 |
+
"print(f\"class count: {N_CLASSES}\")\n",
|
| 100 |
+
"print(f\"label classes: {list(meta['int_to_label'].values())}\")"
|
| 101 |
+
]
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"cell_type": "code",
|
| 105 |
+
"execution_count": null,
|
| 106 |
+
"metadata": {},
|
| 107 |
+
"outputs": [],
|
| 108 |
+
"source": [
|
| 109 |
+
"xgb_model = xgb.XGBClassifier()\n",
|
| 110 |
+
"xgb_model.load_model(files[\"model_xgb.json\"])\n",
|
| 111 |
+
"\n",
|
| 112 |
+
"# MLP architecture (must match training)\n",
|
| 113 |
+
"class RiskTierMLP(nn.Module):\n",
|
| 114 |
+
" def __init__(self, n_features, n_classes=3, hidden1=128, hidden2=64, dropout=0.3):\n",
|
| 115 |
+
" super().__init__()\n",
|
| 116 |
+
" self.net = nn.Sequential(\n",
|
| 117 |
+
" nn.Linear(n_features, hidden1),\n",
|
| 118 |
+
" nn.BatchNorm1d(hidden1),\n",
|
| 119 |
+
" nn.ReLU(),\n",
|
| 120 |
+
" nn.Dropout(dropout),\n",
|
| 121 |
+
" nn.Linear(hidden1, hidden2),\n",
|
| 122 |
+
" nn.BatchNorm1d(hidden2),\n",
|
| 123 |
+
" nn.ReLU(),\n",
|
| 124 |
+
" nn.Dropout(dropout),\n",
|
| 125 |
+
" nn.Linear(hidden2, n_classes),\n",
|
| 126 |
+
" )\n",
|
| 127 |
+
" def forward(self, x):\n",
|
| 128 |
+
" return self.net(x)\n",
|
| 129 |
+
"\n",
|
| 130 |
+
"mlp_model = RiskTierMLP(N_FEATURES, n_classes=N_CLASSES)\n",
|
| 131 |
+
"mlp_model.load_state_dict(load_file(files[\"model_mlp.safetensors\"]))\n",
|
| 132 |
+
"mlp_model.eval()\n",
|
| 133 |
+
"print(\"models loaded\")"
|
| 134 |
+
]
|
| 135 |
+
},
|
| 136 |
+
{
|
| 137 |
+
"cell_type": "markdown",
|
| 138 |
+
"metadata": {},
|
| 139 |
+
"source": [
|
| 140 |
+
"## 4. Prediction helper"
|
| 141 |
+
]
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"cell_type": "code",
|
| 145 |
+
"execution_count": null,
|
| 146 |
+
"metadata": {},
|
| 147 |
+
"outputs": [],
|
| 148 |
+
"source": [
|
| 149 |
+
"MU = np.array(scaler[\"mean\"], dtype=np.float32)\n",
|
| 150 |
+
"SD = np.array(scaler[\"std\"], dtype=np.float32)\n",
|
| 151 |
+
"\n",
|
| 152 |
+
"def predict_risk_tier(user_record: dict) -> dict:\n",
|
| 153 |
+
" \"\"\"Predict the user risk tier from a per-user record.\n",
|
| 154 |
+
"\n",
|
| 155 |
+
" The record should contain per-user aggregates (from user_risk_summary)\n",
|
| 156 |
+
" PLUS the session aggregates produced by compute_session_aggregates_for_user.\n",
|
| 157 |
+
" See the example record below.\n",
|
| 158 |
+
" \"\"\"\n",
|
| 159 |
+
" X = transform_single(user_record, meta)\n",
|
| 160 |
+
"\n",
|
| 161 |
+
" xgb_proba = xgb_model.predict_proba(X)[0]\n",
|
| 162 |
+
" xgb_label = INT_TO_LABEL[int(np.argmax(xgb_proba))]\n",
|
| 163 |
+
"\n",
|
| 164 |
+
" Xs = ((X - MU) / SD).astype(np.float32)\n",
|
| 165 |
+
" with torch.no_grad():\n",
|
| 166 |
+
" logits = mlp_model(torch.tensor(Xs))\n",
|
| 167 |
+
" mlp_proba = torch.softmax(logits, dim=1).numpy()[0]\n",
|
| 168 |
+
" mlp_label = INT_TO_LABEL[int(np.argmax(mlp_proba))]\n",
|
| 169 |
+
"\n",
|
| 170 |
+
" return {\n",
|
| 171 |
+
" \"xgboost\": {\n",
|
| 172 |
+
" \"label\": xgb_label,\n",
|
| 173 |
+
" \"probabilities\": {INT_TO_LABEL[i]: float(p) for i, p in enumerate(xgb_proba)},\n",
|
| 174 |
+
" },\n",
|
| 175 |
+
" \"mlp\": {\n",
|
| 176 |
+
" \"label\": mlp_label,\n",
|
| 177 |
+
" \"probabilities\": {INT_TO_LABEL[i]: float(p) for i, p in enumerate(mlp_proba)},\n",
|
| 178 |
+
" },\n",
|
| 179 |
+
" }"
|
| 180 |
+
]
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"cell_type": "markdown",
|
| 184 |
+
"metadata": {},
|
| 185 |
+
"source": [
|
| 186 |
+
"## 5. Run on an example record\n",
|
| 187 |
+
"\n",
|
| 188 |
+
"Real high-risk user from the sample dataset: 98 login attempts in window, 25 failures, 9 account lockouts, 9 impossible-travel events, 6 unique countries, peak privilege `admin_domain`. Both models should predict `high`."
|
| 189 |
+
]
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
"cell_type": "code",
|
| 193 |
+
"execution_count": null,
|
| 194 |
+
"metadata": {},
|
| 195 |
+
"outputs": [],
|
| 196 |
+
"source": [
|
| 197 |
+
"# Real per-user record from the sample dataset (true tier: high)\n",
|
| 198 |
+
"example_record = {\n",
|
| 199 |
+
" # Per-user aggregates (from user_risk_summary.csv)\n",
|
| 200 |
+
" \"total_login_attempts\": 98,\n",
|
| 201 |
+
" \"successful_logins\": 0,\n",
|
| 202 |
+
" \"failed_logins\": 25,\n",
|
| 203 |
+
" \"mfa_failures\": 0,\n",
|
| 204 |
+
" \"impossible_travel_events\": 9,\n",
|
| 205 |
+
" \"lateral_hop_count\": 1,\n",
|
| 206 |
+
" \"privilege_escalations\": 1,\n",
|
| 207 |
+
" \"account_lockout_count\": 9,\n",
|
| 208 |
+
" \"geo_dispersion_score\": 0.6474,\n",
|
| 209 |
+
" \"login_velocity_score\": 0.6387,\n",
|
| 210 |
+
" \"session_anomaly_rate\": 1.0,\n",
|
| 211 |
+
" \"ueba_alert_count\": 0,\n",
|
| 212 |
+
" \"overall_identity_risk_score\": 0.3452,\n",
|
| 213 |
+
" \"peak_privilege_level_accessed\": \"admin_domain\",\n",
|
| 214 |
+
" \"insider_threat_indicator_score\": 0.0,\n",
|
| 215 |
+
" # Session aggregates (computed via compute_session_aggregates_for_user)\n",
|
| 216 |
+
" \"avg_session_duration_seconds\": 352.24,\n",
|
| 217 |
+
" \"avg_mfa_response_latency_ms\": 26.67,\n",
|
| 218 |
+
" \"avg_geo_anomaly_score\": 0.6474,\n",
|
| 219 |
+
" \"max_geo_anomaly_score\": 1.0,\n",
|
| 220 |
+
" \"frac_impossible_travel\": 0.36,\n",
|
| 221 |
+
" \"n_unique_countries\": 6,\n",
|
| 222 |
+
" \"n_unique_devices\": 25,\n",
|
| 223 |
+
" \"n_unique_applications\": 1,\n",
|
| 224 |
+
"}\n",
|
| 225 |
+
"\n",
|
| 226 |
+
"result = predict_risk_tier(example_record)\n",
|
| 227 |
+
"\n",
|
| 228 |
+
"print(f\"XGBoost -> {result['xgboost']['label']}\")\n",
|
| 229 |
+
"for lbl, p in sorted(result['xgboost']['probabilities'].items(), key=lambda x: -x[1]):\n",
|
| 230 |
+
" print(f\" P({lbl:8s}) = {p:.4f}\")\n",
|
| 231 |
+
"\n",
|
| 232 |
+
"print(f\"\\nMLP -> {result['mlp']['label']}\")\n",
|
| 233 |
+
"for lbl, p in sorted(result['mlp']['probabilities'].items(), key=lambda x: -x[1]):\n",
|
| 234 |
+
" print(f\" P({lbl:8s}) = {p:.4f}\")"
|
| 235 |
+
]
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"cell_type": "markdown",
|
| 239 |
+
"metadata": {},
|
| 240 |
+
"source": [
|
| 241 |
+
"## 6. Batch prediction on the sample dataset\n",
|
| 242 |
+
"\n",
|
| 243 |
+
"Score every user in `user_risk_summary.csv` after joining their session aggregates from `login_sessions.csv`."
|
| 244 |
+
]
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
"cell_type": "code",
|
| 248 |
+
"execution_count": null,
|
| 249 |
+
"metadata": {},
|
| 250 |
+
"outputs": [],
|
| 251 |
+
"source": [
|
| 252 |
+
"from huggingface_hub import snapshot_download\n",
|
| 253 |
+
"import pandas as pd\n",
|
| 254 |
+
"\n",
|
| 255 |
+
"ds_path = snapshot_download(repo_id=\"xpertsystems/cyb006-sample\", repo_type=\"dataset\")\n",
|
| 256 |
+
"users = pd.read_csv(f\"{ds_path}/user_risk_summary.csv\")\n",
|
| 257 |
+
"sessions = pd.read_csv(f\"{ds_path}/login_sessions.csv\")\n",
|
| 258 |
+
"\n",
|
| 259 |
+
"preds = []\n",
|
| 260 |
+
"for _, row in users.head(50).iterrows():\n",
|
| 261 |
+
" user_sessions = sessions[sessions[\"user_id\"] == row[\"user_id\"]]\n",
|
| 262 |
+
" if len(user_sessions) == 0:\n",
|
| 263 |
+
" continue\n",
|
| 264 |
+
" rec = row.to_dict()\n",
|
| 265 |
+
" rec.update(compute_session_aggregates_for_user(user_sessions))\n",
|
| 266 |
+
" pred = predict_risk_tier(rec)\n",
|
| 267 |
+
" preds.append({\n",
|
| 268 |
+
" \"user_id\": row[\"user_id\"],\n",
|
| 269 |
+
" \"true_tier\": row[\"user_risk_tier\"],\n",
|
| 270 |
+
" \"xgb_pred\": pred[\"xgboost\"][\"label\"],\n",
|
| 271 |
+
" })\n",
|
| 272 |
+
"\n",
|
| 273 |
+
"results = pd.DataFrame(preds)\n",
|
| 274 |
+
"ct = pd.crosstab(results[\"true_tier\"], results[\"xgb_pred\"],\n",
|
| 275 |
+
" rownames=[\"true\"], colnames=[\"pred\"])\n",
|
| 276 |
+
"print(\"Confusion on first 50 users (XGBoost):\")\n",
|
| 277 |
+
"print(ct)\n",
|
| 278 |
+
"acc = (results[\"true_tier\"] == results[\"xgb_pred\"]).mean()\n",
|
| 279 |
+
"print(f\"\\nbatch accuracy on first 50 users (in-distribution): {acc:.4f}\")\n",
|
| 280 |
+
"print(\"\\nNote: this includes training-set users. See validation_results.json\\n\"\n",
|
| 281 |
+
" \"for proper held-out test metrics.\")"
|
| 282 |
+
]
|
| 283 |
+
},
|
| 284 |
+
{
|
| 285 |
+
"cell_type": "markdown",
|
| 286 |
+
"metadata": {},
|
| 287 |
+
"source": [
|
| 288 |
+
"## 7. Important: the leakage diagnostic\n",
|
| 289 |
+
"\n",
|
| 290 |
+
"Before using CYB006 sample data to train a threat-actor detector, read **`leakage_diagnostic.json`** in this repo. The README's stated headline use case (4-class threat-actor tier attribution) is not a representative ML task on the sample dataset — the synthetic generator produces threat-actor sessions with non-overlapping anomaly score distributions, so a plain XGBoost achieves 100% accuracy that doesn't reflect any real learning. The diagnostic documents which feature groups carry the leakage and what we recommend to dataset authors.\n",
|
| 291 |
+
"\n",
|
| 292 |
+
"This baseline ships `user_risk_tier` prediction instead, which has overlapping per-tier distributions and lifts ~10pp over majority baseline."
|
| 293 |
+
]
|
| 294 |
+
},
|
| 295 |
+
{
|
| 296 |
+
"cell_type": "markdown",
|
| 297 |
+
"metadata": {},
|
| 298 |
+
"source": [
|
| 299 |
+
"## 8. Next steps\n",
|
| 300 |
+
"\n",
|
| 301 |
+
"- See `validation_results.json` for held-out test metrics (30 disjoint users).\n",
|
| 302 |
+
"- See `multi_seed_results.json` for the across-10-seeds picture (accuracy 0.700 ± 0.082, ROC-AUC 0.812 ± 0.048).\n",
|
| 303 |
+
"- See `ablation_results.json` for per-feature-group contribution. User aggregate counts (failed logins, lateral hops, etc.) carry the most signal.\n",
|
| 304 |
+
"- See **`leakage_diagnostic.json`** for the detailed audit on threat-actor detection.\n",
|
| 305 |
+
"- For the full ~1.1M-row CYB006 dataset and commercial licensing, contact **pradeep@xpertsystems.ai**."
|
| 306 |
+
]
|
| 307 |
+
}
|
| 308 |
+
],
|
| 309 |
+
"metadata": {
|
| 310 |
+
"kernelspec": {
|
| 311 |
+
"display_name": "Python 3",
|
| 312 |
+
"language": "python",
|
| 313 |
+
"name": "python3"
|
| 314 |
+
},
|
| 315 |
+
"language_info": {
|
| 316 |
+
"name": "python",
|
| 317 |
+
"version": "3.10"
|
| 318 |
+
}
|
| 319 |
+
},
|
| 320 |
+
"nbformat": 4,
|
| 321 |
+
"nbformat_minor": 5
|
| 322 |
+
}
|
leakage_diagnostic.json
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"purpose": "Document why threat_actor_capability_tier (the README's stated headline use case) was NOT shipped as the primary baseline. Every oracle feature group is independently sufficient for 100% test accuracy on threat-actor binary detection; even with all 6 groups dropped, accuracy stays >97%. This is a structural property of the sample's generator (non-overlapping anomaly distributions between threat and legitimate sessions), not a methodology failure. Real-world identity telemetry has substantial overlap; this sample dataset does not reproduce it.",
|
| 3 |
+
"target": "threat_actor_capability_tier != 'none' (binary)",
|
| 4 |
+
"split": "GroupShuffleSplit by user_id, 70/15/15 nested",
|
| 5 |
+
"non_overlapping_distributions": {
|
| 6 |
+
"velocity_anomaly_score": {
|
| 7 |
+
"actor_range": [
|
| 8 |
+
0.5213,
|
| 9 |
+
0.8181
|
| 10 |
+
],
|
| 11 |
+
"non_actor_range": [
|
| 12 |
+
0.0,
|
| 13 |
+
0.2469
|
| 14 |
+
],
|
| 15 |
+
"actor_mean": 0.651,
|
| 16 |
+
"non_actor_mean": 0.053
|
| 17 |
+
},
|
| 18 |
+
"session_timestamp_utc": {
|
| 19 |
+
"actor_range": [
|
| 20 |
+
6417,
|
| 21 |
+
1440062
|
| 22 |
+
],
|
| 23 |
+
"non_actor_range": [
|
| 24 |
+
1445187,
|
| 25 |
+
18000137
|
| 26 |
+
],
|
| 27 |
+
"note": "Actor sessions and non-actor sessions occupy disjoint time windows"
|
| 28 |
+
},
|
| 29 |
+
"credential_attempt_count": {
|
| 30 |
+
"actor_range": [
|
| 31 |
+
1,
|
| 32 |
+
59
|
| 33 |
+
],
|
| 34 |
+
"non_actor_range": [
|
| 35 |
+
1,
|
| 36 |
+
2
|
| 37 |
+
],
|
| 38 |
+
"actor_mean": 12.9,
|
| 39 |
+
"non_actor_mean": 1.07
|
| 40 |
+
},
|
| 41 |
+
"login_outcome": {
|
| 42 |
+
"actor_only_values": [
|
| 43 |
+
"failure_account_locked",
|
| 44 |
+
"account_takeover_confirmed",
|
| 45 |
+
"session_hijacked",
|
| 46 |
+
"success_anomalous"
|
| 47 |
+
],
|
| 48 |
+
"non_actor_only_values": [
|
| 49 |
+
"success_normal"
|
| 50 |
+
],
|
| 51 |
+
"note": "success_normal is 4306 non-actor / 0 actor rows; failure_account_locked is 0 non-actor / 186 actor rows."
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"ablation_experiments": [
|
| 55 |
+
{
|
| 56 |
+
"config": "full features (all oracles intact)",
|
| 57 |
+
"n_features": 166,
|
| 58 |
+
"accuracy": 1.0,
|
| 59 |
+
"roc_auc": 1.0
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"config": "cumulative drop through behavioural_oracles",
|
| 63 |
+
"dropped_so_far": [
|
| 64 |
+
"velocity_anomaly_score",
|
| 65 |
+
"credential_attempt_count",
|
| 66 |
+
"session_timestamp_utc"
|
| 67 |
+
],
|
| 68 |
+
"n_features": 163,
|
| 69 |
+
"accuracy": 0.9991111111111112,
|
| 70 |
+
"roc_auc": 1.0
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"config": "cumulative drop through outcome_oracle",
|
| 74 |
+
"dropped_so_far": [
|
| 75 |
+
"velocity_anomaly_score",
|
| 76 |
+
"credential_attempt_count",
|
| 77 |
+
"session_timestamp_utc",
|
| 78 |
+
"login_outcome"
|
| 79 |
+
],
|
| 80 |
+
"n_features": 154,
|
| 81 |
+
"accuracy": 0.9982222222222222,
|
| 82 |
+
"roc_auc": 0.9999714285714285
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"config": "cumulative drop through geo_oracle",
|
| 86 |
+
"dropped_so_far": [
|
| 87 |
+
"velocity_anomaly_score",
|
| 88 |
+
"credential_attempt_count",
|
| 89 |
+
"session_timestamp_utc",
|
| 90 |
+
"login_outcome",
|
| 91 |
+
"geo_country_code"
|
| 92 |
+
],
|
| 93 |
+
"n_features": 138,
|
| 94 |
+
"accuracy": 0.9986666666666667,
|
| 95 |
+
"roc_auc": 0.9999619047619047
|
| 96 |
+
},
|
| 97 |
+
{
|
| 98 |
+
"config": "cumulative drop through device_oracle",
|
| 99 |
+
"dropped_so_far": [
|
| 100 |
+
"velocity_anomaly_score",
|
| 101 |
+
"credential_attempt_count",
|
| 102 |
+
"session_timestamp_utc",
|
| 103 |
+
"login_outcome",
|
| 104 |
+
"geo_country_code",
|
| 105 |
+
"device_trust_level"
|
| 106 |
+
],
|
| 107 |
+
"n_features": 133,
|
| 108 |
+
"accuracy": 0.9982222222222222,
|
| 109 |
+
"roc_auc": 0.9999047619047619
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
"config": "cumulative drop through user_risk_oracle",
|
| 113 |
+
"dropped_so_far": [
|
| 114 |
+
"velocity_anomaly_score",
|
| 115 |
+
"credential_attempt_count",
|
| 116 |
+
"session_timestamp_utc",
|
| 117 |
+
"login_outcome",
|
| 118 |
+
"geo_country_code",
|
| 119 |
+
"device_trust_level",
|
| 120 |
+
"user_risk_tier"
|
| 121 |
+
],
|
| 122 |
+
"n_features": 130,
|
| 123 |
+
"accuracy": 0.9977777777777778,
|
| 124 |
+
"roc_auc": 0.9996095238095238
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"config": "cumulative drop through anomaly_signal",
|
| 128 |
+
"dropped_so_far": [
|
| 129 |
+
"velocity_anomaly_score",
|
| 130 |
+
"credential_attempt_count",
|
| 131 |
+
"session_timestamp_utc",
|
| 132 |
+
"login_outcome",
|
| 133 |
+
"geo_country_code",
|
| 134 |
+
"device_trust_level",
|
| 135 |
+
"user_risk_tier",
|
| 136 |
+
"geo_anomaly_score"
|
| 137 |
+
],
|
| 138 |
+
"n_features": 129,
|
| 139 |
+
"accuracy": 0.9706666666666667,
|
| 140 |
+
"roc_auc": 0.9896857142857143
|
| 141 |
+
}
|
| 142 |
+
],
|
| 143 |
+
"conclusion": "Even with all six oracle feature groups removed (40+ columns dropped), the residual feature set still yields 97% test accuracy and AUC 0.99 on threat-actor binary detection. The leakage is not localised \u2014 it is distributed across the entire feature space because the generator produces threat-actor sessions that are anomalous on every dimension simultaneously without overlap. A buyer planning to train a real detection model on this dataset should know that the sample's headline detection task is not a representative ML problem.",
|
| 144 |
+
"recommendation_to_dataset_author": "Increase distributional overlap between threat-actor and legitimate session populations across all anomaly indicators: velocity score, credential attempt count, geo anomaly score, geo country code frequency, device trust level, login outcome class. Real-world detection systems operate at AUC 0.7-0.9, not 1.0; the sample should reflect that operating regime."
|
| 145 |
+
}
|
model_mlp.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79e17347b967d6051de89f297028d1ad7097a1a48572b526e568affcf4ab22ed
|
| 3 |
+
size 56020
|
model_xgb.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
multi_seed_results.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"purpose": "Multi-seed evaluation across 10 stratified splits of the 200 user-level rows. With n=30 test users, single-seed metrics are noisy; multi-seed gives a reliable picture.",
|
| 3 |
+
"seeds_evaluated": [
|
| 4 |
+
42,
|
| 5 |
+
7,
|
| 6 |
+
13,
|
| 7 |
+
17,
|
| 8 |
+
23,
|
| 9 |
+
31,
|
| 10 |
+
45,
|
| 11 |
+
99,
|
| 12 |
+
123,
|
| 13 |
+
200
|
| 14 |
+
],
|
| 15 |
+
"per_seed": [
|
| 16 |
+
{
|
| 17 |
+
"seed": 42,
|
| 18 |
+
"test_n_classes": 3,
|
| 19 |
+
"accuracy": 0.6666666666666666,
|
| 20 |
+
"macro_f1": 0.6453546453546454,
|
| 21 |
+
"macro_roc_auc_ovr": 0.8016919142238835
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"seed": 7,
|
| 25 |
+
"test_n_classes": 3,
|
| 26 |
+
"accuracy": 0.8666666666666667,
|
| 27 |
+
"macro_f1": 0.8139986139986141,
|
| 28 |
+
"macro_roc_auc_ovr": 0.877301738235242
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"seed": 13,
|
| 32 |
+
"test_n_classes": 3,
|
| 33 |
+
"accuracy": 0.5333333333333333,
|
| 34 |
+
"macro_f1": 0.44536610343061955,
|
| 35 |
+
"macro_roc_auc_ovr": 0.737813083241472
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"seed": 17,
|
| 39 |
+
"test_n_classes": 3,
|
| 40 |
+
"accuracy": 0.7333333333333333,
|
| 41 |
+
"macro_f1": 0.670995670995671,
|
| 42 |
+
"macro_roc_auc_ovr": 0.8726337896734316
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"seed": 23,
|
| 46 |
+
"test_n_classes": 3,
|
| 47 |
+
"accuracy": 0.7,
|
| 48 |
+
"macro_f1": 0.6267942583732058,
|
| 49 |
+
"macro_roc_auc_ovr": 0.7978373158999758
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"seed": 31,
|
| 53 |
+
"test_n_classes": 3,
|
| 54 |
+
"accuracy": 0.7666666666666667,
|
| 55 |
+
"macro_f1": 0.7068160597572363,
|
| 56 |
+
"macro_roc_auc_ovr": 0.8585702861598001
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"seed": 45,
|
| 60 |
+
"test_n_classes": 3,
|
| 61 |
+
"accuracy": 0.6666666666666666,
|
| 62 |
+
"macro_f1": 0.6306595365418894,
|
| 63 |
+
"macro_roc_auc_ovr": 0.8429286802048951
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"seed": 99,
|
| 67 |
+
"test_n_classes": 3,
|
| 68 |
+
"accuracy": 0.7333333333333333,
|
| 69 |
+
"macro_f1": 0.6844116844116844,
|
| 70 |
+
"macro_roc_auc_ovr": 0.7860817961521286
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"seed": 123,
|
| 74 |
+
"test_n_classes": 3,
|
| 75 |
+
"accuracy": 0.6666666666666666,
|
| 76 |
+
"macro_f1": 0.6138888888888889,
|
| 77 |
+
"macro_roc_auc_ovr": 0.8116214620370631
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"seed": 200,
|
| 81 |
+
"test_n_classes": 3,
|
| 82 |
+
"accuracy": 0.6666666666666666,
|
| 83 |
+
"macro_f1": 0.5367965367965367,
|
| 84 |
+
"macro_roc_auc_ovr": 0.738158799380027
|
| 85 |
+
}
|
| 86 |
+
],
|
| 87 |
+
"aggregate": {
|
| 88 |
+
"accuracy_mean": 0.7,
|
| 89 |
+
"accuracy_std": 0.08164965809277261,
|
| 90 |
+
"accuracy_min": 0.5333333333333333,
|
| 91 |
+
"accuracy_max": 0.8666666666666667,
|
| 92 |
+
"macro_f1_mean": 0.6375081998548991,
|
| 93 |
+
"macro_f1_std": 0.09333613924888397,
|
| 94 |
+
"roc_auc_mean": 0.8124638865207918,
|
| 95 |
+
"roc_auc_std": 0.047957223370412666
|
| 96 |
+
},
|
| 97 |
+
"published_artifact_seed": 42
|
| 98 |
+
}
|
validation_results.json
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "1.0.0",
|
| 3 |
+
"dataset": "xpertsystems/cyb006-sample",
|
| 4 |
+
"task": "3-class user_risk_tier classification",
|
| 5 |
+
"baselines": {
|
| 6 |
+
"always_predict_majority_accuracy": 0.5666666666666667,
|
| 7 |
+
"majority_class": "low",
|
| 8 |
+
"random_guess_accuracy": 0.3333333333333333
|
| 9 |
+
},
|
| 10 |
+
"split": {
|
| 11 |
+
"strategy": "stratified (StratifiedShuffleSplit, nested 70/15/15)",
|
| 12 |
+
"rationale": "This is a USER-LEVEL task (one row per user, 200 users total). Group-aware splitting does not apply since there is no many-rows-per-group structure to leak. Stratified splitting ensures each fold preserves the 3-tier class distribution.",
|
| 13 |
+
"users_train": 139,
|
| 14 |
+
"users_val": 31,
|
| 15 |
+
"users_test": 30,
|
| 16 |
+
"seed": 42
|
| 17 |
+
},
|
| 18 |
+
"n_features": 34,
|
| 19 |
+
"label_classes": [
|
| 20 |
+
"low",
|
| 21 |
+
"medium",
|
| 22 |
+
"high"
|
| 23 |
+
],
|
| 24 |
+
"class_distribution_train": {
|
| 25 |
+
"low": 79,
|
| 26 |
+
"medium": 33,
|
| 27 |
+
"high": 27
|
| 28 |
+
},
|
| 29 |
+
"class_distribution_test": {
|
| 30 |
+
"low": 17,
|
| 31 |
+
"medium": 7,
|
| 32 |
+
"high": 6
|
| 33 |
+
},
|
| 34 |
+
"leakage_excluded_features": [
|
| 35 |
+
"threat_actor_flag (perfect oracle for high tier)",
|
| 36 |
+
"account_takeover_flag (2/200 positives, oracle-prone)",
|
| 37 |
+
"credential_attack_victim_flag (1/200 positives)",
|
| 38 |
+
"velocity_anomaly_score (per-session, leaky for threat detection - aggregated session features that DO leak are excluded from session-aggregate fields)",
|
| 39 |
+
"session_timestamp_utc (per-session, leaky)",
|
| 40 |
+
"credential_attempt_count (per-session, leaky)",
|
| 41 |
+
"login_outcome (per-session, leaky)"
|
| 42 |
+
],
|
| 43 |
+
"leakage_audit_note": "See leakage_diagnostic.json for the full audit on the abandoned threat-actor binary detection task. Features dropped from session aggregation reflect that audit.",
|
| 44 |
+
"models": {
|
| 45 |
+
"xgboost": {
|
| 46 |
+
"architecture": "Gradient-boosted decision trees, multi:softprob, 3 classes",
|
| 47 |
+
"framework": "xgboost",
|
| 48 |
+
"test_metrics": {
|
| 49 |
+
"model": "xgboost",
|
| 50 |
+
"accuracy": 0.6666666666666666,
|
| 51 |
+
"macro_f1": 0.6453546453546454,
|
| 52 |
+
"weighted_f1": 0.6634032634032633,
|
| 53 |
+
"per_class_f1": {
|
| 54 |
+
"low": 0.7272727272727273,
|
| 55 |
+
"medium": 0.2857142857142857,
|
| 56 |
+
"high": 0.9230769230769231
|
| 57 |
+
},
|
| 58 |
+
"confusion_matrix": {
|
| 59 |
+
"labels": [
|
| 60 |
+
"low",
|
| 61 |
+
"medium",
|
| 62 |
+
"high"
|
| 63 |
+
],
|
| 64 |
+
"matrix": [
|
| 65 |
+
[
|
| 66 |
+
12,
|
| 67 |
+
5,
|
| 68 |
+
0
|
| 69 |
+
],
|
| 70 |
+
[
|
| 71 |
+
4,
|
| 72 |
+
2,
|
| 73 |
+
1
|
| 74 |
+
],
|
| 75 |
+
[
|
| 76 |
+
0,
|
| 77 |
+
0,
|
| 78 |
+
6
|
| 79 |
+
]
|
| 80 |
+
]
|
| 81 |
+
},
|
| 82 |
+
"macro_roc_auc_ovr": 0.8016919142238835
|
| 83 |
+
}
|
| 84 |
+
},
|
| 85 |
+
"mlp": {
|
| 86 |
+
"architecture": "PyTorch MLP, 34 -> 128 -> 64 -> 3, BatchNorm1d + ReLU + Dropout, weighted cross-entropy loss",
|
| 87 |
+
"framework": "pytorch",
|
| 88 |
+
"test_metrics": {
|
| 89 |
+
"model": "mlp",
|
| 90 |
+
"accuracy": 0.6,
|
| 91 |
+
"macro_f1": 0.5914438502673797,
|
| 92 |
+
"weighted_f1": 0.6054545454545455,
|
| 93 |
+
"per_class_f1": {
|
| 94 |
+
"low": 0.6470588235294118,
|
| 95 |
+
"medium": 0.4,
|
| 96 |
+
"high": 0.7272727272727273
|
| 97 |
+
},
|
| 98 |
+
"confusion_matrix": {
|
| 99 |
+
"labels": [
|
| 100 |
+
"low",
|
| 101 |
+
"medium",
|
| 102 |
+
"high"
|
| 103 |
+
],
|
| 104 |
+
"matrix": [
|
| 105 |
+
[
|
| 106 |
+
11,
|
| 107 |
+
5,
|
| 108 |
+
1
|
| 109 |
+
],
|
| 110 |
+
[
|
| 111 |
+
4,
|
| 112 |
+
3,
|
| 113 |
+
0
|
| 114 |
+
],
|
| 115 |
+
[
|
| 116 |
+
2,
|
| 117 |
+
0,
|
| 118 |
+
4
|
| 119 |
+
]
|
| 120 |
+
]
|
| 121 |
+
},
|
| 122 |
+
"macro_roc_auc_ovr": 0.6973752247089843
|
| 123 |
+
}
|
| 124 |
+
}
|
| 125 |
+
}
|
| 126 |
+
}
|