Initial release: XGBoost + MLP for ransomware actor-tier attribution
Browse files- README.md +444 -0
- ablation_results.json +264 -0
- feature_engineering.py +388 -0
- feature_meta.json +157 -0
- feature_scaler.json +1 -0
- inference_example.ipynb +326 -0
- model_mlp.safetensors +3 -0
- model_xgb.json +0 -0
- multi_seed_results.json +98 -0
- validation_results.json +146 -0
README.md
ADDED
|
@@ -0,0 +1,444 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: cc-by-nc-4.0
|
| 3 |
+
library_name: pytorch
|
| 4 |
+
tags:
|
| 5 |
+
- cybersecurity
|
| 6 |
+
- ransomware
|
| 7 |
+
- threat-intelligence
|
| 8 |
+
- threat-attribution
|
| 9 |
+
- mitre-attack
|
| 10 |
+
- tabular-classification
|
| 11 |
+
- synthetic-data
|
| 12 |
+
- xgboost
|
| 13 |
+
- baseline
|
| 14 |
+
pipeline_tag: tabular-classification
|
| 15 |
+
base_model: []
|
| 16 |
+
datasets:
|
| 17 |
+
- xpertsystems/cyb005-sample
|
| 18 |
+
metrics:
|
| 19 |
+
- accuracy
|
| 20 |
+
- f1
|
| 21 |
+
- roc_auc
|
| 22 |
+
model-index:
|
| 23 |
+
- name: cyb005-baseline-classifier
|
| 24 |
+
results:
|
| 25 |
+
- task:
|
| 26 |
+
type: tabular-classification
|
| 27 |
+
name: 4-class threat-actor capability tier attribution
|
| 28 |
+
dataset:
|
| 29 |
+
type: xpertsystems/cyb005-sample
|
| 30 |
+
name: CYB005 Synthetic Ransomware Attack Simulation (Sample)
|
| 31 |
+
metrics:
|
| 32 |
+
- type: roc_auc
|
| 33 |
+
value: 0.8736
|
| 34 |
+
name: Test macro ROC-AUC OvR (XGBoost, seed 42)
|
| 35 |
+
- type: accuracy
|
| 36 |
+
value: 0.6898
|
| 37 |
+
name: Test accuracy (XGBoost, seed 42)
|
| 38 |
+
- type: f1
|
| 39 |
+
value: 0.6751
|
| 40 |
+
name: Test macro-F1 (XGBoost, seed 42)
|
| 41 |
+
- type: accuracy
|
| 42 |
+
value: 0.603
|
| 43 |
+
name: Multi-seed accuracy mean ± 0.040 (XGBoost, 10 seeds)
|
| 44 |
+
- type: roc_auc
|
| 45 |
+
value: 0.853
|
| 46 |
+
name: Multi-seed ROC-AUC mean ± 0.031 (XGBoost, 10 seeds)
|
| 47 |
+
- type: roc_auc
|
| 48 |
+
value: 0.8072
|
| 49 |
+
name: Test macro ROC-AUC OvR (MLP, seed 42)
|
| 50 |
+
- type: accuracy
|
| 51 |
+
value: 0.5118
|
| 52 |
+
name: Test accuracy (MLP, seed 42)
|
| 53 |
+
- type: f1
|
| 54 |
+
value: 0.5121
|
| 55 |
+
name: Test macro-F1 (MLP, seed 42)
|
| 56 |
+
---
|
| 57 |
+
|
| 58 |
+
# CYB005 Baseline Classifier
|
| 59 |
+
|
| 60 |
+
**Threat-actor capability-tier classifier trained on the CYB005 synthetic
|
| 61 |
+
ransomware campaign sample. Predicts which of 4 actor tiers
|
| 62 |
+
(lone_actor / organised_syndicate / raas_affiliate / nation_state_nexus)
|
| 63 |
+
is behind an observed ransomware campaign from per-timestep telemetry.**
|
| 64 |
+
|
| 65 |
+
> **Baseline reference, not for production use.** This model demonstrates
|
| 66 |
+
> that the [CYB005 sample dataset](https://huggingface.co/datasets/xpertsystems/cyb005-sample)
|
| 67 |
+
> is learnable end-to-end and gives prospective buyers a working starting
|
| 68 |
+
> point for threat-attribution research. It is not a production
|
| 69 |
+
> threat-intelligence system, attribution engine, or incident-response
|
| 70 |
+
> tool. See [Limitations](#limitations).
|
| 71 |
+
|
| 72 |
+
## Model overview
|
| 73 |
+
|
| 74 |
+
| Property | Value |
|
| 75 |
+
|---|---|
|
| 76 |
+
| Task | 4-class actor_capability_tier classification |
|
| 77 |
+
| Training data | `xpertsystems/cyb005-sample` (37,489 timesteps across 500 ransomware campaigns) |
|
| 78 |
+
| Models | XGBoost + PyTorch MLP |
|
| 79 |
+
| Input features | 63 (after one-hot encoding) |
|
| 80 |
+
| Split | **Group-aware by campaign_id** (disjoint train/val/test campaigns) |
|
| 81 |
+
| Validation | Single seed (artifact) + multi-seed aggregate across 10 seeds |
|
| 82 |
+
| License | CC-BY-NC-4.0 (matches dataset) |
|
| 83 |
+
| Status | Reference baseline |
|
| 84 |
+
|
| 85 |
+
## Why this task — and why CYB005 ships it where CYB002/3/4 could not
|
| 86 |
+
|
| 87 |
+
This is the first XpertSystems baseline that targets the **dataset's
|
| 88 |
+
stated headline use case**. The CYB005 README's first suggested use case
|
| 89 |
+
is "ransomware classifier models (4-tier actor attribution)", and that is
|
| 90 |
+
exactly what this baseline ships.
|
| 91 |
+
|
| 92 |
+
In CYB002 (kill-chain), CYB003 (malware family), and CYB004 (actor tier),
|
| 93 |
+
the sample datasets had only ~100 groups (events / samples / campaigns),
|
| 94 |
+
which limits group-aware test folds to ~15 unseen groups and 1.5–2 groups
|
| 95 |
+
per class. Each baseline had to pivot to a phase-prediction subtask that
|
| 96 |
+
was learnable at sample size.
|
| 97 |
+
|
| 98 |
+
CYB005's sample is intentionally **5× larger — 500 campaigns** — because
|
| 99 |
+
the README explicitly notes that "benchmarks are conditional on small
|
| 100 |
+
actor-tier subsets". The larger sample makes a held-out test fold of
|
| 101 |
+
75 disjoint campaigns possible, with each of the four tiers represented
|
| 102 |
+
by roughly 10–31 unseen test campaigns (10/31/23/11 in the seed-42 split). Tier attribution becomes genuinely
|
| 103 |
+
learnable, and that's what we publish.
|
| 104 |
+
|
| 105 |
+
Two model artifacts are published. They are designed to be used together — disagreement is a useful triage signal:
|
| 106 |
+
|
| 107 |
+
- `model_xgb.json` — gradient-boosted trees, primary recommendation
|
| 108 |
+
- `model_mlp.safetensors` — PyTorch MLP in SafeTensors format
|
| 109 |
+
|
| 110 |
+
## Quick start
|
| 111 |
+
|
| 112 |
+
```bash
|
| 113 |
+
pip install xgboost torch safetensors pandas huggingface_hub
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
```python
|
| 117 |
+
from huggingface_hub import hf_hub_download
|
| 118 |
+
import json, numpy as np, torch, xgboost as xgb
|
| 119 |
+
from safetensors.torch import load_file
|
| 120 |
+
|
| 121 |
+
REPO = "xpertsystems/cyb005-baseline-classifier"
|
| 122 |
+
|
| 123 |
+
paths = {n: hf_hub_download(REPO, n) for n in [
|
| 124 |
+
"model_xgb.json", "model_mlp.safetensors",
|
| 125 |
+
"feature_engineering.py", "feature_meta.json", "feature_scaler.json",
|
| 126 |
+
]}
|
| 127 |
+
|
| 128 |
+
import sys, os
|
| 129 |
+
sys.path.insert(0, os.path.dirname(paths["feature_engineering.py"]))
|
| 130 |
+
from feature_engineering import (
|
| 131 |
+
transform_single, load_meta, INT_TO_LABEL, build_segment_lookup
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
meta = load_meta(paths["feature_meta.json"])
|
| 135 |
+
xgb_model = xgb.XGBClassifier(); xgb_model.load_model(paths["model_xgb.json"])
|
| 136 |
+
seg_lookup = build_segment_lookup("path/to/victim_topology.csv")
|
| 137 |
+
|
| 138 |
+
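# Predict — `my_record` is one per-timestep telemetry row that you supply (dict-like; must include "target_segment_id"). See inference_example.ipynb for the full pattern.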
|
| 139 |
+
seg_aggs = seg_lookup.get(my_record["target_segment_id"], {})
|
| 140 |
+
X = transform_single(my_record, meta, segment_aggregates=seg_aggs)
|
| 141 |
+
proba = xgb_model.predict_proba(X)[0]
|
| 142 |
+
print(INT_TO_LABEL[int(np.argmax(proba))])
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
See [`inference_example.ipynb`](./inference_example.ipynb) for the full
|
| 146 |
+
copy-paste demo.
|
| 147 |
+
|
| 148 |
+
## Training data
|
| 149 |
+
|
| 150 |
+
Trained on the public sample of CYB005, 37,489 per-timestep telemetry
|
| 151 |
+
rows from 500 ransomware campaigns (up to 75 timesteps per campaign):
|
| 152 |
+
|
| 153 |
+
| Tier | Campaigns | Timestep rows | Campaign share |
|
| 154 |
+
|---|---:|---:|---:|
|
| 155 |
+
| `organised_syndicate` | 200 | 14,998 | 40.0% |
|
| 156 |
+
| `raas_affiliate` | 150 | 11,250 | 30.0% |
|
| 157 |
+
| `lone_actor` | 75 | 5,625 | 15.0% |
|
| 158 |
+
| `nation_state_nexus` | 75 | 5,616 | 15.0% |
|
| 159 |
+
|
| 160 |
+
### Group-aware split
|
| 161 |
+
|
| 162 |
+
A single campaign generates up to 75 highly-correlated timesteps. Random
|
| 163 |
+
row-level splitting would put timesteps from the same campaign in both
|
| 164 |
+
train and test, inflating metrics in a way that does not generalize to
|
| 165 |
+
new campaigns.
|
| 166 |
+
|
| 167 |
+
This release uses **GroupShuffleSplit by `campaign_id`** (nested,
|
| 168 |
+
70/15/15):
|
| 169 |
+
|
| 170 |
+
| Fold | Campaigns | Timesteps |
|
| 171 |
+
|---|---:|---:|
|
| 172 |
+
| Train | 350 | 26,242 |
|
| 173 |
+
| Validation | 75 | 5,624 |
|
| 174 |
+
| Test | 75 | 5,623 |
|
| 175 |
+
|
| 176 |
+
All test campaigns are completely unseen during training. Class imbalance
|
| 177 |
+
is addressed with `class_weight='balanced'` (XGBoost `sample_weight`) and
|
| 178 |
+
weighted cross-entropy (MLP).
|
| 179 |
+
|
| 180 |
+
## Feature pipeline
|
| 181 |
+
|
| 182 |
+
The bundled `feature_engineering.py` is the canonical feature recipe.
|
| 183 |
+
63 features survive after encoding, drawn from:
|
| 184 |
+
|
| 185 |
+
- **Per-timestep numeric** (15): `timestep`, `files_encrypted_cumulative`, `encryption_throughput_mbps`, `endpoints_compromised`, `lateral_move_count`, `credential_harvest_count`, `c2_bytes_exfiltrated`, `defender_alert_score`, `blast_radius_pct`, `living_off_land_score`, `attribution_risk_score`, `data_exfiltrated_gb`, `wiper_flag`, `double_extortion_flag`, `ir_activated`
|
| 186 |
+
- **Per-timestep categorical** (2, one-hot): `attack_phase`, `detection_outcome`
|
| 187 |
+
- **Victim segment** (10 numeric, 3 categorical one-hot): EDR coverage, network segmentation quality, patch posture, IR latency, endpoint count, AD domain complexity, SOC maturity score, backup recovery probability, backup recovery time, SIEM cadence; `segment_type`, `soc_maturity_tier`, `backup_maturity_tier`
|
| 188 |
+
- **Engineered** (6): `c2_intensity_score`, `escalation_velocity`, `is_destructive`, `dwell_efficiency`, `is_post_detonation`, `lotl_intensity_bin`
|
| 189 |
+
- **Ordinal** (1): `segment_id_hash` (segment ID hashed to integer)
|
| 190 |
+
|
| 191 |
+
### Leakage audit
|
| 192 |
+
|
| 193 |
+
Three columns were audited as potential tier oracles. **None were
|
| 194 |
+
dropped** for this task:
|
| 195 |
+
|
| 196 |
+
| Feature | Cross-tier ranges (mean) | Verdict |
|
| 197 |
+
|---|---|---|
|
| 198 |
+
| `attribution_risk_score` | lone 0.016 / nation_state 0.017 / organised 0.026 / raas 0.025 | Overlapping; NOT an oracle. Keep. |
|
| 199 |
+
| `living_off_land_score` | lone 0.05 / nation_state 0.20 / organised 0.16 / raas 0.13 | Mild correlation with massive overlap (std 0.08–0.25). Real observable. Keep. |
|
| 200 |
+
| `attack_phase` | Phase-purity vs tier is ~uniform | No oracle relationship. Keep. |
|
| 201 |
+
|
| 202 |
+
`detection_outcome` contains a `recovery_in_progress` value that is nearly
|
| 203 |
+
identical to the `attack_phase` value of the same name (purity 0.89 vs
|
| 204 |
+
phase), but this only matters for *phase* prediction, not *tier*
|
| 205 |
+
prediction. The column is kept as a feature for tier work.
|
| 206 |
+
|
| 207 |
+
The honest result of dropping the two candidate-leakage columns
|
| 208 |
+
(`attribution_risk_score` + `living_off_land_score`) is a 2 pp accuracy
|
| 209 |
+
reduction — confirming they provide modest legitimate signal, not oracle
|
| 210 |
+
leakage. They are kept in the published pipeline.
|
| 211 |
+
|
| 212 |
+
## Evaluation
|
| 213 |
+
|
| 214 |
+
### Test-set metrics, seed 42 (n = 5,623 timesteps from 75 disjoint campaigns)
|
| 215 |
+
|
| 216 |
+
**XGBoost** (the published `model_xgb.json` artifact)
|
| 217 |
+
|
| 218 |
+
| Metric | Value |
|
| 219 |
+
|---|---:|
|
| 220 |
+
| Macro ROC-AUC (OvR) | **0.8736** |
|
| 221 |
+
| Accuracy | **0.6898** |
|
| 222 |
+
| Macro-F1 | 0.6751 |
|
| 223 |
+
| Weighted-F1 | 0.6881 |
|
| 224 |
+
|
| 225 |
+
**MLP** (the published `model_mlp.safetensors` artifact)
|
| 226 |
+
|
| 227 |
+
| Metric | Value |
|
| 228 |
+
|---|---:|
|
| 229 |
+
| Macro ROC-AUC (OvR) | 0.8072 |
|
| 230 |
+
| Accuracy | 0.5118 |
|
| 231 |
+
| Macro-F1 | 0.5121 |
|
| 232 |
+
| Weighted-F1 | 0.5160 |
|
| 233 |
+
|
| 234 |
+
The MLP underperforms XGBoost on this task (a common pattern on tabular
|
| 235 |
+
data with limited training scale). Both are published so users can pick
|
| 236 |
+
the right tool, and disagreement between them is a useful triage signal.
|
| 237 |
+
|
| 238 |
+
### Multi-seed robustness (XGBoost, 10 seeds)
|
| 239 |
+
|
| 240 |
+
Stable performance across seeds — all 10 seeds yield all 4 tiers in
|
| 241 |
+
the test fold:
|
| 242 |
+
|
| 243 |
+
| Metric | Mean | Std | Min | Max |
|
| 244 |
+
|---|---:|---:|---:|---:|
|
| 245 |
+
| Accuracy | 0.603 | 0.040 | 0.533 | 0.690 |
|
| 246 |
+
| Macro-F1 | 0.593 | 0.047 | 0.509 | 0.675 |
|
| 247 |
+
| Macro ROC-AUC OvR | 0.853 | 0.031 | 0.796 | 0.891 |
|
| 248 |
+
|
| 249 |
+
Full per-seed results in [`multi_seed_results.json`](./multi_seed_results.json).
|
| 250 |
+
|
| 251 |
+
Seed 42 happens to be a stronger-than-average seed (acc 0.69 vs mean
|
| 252 |
+
0.60). The published artifact uses seed 42 because it produces clean
|
| 253 |
+
ROC-AUC computation; the **multi-seed aggregate ROC-AUC of 0.853 ± 0.031
|
| 254 |
+
is the honest performance estimate**.
|
| 255 |
+
|
| 256 |
+
### Per-class F1 (seed 42)
|
| 257 |
+
|
| 258 |
+
| Tier | Class share | XGBoost F1 | MLP F1 |
|
| 259 |
+
|---|---:|---:|---:|
|
| 260 |
+
| `organised_syndicate` | 40% | **0.739** | 0.520 |
|
| 261 |
+
| `nation_state_nexus` | 15% | **0.686** | 0.602 |
|
| 262 |
+
| `raas_affiliate` | 30% | 0.646 | 0.499 |
|
| 263 |
+
| `lone_actor` | 15% | 0.630 | 0.428 |
|
| 264 |
+
|
| 265 |
+
The model performs evenly across all four classes — no single tier
|
| 266 |
+
collapses. The strong performance on minority `nation_state_nexus`
|
| 267 |
+
(F1 0.69 despite only 15% prevalence) suggests the model picks up on
|
| 268 |
+
nation-state-specific behaviours (high LotL score, wiper deployment,
|
| 269 |
+
sustained C2 dwell) reliably. The hardest tier is `lone_actor`, the
|
| 270 |
+
behaviourally most variable class.
|
| 271 |
+
|
| 272 |
+
### Ablation: which feature groups matter
|
| 273 |
+
|
| 274 |
+
| Configuration | Accuracy | Macro-F1 | ROC-AUC | Δ accuracy |
|
| 275 |
+
|---|---:|---:|---:|---:|
|
| 276 |
+
| Full feature set (published) | 0.6898 | 0.6751 | 0.8736 | — |
|
| 277 |
+
| No behavioural features | 0.5673 | 0.5214 | 0.8107 | **−0.1225** |
|
| 278 |
+
| No topology features | 0.6146 | 0.6302 | 0.8707 | −0.0752 |
|
| 279 |
+
| No `timestep` | 0.6717 | 0.6417 | 0.8673 | −0.0181 |
|
| 280 |
+
| No engineered features | 0.6882 | 0.6563 | 0.8747 | −0.0016 |
|
| 281 |
+
|
| 282 |
+
Four findings:
|
| 283 |
+
|
| 284 |
+
1. **Behavioural features carry the most tier signal** (drops 12 pp accuracy,
|
| 285 |
+
15 pp macro-F1 when removed). This is the most important finding:
|
| 286 |
+
tier prediction is genuinely behaviour-driven, not a topology-lookup
|
| 287 |
+
shortcut. Sustained C2 intensity, lateral-move velocity, wiper
|
| 288 |
+
deployment, and LotL technique use jointly discriminate tiers.
|
| 289 |
+
2. **Topology contributes ~7 pp accuracy.** Defender posture (SOC
|
| 290 |
+
maturity, backup tier, EDR coverage) provides useful conditioning
|
| 291 |
+
context — actors target environments differently by tier.
|
| 292 |
+
3. **`timestep` matters much less than for phase prediction** (drops only
|
| 293 |
+
~2 pp). This is expected and good: phase prediction depends on
|
| 294 |
+
knowing *where* in the lifecycle you are; tier prediction depends on
|
| 295 |
+
*how* the actor operates, which is more invariant to timestep.
|
| 296 |
+
4. **Engineered features barely contribute on their own** — the trees
|
| 297 |
+
recover most of the c2_intensity, escalation_velocity, etc. signal
|
| 298 |
+
directly from the raw features. They remain in the pipeline as
|
| 299 |
+
a documented baseline-feature reference.
|
| 300 |
+
|
| 301 |
+
### Architecture
|
| 302 |
+
|
| 303 |
+
**XGBoost:** multi-class gradient boosting (`multi:softprob`, 4 classes),
|
| 304 |
+
`hist` tree method, class-balanced sample weights, early stopping on
|
| 305 |
+
validation mlogloss.
|
| 306 |
+
|
| 307 |
+
**MLP:** `63 → 128 → 64 → 4`, each hidden layer followed by `BatchNorm1d`
|
| 308 |
+
→ `ReLU` → `Dropout(0.3)`, weighted cross-entropy loss, AdamW optimizer,
|
| 309 |
+
early stopping on validation macro-F1.
|
| 310 |
+
|
| 311 |
+
Training hyperparameters (learning rate, batch size, n_estimators,
|
| 312 |
+
early-stopping patience, weight decay, class-weighting strategy) are
|
| 313 |
+
held internally by XpertSystems and are not part of this release.
|
| 314 |
+
|
| 315 |
+
## Limitations
|
| 316 |
+
|
| 317 |
+
**This is a baseline reference, not a production threat-attribution system.**
|
| 318 |
+
|
| 319 |
+
1. **Adjacent-tier confusion is honest.** The hardest discriminations
|
| 320 |
+
are `lone_actor` ↔ `raas_affiliate` (the largest lone-actor error in the
|
| 321 |
+
seed-42 XGBoost confusion matrix in `ablation_results.json`),
|
| 322 |
+
`nation_state_nexus` ↔ `organised_syndicate`, and `raas_affiliate` ↔ `organised_syndicate` (operationally similar in
|
| 323 |
+
mid-campaign). Confusion-matrix-aware downstream logic (e.g. flagging
|
| 324 |
+
disagreement between XGBoost and MLP for analyst review) is recommended.
|
| 325 |
+
|
| 326 |
+
2. **MLP weaker than XGBoost.** The MLP lags ~18 pp accuracy behind
|
| 327 |
+
XGBoost. This is a common pattern on tabular data when training set
|
| 328 |
+
sizes don't justify deep-model parameter counts. Both are published;
|
| 329 |
+
the recommendation is XGBoost as the primary predictor and the MLP
|
| 330 |
+
for disagreement-as-triage signal.
|
| 331 |
+
|
| 332 |
+
3. **Synthetic-vs-real transfer.** The dataset is synthetic and
|
| 333 |
+
calibrated to ransomware threat-intelligence benchmark targets
|
| 334 |
+
(Mandiant M-Trends, CrowdStrike GTR, Coveware Quarterly, Sophos
|
| 335 |
+
State of Ransomware, IBM CODB, Verizon DBIR, CISA #StopRansomware,
|
| 336 |
+
Chainalysis). Real ransomware telemetry has different noise
|
| 337 |
+
characteristics, adversary adaptation, and instrumentation gaps. Do
|
| 338 |
+
not assume metrics transfer.
|
| 339 |
+
|
| 340 |
+
4. **Adversarial robustness not evaluated.** The dataset is not
|
| 341 |
+
adversarially generated; the model has not been red-teamed against
|
| 342 |
+
tier-spoofing campaigns (a real attacker may deliberately mimic
|
| 343 |
+
another tier's TTPs to evade attribution).
|
| 344 |
+
|
| 345 |
+
5. **Per-tier sample sizes are still modest.** `lone_actor` and
|
| 346 |
+
`nation_state_nexus` have only 75 campaigns each in the sample (and fewer still in the 350-campaign training fold). The
|
| 347 |
+
full ~5,500-campaign CYB005 product (with ~825 per minority tier)
|
| 348 |
+
would tighten the per-class confidence intervals materially.
|
| 349 |
+
|
| 350 |
+
## Notes on dataset schema
|
| 351 |
+
|
| 352 |
+
The CYB005 sample dataset README describes some fields differently
|
| 353 |
+
from the actual schema. The model was trained on the actual schema;
|
| 354 |
+
this note helps buyers reconcile what they read with what they receive.
|
| 355 |
+
|
| 356 |
+
| What the README says | What the data actually contains |
|
| 357 |
+
|---|---|
|
| 358 |
+
| "7 attack phases" (initial_access, persistence, privilege_escalation, lateral_movement, data_exfiltration, encryption_deployment, ransom_demand) | **8 attack phases**: `initial_access`, `internal_recon`, `privilege_escalation`, `lateral_movement`, `exfiltration_staging`, `encryption_detonation`, `ransom_negotiation`, `recovery_in_progress`. (No `persistence` phase as a distinct value; `recovery_in_progress` is the dominant phase at 35% of rows because campaigns run beyond detonation.) |
|
| 359 |
+
| Backup tiers include `cloud_replicated`, `immutable_object_lock` | Backup tiers in the actual data use `offsite_unverified`, `offsite_verified_immutable` for those concepts |
|
| 360 |
+
| Summary has `campaign_outcome`, `dwell_time_pre_detonation_hrs` | Neither field exists. Use `total_dwell_time_hrs` and `campaign_success_flag` / `detection_phase` instead |
|
| 361 |
+
| Per-timestep includes `endpoints_compromised`, `lateral_pivots`, `edr_alerted`, `siem_correlated`, `lotl_technique_used`, `vss_deletion_attempted`, `wiper_component_deployed`, `dwell_hours`, `c2_beacon_active`, `backup_maturity_tier` | Actual per-timestep columns: `endpoints_compromised` ✓, `lateral_move_count` (not pivots), no `edr_alerted`/`siem_correlated`/`vss_deletion_attempted`/`dwell_hours`/`c2_beacon_active`; `defender_alert_score` and `attribution_risk_score` exist instead; `backup_maturity_tier` is only on per-campaign `victim_topology`, not per-timestep |
|
| 362 |
+
|
| 363 |
+
None of these discrepancies affects model correctness — the feature
|
| 364 |
+
pipeline uses the actual column names. If you build your own pipeline
|
| 365 |
+
against the dataset, use the actual columns.
|
| 366 |
+
|
| 367 |
+
## Intended use
|
| 368 |
+
|
| 369 |
+
- **Evaluating fit** of the CYB005 dataset for your threat-attribution
|
| 370 |
+
or ransomware-research work
|
| 371 |
+
- **Baseline reference** for new model architectures (especially
|
| 372 |
+
sequence models, which should beat this baseline by leveraging
|
| 373 |
+
temporal context across the 75-step campaign)
|
| 374 |
+
- **Teaching and demo** for multi-class tabular classification on
|
| 375 |
+
cybersecurity telemetry
|
| 376 |
+
- **Feature engineering reference** for ransomware campaign attribution
|
| 377 |
+
|
| 378 |
+
## Out-of-scope use
|
| 379 |
+
|
| 380 |
+
- Production threat-actor attribution on real ransomware campaigns
|
| 381 |
+
- Incident-response decision-making on real systems
|
| 382 |
+
- Adversarial-evasion evaluation (dataset not adversarially generated)
|
| 383 |
+
- Any operational security or law-enforcement decision
|
| 384 |
+
|
| 385 |
+
## Reproducibility
|
| 386 |
+
|
| 387 |
+
Outputs above were produced with `seed = 42` (published artifact),
|
| 388 |
+
group-aware nested `GroupShuffleSplit` (70/15/15 by campaign_id), on the
|
| 389 |
+
published sample (`xpertsystems/cyb005-sample`, version 1.0.0, generated
|
| 390 |
+
2026-05-16). The feature pipeline in `feature_engineering.py` is
|
| 391 |
+
deterministic and the trained weights in this repo correspond exactly
|
| 392 |
+
to the metrics above.
|
| 393 |
+
|
| 394 |
+
Multi-seed results (seeds 42, 7, 13, 17, 23, 31, 45, 99, 123, 200) in
|
| 395 |
+
`multi_seed_results.json` confirm robust performance across splits.
|
| 396 |
+
|
| 397 |
+
The training script itself is private to XpertSystems.
|
| 398 |
+
|
| 399 |
+
## Files in this repo
|
| 400 |
+
|
| 401 |
+
| File | Purpose |
|
| 402 |
+
|---|---|
|
| 403 |
+
| `model_xgb.json` | XGBoost weights (seed 42) |
|
| 404 |
+
| `model_mlp.safetensors` | PyTorch MLP weights (seed 42) |
|
| 405 |
+
| `feature_engineering.py` | Feature pipeline (load → join topology → engineer → encode) |
|
| 406 |
+
| `feature_meta.json` | Feature column order + categorical levels |
|
| 407 |
+
| `feature_scaler.json` | MLP input mean/std (XGBoost ignores) |
|
| 408 |
+
| `validation_results.json` | Per-class metrics, confusion matrix, architecture |
|
| 409 |
+
| `ablation_results.json` | Per-feature-group ablation |
|
| 410 |
+
| `multi_seed_results.json` | XGBoost metrics across 10 seeds with aggregate statistics |
|
| 411 |
+
| `inference_example.ipynb` | End-to-end inference demo notebook |
|
| 412 |
+
| `README.md` | This file |
|
| 413 |
+
|
| 414 |
+
## Contact and full product
|
| 415 |
+
|
| 416 |
+
The full **CYB005** dataset contains ~358,000 rows across four files,
|
| 417 |
+
with calibrated benchmark validation against 12 metrics drawn from
|
| 418 |
+
authoritative ransomware threat-intelligence sources (Mandiant
|
| 419 |
+
M-Trends, CrowdStrike GTR, Coveware Quarterly Ransomware Report,
|
| 420 |
+
Sophos State of Ransomware, IBM CODB, Verizon DBIR, CISA
|
| 421 |
+
#StopRansomware, Chainalysis). The full XpertSystems.ai synthetic
|
| 422 |
+
data catalogue spans 41 SKUs across Cybersecurity, Healthcare,
|
| 423 |
+
Insurance & Risk, Oil & Gas, and Materials & Energy.
|
| 424 |
+
|
| 425 |
+
- 📧 **pradeep@xpertsystems.ai**
|
| 426 |
+
- 🌐 **https://xpertsystems.ai**
|
| 427 |
+
- 🗂 Dataset: https://huggingface.co/datasets/xpertsystems/cyb005-sample
|
| 428 |
+
- 🤖 Companion models:
|
| 429 |
+
- https://huggingface.co/xpertsystems/cyb001-baseline-classifier (network traffic)
|
| 430 |
+
- https://huggingface.co/xpertsystems/cyb002-baseline-classifier (ATT&CK kill-chain)
|
| 431 |
+
- https://huggingface.co/xpertsystems/cyb003-baseline-classifier (malware execution phase)
|
| 432 |
+
- https://huggingface.co/xpertsystems/cyb004-baseline-classifier (phishing campaign phase)
|
| 433 |
+
|
| 434 |
+
## Citation
|
| 435 |
+
|
| 436 |
+
```bibtex
|
| 437 |
+
@misc{xpertsystems_cyb005_baseline_2026,
|
| 438 |
+
title = {CYB005 Baseline Classifier: XGBoost and MLP for Ransomware Actor-Tier Attribution},
|
| 439 |
+
author = {XpertSystems.ai},
|
| 440 |
+
year = {2026},
|
| 441 |
+
url = {https://huggingface.co/xpertsystems/cyb005-baseline-classifier},
|
| 442 |
+
note = {Baseline reference model trained on xpertsystems/cyb005-sample}
|
| 443 |
+
}
|
| 444 |
+
```
|
ablation_results.json
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"purpose": "Quantify how much each feature group contributes to the headline XGBoost score. Identical architecture, same group-aware split, with one feature group dropped at a time.",
|
| 3 |
+
"full_model_metrics": {
|
| 4 |
+
"model": "xgboost",
|
| 5 |
+
"accuracy": 0.6898452783211808,
|
| 6 |
+
"macro_f1": 0.6751447018282526,
|
| 7 |
+
"weighted_f1": 0.6881356546405818,
|
| 8 |
+
"per_class_f1": {
|
| 9 |
+
"lone_actor": 0.6297297297297297,
|
| 10 |
+
"organised_syndicate": 0.7391393864525427,
|
| 11 |
+
"raas_affiliate": 0.6458906202260922,
|
| 12 |
+
"nation_state_nexus": 0.6858190709046454
|
| 13 |
+
},
|
| 14 |
+
"confusion_matrix": {
|
| 15 |
+
"labels": [
|
| 16 |
+
"lone_actor",
|
| 17 |
+
"organised_syndicate",
|
| 18 |
+
"raas_affiliate",
|
| 19 |
+
"nation_state_nexus"
|
| 20 |
+
],
|
| 21 |
+
"matrix": [
|
| 22 |
+
[
|
| 23 |
+
466,
|
| 24 |
+
67,
|
| 25 |
+
216,
|
| 26 |
+
1
|
| 27 |
+
],
|
| 28 |
+
[
|
| 29 |
+
83,
|
| 30 |
+
1795,
|
| 31 |
+
275,
|
| 32 |
+
172
|
| 33 |
+
],
|
| 34 |
+
[
|
| 35 |
+
156,
|
| 36 |
+
433,
|
| 37 |
+
1057,
|
| 38 |
+
79
|
| 39 |
+
],
|
| 40 |
+
[
|
| 41 |
+
25,
|
| 42 |
+
237,
|
| 43 |
+
0,
|
| 44 |
+
561
|
| 45 |
+
]
|
| 46 |
+
]
|
| 47 |
+
},
|
| 48 |
+
"macro_roc_auc_ovr": 0.873606865711172
|
| 49 |
+
},
|
| 50 |
+
"ablations": {
|
| 51 |
+
"no_topology": {
|
| 52 |
+
"n_features": 35,
|
| 53 |
+
"dropped_count": 28,
|
| 54 |
+
"metrics": {
|
| 55 |
+
"model": "xgboost_no_topology",
|
| 56 |
+
"accuracy": 0.6146185310332563,
|
| 57 |
+
"macro_f1": 0.630244354214636,
|
| 58 |
+
"weighted_f1": 0.6146007963862242,
|
| 59 |
+
"per_class_f1": {
|
| 60 |
+
"lone_actor": 0.5802285146547441,
|
| 61 |
+
"organised_syndicate": 0.595659765527563,
|
| 62 |
+
"raas_affiliate": 0.5862656072644722,
|
| 63 |
+
"nation_state_nexus": 0.7588235294117647
|
| 64 |
+
},
|
| 65 |
+
"confusion_matrix": {
|
| 66 |
+
"labels": [
|
| 67 |
+
"lone_actor",
|
| 68 |
+
"organised_syndicate",
|
| 69 |
+
"raas_affiliate",
|
| 70 |
+
"nation_state_nexus"
|
| 71 |
+
],
|
| 72 |
+
"matrix": [
|
| 73 |
+
[
|
| 74 |
+
584,
|
| 75 |
+
41,
|
| 76 |
+
110,
|
| 77 |
+
15
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
308,
|
| 81 |
+
1194,
|
| 82 |
+
655,
|
| 83 |
+
168
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
273,
|
| 87 |
+
370,
|
| 88 |
+
1033,
|
| 89 |
+
49
|
| 90 |
+
],
|
| 91 |
+
[
|
| 92 |
+
98,
|
| 93 |
+
79,
|
| 94 |
+
1,
|
| 95 |
+
645
|
| 96 |
+
]
|
| 97 |
+
]
|
| 98 |
+
},
|
| 99 |
+
"macro_roc_auc_ovr": 0.8706652220620055
|
| 100 |
+
},
|
| 101 |
+
"delta_accuracy": 0.07522674728792456,
|
| 102 |
+
"delta_macro_f1": 0.04490034761361661
|
| 103 |
+
},
|
| 104 |
+
"no_behavioural": {
|
| 105 |
+
"n_features": 36,
|
| 106 |
+
"dropped_count": 27,
|
| 107 |
+
"metrics": {
|
| 108 |
+
"model": "xgboost_no_behavioural",
|
| 109 |
+
"accuracy": 0.5673128223368309,
|
| 110 |
+
"macro_f1": 0.5213632789864133,
|
| 111 |
+
"weighted_f1": 0.5706324884542183,
|
| 112 |
+
"per_class_f1": {
|
| 113 |
+
"lone_actor": 0.44366608289550497,
|
| 114 |
+
"organised_syndicate": 0.6739977090492555,
|
| 115 |
+
"raas_affiliate": 0.5680505911465493,
|
| 116 |
+
"nation_state_nexus": 0.3997387328543436
|
| 117 |
+
},
|
| 118 |
+
"confusion_matrix": {
|
| 119 |
+
"labels": [
|
| 120 |
+
"lone_actor",
|
| 121 |
+
"organised_syndicate",
|
| 122 |
+
"raas_affiliate",
|
| 123 |
+
"nation_state_nexus"
|
| 124 |
+
],
|
| 125 |
+
"matrix": [
|
| 126 |
+
[
|
| 127 |
+
380,
|
| 128 |
+
45,
|
| 129 |
+
306,
|
| 130 |
+
19
|
| 131 |
+
],
|
| 132 |
+
[
|
| 133 |
+
101,
|
| 134 |
+
1471,
|
| 135 |
+
498,
|
| 136 |
+
255
|
| 137 |
+
],
|
| 138 |
+
[
|
| 139 |
+
319,
|
| 140 |
+
245,
|
| 141 |
+
1033,
|
| 142 |
+
128
|
| 143 |
+
],
|
| 144 |
+
[
|
| 145 |
+
163,
|
| 146 |
+
279,
|
| 147 |
+
75,
|
| 148 |
+
306
|
| 149 |
+
]
|
| 150 |
+
]
|
| 151 |
+
},
|
| 152 |
+
"macro_roc_auc_ovr": 0.8106558391572862
|
| 153 |
+
},
|
| 154 |
+
"delta_accuracy": 0.12253245598434992,
|
| 155 |
+
"delta_macro_f1": 0.15378142284183927
|
| 156 |
+
},
|
| 157 |
+
"no_timestep": {
|
| 158 |
+
"n_features": 62,
|
| 159 |
+
"dropped_count": 1,
|
| 160 |
+
"metrics": {
|
| 161 |
+
"model": "xgboost_no_timestep",
|
| 162 |
+
"accuracy": 0.6717054952872132,
|
| 163 |
+
"macro_f1": 0.6417349625987673,
|
| 164 |
+
"weighted_f1": 0.6719572046072043,
|
| 165 |
+
"per_class_f1": {
|
| 166 |
+
"lone_actor": 0.5438813349814586,
|
| 167 |
+
"organised_syndicate": 0.7479365079365079,
|
| 168 |
+
"raas_affiliate": 0.6453731343283582,
|
| 169 |
+
"nation_state_nexus": 0.6297488731487444
|
| 170 |
+
},
|
| 171 |
+
"confusion_matrix": {
|
| 172 |
+
"labels": [
|
| 173 |
+
"lone_actor",
|
| 174 |
+
"organised_syndicate",
|
| 175 |
+
"raas_affiliate",
|
| 176 |
+
"nation_state_nexus"
|
| 177 |
+
],
|
| 178 |
+
"matrix": [
|
| 179 |
+
[
|
| 180 |
+
440,
|
| 181 |
+
66,
|
| 182 |
+
240,
|
| 183 |
+
4
|
| 184 |
+
],
|
| 185 |
+
[
|
| 186 |
+
154,
|
| 187 |
+
1767,
|
| 188 |
+
230,
|
| 189 |
+
174
|
| 190 |
+
],
|
| 191 |
+
[
|
| 192 |
+
169,
|
| 193 |
+
412,
|
| 194 |
+
1081,
|
| 195 |
+
63
|
| 196 |
+
],
|
| 197 |
+
[
|
| 198 |
+
105,
|
| 199 |
+
155,
|
| 200 |
+
74,
|
| 201 |
+
489
|
| 202 |
+
]
|
| 203 |
+
]
|
| 204 |
+
},
|
| 205 |
+
"macro_roc_auc_ovr": 0.8672596014037719
|
| 206 |
+
},
|
| 207 |
+
"delta_accuracy": 0.01813978303396757,
|
| 208 |
+
"delta_macro_f1": 0.033409739229485313
|
| 209 |
+
},
|
| 210 |
+
"no_engineered": {
|
| 211 |
+
"n_features": 57,
|
| 212 |
+
"dropped_count": 6,
|
| 213 |
+
"metrics": {
|
| 214 |
+
"model": "xgboost_no_engineered",
|
| 215 |
+
"accuracy": 0.6882447092299484,
|
| 216 |
+
"macro_f1": 0.6562913668551777,
|
| 217 |
+
"weighted_f1": 0.6881813027750402,
|
| 218 |
+
"per_class_f1": {
|
| 219 |
+
"lone_actor": 0.5686274509803921,
|
| 220 |
+
"organised_syndicate": 0.7419631375910845,
|
| 221 |
+
"raas_affiliate": 0.7053364269141531,
|
| 222 |
+
"nation_state_nexus": 0.6092384519350812
|
| 223 |
+
},
|
| 224 |
+
"confusion_matrix": {
|
| 225 |
+
"labels": [
|
| 226 |
+
"lone_actor",
|
| 227 |
+
"organised_syndicate",
|
| 228 |
+
"raas_affiliate",
|
| 229 |
+
"nation_state_nexus"
|
| 230 |
+
],
|
| 231 |
+
"matrix": [
|
| 232 |
+
[
|
| 233 |
+
435,
|
| 234 |
+
71,
|
| 235 |
+
219,
|
| 236 |
+
25
|
| 237 |
+
],
|
| 238 |
+
[
|
| 239 |
+
127,
|
| 240 |
+
1731,
|
| 241 |
+
287,
|
| 242 |
+
180
|
| 243 |
+
],
|
| 244 |
+
[
|
| 245 |
+
107,
|
| 246 |
+
316,
|
| 247 |
+
1216,
|
| 248 |
+
86
|
| 249 |
+
],
|
| 250 |
+
[
|
| 251 |
+
111,
|
| 252 |
+
223,
|
| 253 |
+
1,
|
| 254 |
+
488
|
| 255 |
+
]
|
| 256 |
+
]
|
| 257 |
+
},
|
| 258 |
+
"macro_roc_auc_ovr": 0.874702950892121
|
| 259 |
+
},
|
| 260 |
+
"delta_accuracy": 0.0016005690912324066,
|
| 261 |
+
"delta_macro_f1": 0.018853334973074842
|
| 262 |
+
}
|
| 263 |
+
}
|
| 264 |
+
}
|
feature_engineering.py
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
feature_engineering.py
|
| 3 |
+
======================
|
| 4 |
+
|
| 5 |
+
Feature pipeline for the CYB005 baseline classifier.
|
| 6 |
+
|
| 7 |
+
Predicts `actor_capability_tier` (4-class) from per-timestep ransomware
|
| 8 |
+
campaign telemetry on the CYB005 sample dataset.
|
| 9 |
+
|
| 10 |
+
CSV inputs:
|
| 11 |
+
attack_timelines.csv (primary, one row per timestep, 500 campaigns
|
| 12 |
+
x ~75 timesteps = 37,489 rows)
|
| 13 |
+
victim_topology.csv (per-segment defender configuration, joined
|
| 14 |
+
on target_segment_id; one row per segment)
|
| 15 |
+
campaign_summary.csv (per-campaign aggregates; reserved for future
|
| 16 |
+
work - many fields are post-hoc outcomes that
|
| 17 |
+
would leak the tier through training)
|
| 18 |
+
campaign_events.csv (discrete event log; reserved for future work)
|
| 19 |
+
|
| 20 |
+
Target classes (4):
|
| 21 |
+
lone_actor, organised_syndicate, raas_affiliate, nation_state_nexus
|
| 22 |
+
|
| 23 |
+
Sample size note
|
| 24 |
+
----------------
|
| 25 |
+
CYB005's sample is intentionally larger than its sister datasets (500
|
| 26 |
+
campaigns vs 100 in CYB002/3/4). The README states this is because
|
| 27 |
+
"benchmarks are conditional on small actor-tier subsets". The larger
|
| 28 |
+
sample makes tier attribution genuinely learnable here, where it was
|
| 29 |
+
not in CYB003/CYB004.
|
| 30 |
+
|
| 31 |
+
Leakage audit
|
| 32 |
+
-------------
|
| 33 |
+
Three columns inspected for tier leakage:
|
| 34 |
+
- `attribution_risk_score` - mean 0.016-0.026 across tiers, ranges
|
| 35 |
+
overlap heavily. NOT an oracle; keep.
|
| 36 |
+
- `living_off_land_score` - mean 0.05 (lone) to 0.20 (nation_state),
|
| 37 |
+
with substantial overlap (std 0.08-0.25). Real observable, not
|
| 38 |
+
an oracle; keep.
|
| 39 |
+
- `attack_phase` - 89% purity vs `detection_outcome` (recovery_in_progress
|
| 40 |
+
is a 1:1 alias), but for TIER prediction it has no oracle relationship.
|
| 41 |
+
Keep.
|
| 42 |
+
|
| 43 |
+
No columns are dropped for tier prediction. The model is trained on what
|
| 44 |
+
a SOC analyst would actually see at observation time.
|
| 45 |
+
|
| 46 |
+
Public API
|
| 47 |
+
----------
|
| 48 |
+
build_features(timelines_path, topology_path)
|
| 49 |
+
-> (X, y, groups, meta)
|
| 50 |
+
transform_single(record, meta, segment_aggregates=None) -> np.ndarray
|
| 51 |
+
save_meta(meta, path) / load_meta(path)
|
| 52 |
+
build_segment_lookup(topology_path) -> dict
|
| 53 |
+
|
| 54 |
+
License
|
| 55 |
+
-------
|
| 56 |
+
Ships with the public model on Hugging Face under CC-BY-NC-4.0,
|
| 57 |
+
matching the dataset license. See README.md.
|
| 58 |
+
"""
|
| 59 |
+
|
| 60 |
+
from __future__ import annotations
|
| 61 |
+
|
| 62 |
+
import json
|
| 63 |
+
from pathlib import Path
|
| 64 |
+
from typing import Any
|
| 65 |
+
|
| 66 |
+
import numpy as np
|
| 67 |
+
import pandas as pd
|
| 68 |
+
|
| 69 |
+
# ---------------------------------------------------------------------------
|
| 70 |
+
# Label space
|
| 71 |
+
# ---------------------------------------------------------------------------
|
| 72 |
+
|
| 73 |
+
# Ordered roughly by capability: lone -> nation_state. Class imbalance:
|
| 74 |
+
# organised_syndicate (40%), raas_affiliate (30%), lone_actor (15%),
|
| 75 |
+
# nation_state_nexus (15%).
|
| 76 |
+
# Canonical class order: a label's position in this list is its integer id.
LABEL_ORDER = [
    "lone_actor",
    "organised_syndicate",
    "raas_affiliate",
    "nation_state_nexus",
]
# Forward (name -> id) and reverse (id -> name) lookups derived from the order.
LABEL_TO_INT = dict(zip(LABEL_ORDER, range(len(LABEL_ORDER))))
INT_TO_LABEL = dict(enumerate(LABEL_ORDER))
|
| 84 |
+
|
| 85 |
+
# ---------------------------------------------------------------------------
|
| 86 |
+
# Identifier and target columns - not features
|
| 87 |
+
# ---------------------------------------------------------------------------
|
| 88 |
+
|
| 89 |
+
# Row identifiers. build_features() drops them from the feature matrix;
# campaign_id is additionally returned as the `groups` key for grouped CV.
ID_COLUMNS = ["campaign_id", "actor_id"]
# Label column; its string values are mapped to integers via LABEL_TO_INT.
TARGET_COLUMN = "actor_capability_tier"

# No columns dropped for leakage. See module docstring's "Leakage audit"
# for the rationale on each candidate.
LEAKY_COLUMNS: list[str] = []
|
| 95 |
+
|
| 96 |
+
# ---------------------------------------------------------------------------
|
| 97 |
+
# Per-timestep numeric features
|
| 98 |
+
# ---------------------------------------------------------------------------
|
| 99 |
+
|
| 100 |
+
# Raw numeric columns of attack_timelines.csv used as-is. Order is
# significant: build_features() places these first in X, in this order,
# so reordering the list reorders the model's input columns.
DIRECT_NUMERIC_TIMESTEP_FEATURES = [
    "timestep",                       # position in 75-step lifecycle
    "files_encrypted_cumulative",
    "encryption_throughput_mbps",
    "endpoints_compromised",
    "lateral_move_count",
    "credential_harvest_count",
    "c2_bytes_exfiltrated",
    "defender_alert_score",
    "blast_radius_pct",
    "living_off_land_score",
    "attribution_risk_score",
    "data_exfiltrated_gb",
    "wiper_flag",
    "double_extortion_flag",
    "ir_activated",
]

# Per-timestep categoricals to one-hot
CATEGORICAL_TIMESTEP_FEATURES = [
    "attack_phase",                   # 8 phases
    "detection_outcome",              # 5 outcomes incl. recovery_in_progress
]
|
| 123 |
+
|
| 124 |
+
# ---------------------------------------------------------------------------
|
| 125 |
+
# Victim topology features (joined on target_segment_id == segment_id)
|
| 126 |
+
# ---------------------------------------------------------------------------
|
| 127 |
+
# victim_topology.csv is segment-level (300 rows, one per segment). Each
|
| 128 |
+
# campaign targets one segment, so these become per-campaign-constant
|
| 129 |
+
# features. They provide useful conditioning context (what defender
|
| 130 |
+
# posture is the actor working against) without being tier oracles.
|
| 131 |
+
|
| 132 |
+
# Numeric columns taken from victim_topology.csv. build_features() appends
# them to X right after the per-timestep numeric columns, in this order.
TOPOLOGY_NUMERIC_FEATURES = [
    "edr_coverage_rate",
    "network_segmentation_quality",
    "patch_posture_score",
    "ir_activation_latency_hrs",
    "endpoint_count",
    "ad_domain_complexity",
    "soc_maturity_score",
    "backup_recovery_prob",
    "backup_recovery_hrs_mean",
    "siem_rule_refresh_cadence_days",
]

# Categorical columns from victim_topology.csv; each is one-hot encoded.
TOPOLOGY_CATEGORICAL_FEATURES = [
    "segment_type",          # 8 values: corporate_workstation_fleet / dmz_perimeter / cloud_workload_tier / ot_ics_control_network / ...
    "soc_maturity_tier",     # 4 values: none / tier1 / tier2 / tier3_mdr
    "backup_maturity_tier",  # 6 values: no_backup / local_only / network_attached / ...
]
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# ---------------------------------------------------------------------------
|
| 153 |
+
# Engineered features
|
| 154 |
+
# ---------------------------------------------------------------------------
|
| 155 |
+
|
| 156 |
+
def _add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Append six engineered, tier-discriminative features to a copy of ``df``.

    Each is a behavioural composite that a threat analyst would compute
    by hand to distinguish actor sophistication levels.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``c2_bytes_exfiltrated``, ``encryption_throughput_mbps``,
        ``lateral_move_count``, ``timestep``, ``wiper_flag``,
        ``double_extortion_flag``, ``blast_radius_pct`` and
        ``living_off_land_score``. The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the added columns ``c2_intensity_score``,
        ``escalation_velocity``, ``is_destructive``, ``dwell_efficiency``,
        ``is_post_detonation`` and ``lotl_intensity_bin``.

    Raises
    ------
    KeyError
        If any of the required input columns is missing.
    """
    df = df.copy()

    # 1. C2 intensity: data exfiltration combined with encryption throughput.
    #    Nation-state and organised tiers tend to sustain higher both;
    #    lone actors burst then quiet down. Negative inputs are clipped to 0
    #    so log1p never sees a value below 0.
    df["c2_intensity_score"] = np.log1p(
        df["c2_bytes_exfiltrated"].clip(lower=0)
        * df["encryption_throughput_mbps"].clip(lower=0)
    ).astype(float)

    # Timesteps below 1 are treated as 1 so the per-timestep rates below
    # never divide by zero.
    elapsed = df["timestep"].clip(lower=1)

    # 2. Escalation velocity: lateral moves per timestep elapsed.
    #    Higher = aggressive (raas/syndicate). Lower = methodical (apt).
    df["escalation_velocity"] = (df["lateral_move_count"] / elapsed).astype(float)

    # 3. Destructive intent: wiper or double_extortion deployed.
    #    Wiper is a strong nation_state signature.
    df["is_destructive"] = (
        (df["wiper_flag"] == 1) | (df["double_extortion_flag"] == 1)
    ).astype(int)

    # 4. Dwell efficiency: blast radius per timestep. High = fast,
    #    low = patient. Helps separate organised_syndicate (fast) from
    #    nation_state_nexus (patient).
    df["dwell_efficiency"] = (df["blast_radius_pct"] / elapsed).astype(float)

    # 5. Post-detonation indicator. Timesteps after 50 are typically
    #    encryption_detonation / ransom_negotiation / recovery phases,
    #    which surface tier signal through ransom posture.
    df["is_post_detonation"] = (df["timestep"] > 50).astype(int)

    # 6. LotL intensity bin. Fixed-threshold bins (<=0.1, <=0.3, <=0.6, >0.6)
    #    of living_off_land_score give the trees a categorical view of an
    #    otherwise continuous tier-correlated feature.
    #    pd.cut yields NaN for missing or out-of-range scores and a
    #    categorical holding NaN cannot be cast to int, so the score is
    #    clipped to [0, 1] and missing values are treated as 0 (the lowest
    #    bin). In-range inputs get the same bin as before.
    lotl_score = (
        df["living_off_land_score"].astype(float).clip(lower=0.0, upper=1.0).fillna(0.0)
    )
    df["lotl_intensity_bin"] = pd.cut(
        lotl_score, bins=[-0.01, 0.1, 0.3, 0.6, 1.01],
        labels=[0, 1, 2, 3],
    ).astype(int)

    return df
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
# ---------------------------------------------------------------------------
|
| 208 |
+
# Public API
|
| 209 |
+
# ---------------------------------------------------------------------------
|
| 210 |
+
|
| 211 |
+
def build_features(
    timelines_path: str | Path,
    topology_path: str | Path,
) -> tuple[pd.DataFrame, pd.Series, pd.Series, dict[str, Any]]:
    """
    Load CSVs, join topology, drop target + identifiers, engineer features,
    one-hot encode, return (X, y, groups, meta).

    Parameters
    ----------
    timelines_path : str | Path
        Path to attack_timelines.csv (one row per timestep).
    topology_path : str | Path
        Path to victim_topology.csv (one row per segment).

    Returns
    -------
    X : pd.DataFrame
        Numeric columns followed by one-hot blocks; NaNs are filled with 0.
    y : pd.Series
        Integer class ids (see ``LABEL_TO_INT``), aligned with X by position.
    groups : pd.Series
        campaign_id values aligned with X. Use it with GroupShuffleSplit /
        GroupKFold so train and test sets contain disjoint campaigns - each
        campaign generates 75 highly-correlated timesteps.
    meta : dict
        Feature names, numeric feature list, categorical levels and label
        maps needed to encode records identically at inference time.

    Raises
    ------
    ValueError
        If the target column holds a value not present in ``LABEL_TO_INT``.
    pandas.errors.MergeError
        If the topology file has more than one row for a ``segment_id``.
    """
    timelines = pd.read_csv(timelines_path)
    topo = pd.read_csv(topology_path)

    y = timelines[TARGET_COLUMN].map(LABEL_TO_INT)
    if y.isna().any():
        bad = timelines.loc[y.isna(), TARGET_COLUMN].unique()
        raise ValueError(f"Unknown actor_capability_tier values: {bad}")
    y = y.astype(int)
    groups = timelines["campaign_id"].copy()

    timelines = timelines.drop(
        columns=ID_COLUMNS + [TARGET_COLUMN] + LEAKY_COLUMNS, errors="ignore",
    )

    # Join victim topology features on target_segment_id == segment_id.
    # y and groups were taken before the join and are matched to X purely by
    # row position, so the join must not add rows: validate="many_to_one"
    # turns a duplicated segment_id in the topology file into an explicit
    # error instead of a silently misaligned X.
    topo_cols_needed = (
        ["segment_id"] + TOPOLOGY_NUMERIC_FEATURES + TOPOLOGY_CATEGORICAL_FEATURES
    )
    timelines = timelines.merge(
        topo[topo_cols_needed],
        left_on="target_segment_id", right_on="segment_id", how="left",
        validate="many_to_one",
    ).drop(columns=["segment_id"], errors="ignore")

    # target_segment_id is high-cardinality (251 unique). Rather than one-hot
    # it, encode it as a single ordinal column of pandas category codes
    # (not a hash, despite the column name). The codes depend on the set of
    # segment ids present in this file; a missing id gets code -1.
    timelines["segment_id_hash"] = (
        timelines["target_segment_id"].astype("category").cat.codes.astype(float)
    )
    timelines = timelines.drop(columns=["target_segment_id"])

    timelines = _add_engineered_features(timelines)

    numeric_features = (
        DIRECT_NUMERIC_TIMESTEP_FEATURES
        + TOPOLOGY_NUMERIC_FEATURES
        + [
            "segment_id_hash",
            "c2_intensity_score", "escalation_velocity", "is_destructive",
            "dwell_efficiency", "is_post_detonation", "lotl_intensity_bin",
        ]
    )
    X_numeric = timelines[numeric_features].astype(float)

    # One-hot encode every categorical column that is present. Levels are
    # sorted and recorded in meta so inference can rebuild the same columns.
    categorical_levels: dict[str, list[str]] = {}
    blocks: list[pd.DataFrame] = []
    for col in CATEGORICAL_TIMESTEP_FEATURES + TOPOLOGY_CATEGORICAL_FEATURES:
        if col not in timelines.columns:
            continue
        levels = sorted(timelines[col].dropna().unique().tolist())
        categorical_levels[col] = levels
        block = pd.get_dummies(
            timelines[col].astype("category").cat.set_categories(levels),
            prefix=col, dummy_na=False,
        ).astype(int)
        blocks.append(block)

    X = pd.concat(
        [X_numeric.reset_index(drop=True)]
        + [b.reset_index(drop=True) for b in blocks],
        axis=1,
    ).fillna(0.0)

    # Copies are stored so that mutating the returned meta cannot alter the
    # module-level label maps or the leakage list.
    meta = {
        "feature_names": X.columns.tolist(),
        "numeric_features": list(numeric_features),
        "categorical_levels": categorical_levels,
        "label_to_int": dict(LABEL_TO_INT),
        "int_to_label": dict(INT_TO_LABEL),
        "leakage_excluded": list(LEAKY_COLUMNS),
    }
    return X, y, groups, meta
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def transform_single(
    record: dict | pd.DataFrame,
    meta: dict[str, Any],
    segment_aggregates: dict | None = None,
) -> np.ndarray:
    """
    Encode one timestep record (or a frame of records) for inference.

    Parameters
    ----------
    record : dict | pd.DataFrame
        Raw timestep fields. A dict is treated as a single row; a DataFrame
        is encoded row by row in its existing order. The input is not
        modified.
    meta : dict
        Must provide ``numeric_features``, ``categorical_levels`` and
        ``feature_names``.
    segment_aggregates : dict | None
        Optional ``{column: value}`` pairs assigned to every row before
        encoding (e.g. the topology values for the targeted segment).

    Returns
    -------
    np.ndarray
        float32 array with one row per input row and one column per entry
        of ``meta["feature_names"]``, in that order.
    """
    if isinstance(record, dict):
        df = pd.DataFrame([record.copy()])
    else:
        # The numeric block below is rebuilt positionally while the one-hot
        # blocks keep the frame's index; with a non-default index (e.g. a
        # row sliced out of a larger frame) pd.concat(axis=1) would align
        # on labels and emit extra zero-filled rows. Normalise the index.
        df = record.copy().reset_index(drop=True)

    if segment_aggregates is not None:
        for k, v in segment_aggregates.items():
            df[k] = v

    # If segment_id_hash isn't supplied it defaults to 0.0.
    # NOTE(review): build_features() encodes segments as category codes, so
    # 0.0 is also the code of a real training segment rather than a distinct
    # "unknown" value; callers needing the training-time code must pass
    # segment_id_hash in `record` or `segment_aggregates`.
    if "segment_id_hash" not in df.columns:
        df["segment_id_hash"] = 0.0
    if "target_segment_id" in df.columns:
        df = df.drop(columns=["target_segment_id"])

    df = _add_engineered_features(df)

    n_rows = len(df)
    # Numeric features absent from the record are filled with 0.0.
    numeric = pd.DataFrame({
        col: df.get(col, pd.Series([0.0] * n_rows)).astype(float).values
        for col in meta["numeric_features"]
    })
    blocks: list[pd.DataFrame] = [numeric]
    for col, levels in meta["categorical_levels"].items():
        val = df.get(col, pd.Series([None] * n_rows))
        # Values outside the recorded levels become NaN and therefore an
        # all-zero one-hot row.
        block = pd.get_dummies(
            val.astype("category").cat.set_categories(levels),
            prefix=col, dummy_na=False,
        ).astype(int)
        # Fix the column order and add any level column that is missing.
        block = block.reindex(
            columns=[f"{col}_{lvl}" for lvl in levels], fill_value=0,
        )
        blocks.append(block)

    X = pd.concat(blocks, axis=1).fillna(0.0)
    X = X.reindex(columns=meta["feature_names"], fill_value=0.0)
    return X.values.astype(np.float32)
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
def save_meta(meta: dict[str, Any], path: str | Path) -> None:
    """
    Write the persistable subset of ``meta`` to ``path`` as indented JSON.

    Only the six known keys are written. ``int_to_label`` keys are converted
    to strings, and ``leakage_excluded`` defaults to an empty list when absent.
    """
    copied_keys = (
        "feature_names",
        "numeric_features",
        "categorical_levels",
        "label_to_int",
    )
    payload = {key: meta[key] for key in copied_keys}
    payload["int_to_label"] = {
        str(idx): name for idx, name in meta["int_to_label"].items()
    }
    payload["leakage_excluded"] = meta.get("leakage_excluded", [])

    with open(path, "w") as handle:
        json.dump(payload, handle, indent=2)
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
def load_meta(path: str | Path) -> dict[str, Any]:
    """
    Read a metadata JSON file and return it as a dict.

    JSON object keys are always strings, so the keys of ``int_to_label``
    are converted back to ``int`` before returning.
    """
    with open(path) as handle:
        loaded = json.load(handle)

    restored: dict[int, Any] = {}
    for raw_key, label in loaded["int_to_label"].items():
        restored[int(raw_key)] = label
    loaded["int_to_label"] = restored
    return loaded
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
def build_segment_lookup(topology_path: str | Path) -> dict[str, dict]:
    """
    Map each segment_id in the topology CSV to its topology feature values.

    The inner dicts hold the topology numeric and categorical feature
    columns that exist in the file. If a segment_id occurs more than once,
    the last row wins.
    """
    topo = pd.read_csv(topology_path)
    # Only the configured topology columns actually present in the file.
    wanted = [
        name
        for name in TOPOLOGY_NUMERIC_FEATURES + TOPOLOGY_CATEGORICAL_FEATURES
        if name in topo.columns
    ]
    return {
        segment["segment_id"]: {name: segment[name] for name in wanted}
        for _, segment in topo.iterrows()
    }
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
if __name__ == "__main__":
    # Smoke test: build the feature matrix from a data directory and print
    # a short summary. The directory is the first CLI argument if given.
    import sys

    if len(sys.argv) > 1:
        base = Path(sys.argv[1])
    else:
        base = Path("/mnt/user-data/uploads")

    timelines_csv = base / "attack_timelines.csv"
    topology_csv = base / "victim_topology.csv"
    X, y, groups, meta = build_features(timelines_csv, topology_csv)

    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    print(f"groups: {groups.nunique()} campaigns")
    print(f"n features: {len(meta['feature_names'])}")
    print(f"label distribution:\n{y.map(INT_TO_LABEL).value_counts()}")
    print(f"X has NaN: {X.isnull().any().any()}")
|
feature_meta.json
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"feature_names": [
|
| 3 |
+
"timestep",
|
| 4 |
+
"files_encrypted_cumulative",
|
| 5 |
+
"encryption_throughput_mbps",
|
| 6 |
+
"endpoints_compromised",
|
| 7 |
+
"lateral_move_count",
|
| 8 |
+
"credential_harvest_count",
|
| 9 |
+
"c2_bytes_exfiltrated",
|
| 10 |
+
"defender_alert_score",
|
| 11 |
+
"blast_radius_pct",
|
| 12 |
+
"living_off_land_score",
|
| 13 |
+
"attribution_risk_score",
|
| 14 |
+
"data_exfiltrated_gb",
|
| 15 |
+
"wiper_flag",
|
| 16 |
+
"double_extortion_flag",
|
| 17 |
+
"ir_activated",
|
| 18 |
+
"edr_coverage_rate",
|
| 19 |
+
"network_segmentation_quality",
|
| 20 |
+
"patch_posture_score",
|
| 21 |
+
"ir_activation_latency_hrs",
|
| 22 |
+
"endpoint_count",
|
| 23 |
+
"ad_domain_complexity",
|
| 24 |
+
"soc_maturity_score",
|
| 25 |
+
"backup_recovery_prob",
|
| 26 |
+
"backup_recovery_hrs_mean",
|
| 27 |
+
"siem_rule_refresh_cadence_days",
|
| 28 |
+
"segment_id_hash",
|
| 29 |
+
"c2_intensity_score",
|
| 30 |
+
"escalation_velocity",
|
| 31 |
+
"is_destructive",
|
| 32 |
+
"dwell_efficiency",
|
| 33 |
+
"is_post_detonation",
|
| 34 |
+
"lotl_intensity_bin",
|
| 35 |
+
"attack_phase_encryption_detonation",
|
| 36 |
+
"attack_phase_exfiltration_staging",
|
| 37 |
+
"attack_phase_initial_access",
|
| 38 |
+
"attack_phase_internal_recon",
|
| 39 |
+
"attack_phase_lateral_movement",
|
| 40 |
+
"attack_phase_privilege_escalation",
|
| 41 |
+
"attack_phase_ransom_negotiation",
|
| 42 |
+
"attack_phase_recovery_in_progress",
|
| 43 |
+
"detection_outcome_alert_generated",
|
| 44 |
+
"detection_outcome_delayed_detection",
|
| 45 |
+
"detection_outcome_no_detection",
|
| 46 |
+
"detection_outcome_partial_containment",
|
| 47 |
+
"detection_outcome_recovery_in_progress",
|
| 48 |
+
"segment_type_active_directory_domain",
|
| 49 |
+
"segment_type_backup_infrastructure",
|
| 50 |
+
"segment_type_cloud_workload_tier",
|
| 51 |
+
"segment_type_corporate_workstation_fleet",
|
| 52 |
+
"segment_type_dmz_perimeter",
|
| 53 |
+
"segment_type_executive_endpoint_zone",
|
| 54 |
+
"segment_type_file_server_cluster",
|
| 55 |
+
"segment_type_ot_ics_control_network",
|
| 56 |
+
"soc_maturity_tier_none",
|
| 57 |
+
"soc_maturity_tier_tier1",
|
| 58 |
+
"soc_maturity_tier_tier2",
|
| 59 |
+
"soc_maturity_tier_tier3_mdr",
|
| 60 |
+
"backup_maturity_tier_air_gapped_gold_standard",
|
| 61 |
+
"backup_maturity_tier_local_only",
|
| 62 |
+
"backup_maturity_tier_network_attached",
|
| 63 |
+
"backup_maturity_tier_no_backup",
|
| 64 |
+
"backup_maturity_tier_offsite_unverified",
|
| 65 |
+
"backup_maturity_tier_offsite_verified_immutable"
|
| 66 |
+
],
|
| 67 |
+
"numeric_features": [
|
| 68 |
+
"timestep",
|
| 69 |
+
"files_encrypted_cumulative",
|
| 70 |
+
"encryption_throughput_mbps",
|
| 71 |
+
"endpoints_compromised",
|
| 72 |
+
"lateral_move_count",
|
| 73 |
+
"credential_harvest_count",
|
| 74 |
+
"c2_bytes_exfiltrated",
|
| 75 |
+
"defender_alert_score",
|
| 76 |
+
"blast_radius_pct",
|
| 77 |
+
"living_off_land_score",
|
| 78 |
+
"attribution_risk_score",
|
| 79 |
+
"data_exfiltrated_gb",
|
| 80 |
+
"wiper_flag",
|
| 81 |
+
"double_extortion_flag",
|
| 82 |
+
"ir_activated",
|
| 83 |
+
"edr_coverage_rate",
|
| 84 |
+
"network_segmentation_quality",
|
| 85 |
+
"patch_posture_score",
|
| 86 |
+
"ir_activation_latency_hrs",
|
| 87 |
+
"endpoint_count",
|
| 88 |
+
"ad_domain_complexity",
|
| 89 |
+
"soc_maturity_score",
|
| 90 |
+
"backup_recovery_prob",
|
| 91 |
+
"backup_recovery_hrs_mean",
|
| 92 |
+
"siem_rule_refresh_cadence_days",
|
| 93 |
+
"segment_id_hash",
|
| 94 |
+
"c2_intensity_score",
|
| 95 |
+
"escalation_velocity",
|
| 96 |
+
"is_destructive",
|
| 97 |
+
"dwell_efficiency",
|
| 98 |
+
"is_post_detonation",
|
| 99 |
+
"lotl_intensity_bin"
|
| 100 |
+
],
|
| 101 |
+
"categorical_levels": {
|
| 102 |
+
"attack_phase": [
|
| 103 |
+
"encryption_detonation",
|
| 104 |
+
"exfiltration_staging",
|
| 105 |
+
"initial_access",
|
| 106 |
+
"internal_recon",
|
| 107 |
+
"lateral_movement",
|
| 108 |
+
"privilege_escalation",
|
| 109 |
+
"ransom_negotiation",
|
| 110 |
+
"recovery_in_progress"
|
| 111 |
+
],
|
| 112 |
+
"detection_outcome": [
|
| 113 |
+
"alert_generated",
|
| 114 |
+
"delayed_detection",
|
| 115 |
+
"no_detection",
|
| 116 |
+
"partial_containment",
|
| 117 |
+
"recovery_in_progress"
|
| 118 |
+
],
|
| 119 |
+
"segment_type": [
|
| 120 |
+
"active_directory_domain",
|
| 121 |
+
"backup_infrastructure",
|
| 122 |
+
"cloud_workload_tier",
|
| 123 |
+
"corporate_workstation_fleet",
|
| 124 |
+
"dmz_perimeter",
|
| 125 |
+
"executive_endpoint_zone",
|
| 126 |
+
"file_server_cluster",
|
| 127 |
+
"ot_ics_control_network"
|
| 128 |
+
],
|
| 129 |
+
"soc_maturity_tier": [
|
| 130 |
+
"none",
|
| 131 |
+
"tier1",
|
| 132 |
+
"tier2",
|
| 133 |
+
"tier3_mdr"
|
| 134 |
+
],
|
| 135 |
+
"backup_maturity_tier": [
|
| 136 |
+
"air_gapped_gold_standard",
|
| 137 |
+
"local_only",
|
| 138 |
+
"network_attached",
|
| 139 |
+
"no_backup",
|
| 140 |
+
"offsite_unverified",
|
| 141 |
+
"offsite_verified_immutable"
|
| 142 |
+
]
|
| 143 |
+
},
|
| 144 |
+
"label_to_int": {
|
| 145 |
+
"lone_actor": 0,
|
| 146 |
+
"organised_syndicate": 1,
|
| 147 |
+
"raas_affiliate": 2,
|
| 148 |
+
"nation_state_nexus": 3
|
| 149 |
+
},
|
| 150 |
+
"int_to_label": {
|
| 151 |
+
"0": "lone_actor",
|
| 152 |
+
"1": "organised_syndicate",
|
| 153 |
+
"2": "raas_affiliate",
|
| 154 |
+
"3": "nation_state_nexus"
|
| 155 |
+
},
|
| 156 |
+
"leakage_excluded": []
|
| 157 |
+
}
|
feature_scaler.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"mean": [37.00038106851612, 17991.317582501335, 14.544094504991996, 74.80169194421157, 138.9533191067754, 7.282105022483043, 27653651.954919595, 0.37065581891624116, 0.022953715418032164, 0.13800967914030943, 0.02037649569392577, 3.1598579757640426, 0.0799862815334197, 0.5887508574041612, 0.11912201813886136, 0.5610207110738511, 0.5038509793460865, 0.6147713474582729, 211.73937619083915, 403.9024845667251, 0.42209049615120803, 0.4762784848715799, 0.3303223839646368, 232.6633640728603, 97.50316286868379, 123.18851459492417, 2.460235469644747, 3.32033272147805, 0.6430150140995351, 0.0003669160190541892, 0.32009755354012653, 0.5515585702309275, 0.11199603688743236, 0.0907324136879811, 0.07987196097858396, 0.11043365597134365, 0.13082082158372074, 0.0968676167975002, 0.033800777379772884, 0.34547671671366514, 0.3409801082234586, 0.033724563676549045, 0.19442115692401493, 0.08539745446231232, 0.34547671671366514, 0.15707644234433352, 0.09999237862967762, 0.13143053120951148, 0.1342123313771816, 0.14575870741559332, 0.10288849935218353, 0.09717247161039555, 0.13146863806112338, 0.21141681274293117, 0.26575718314152885, 0.30569316363082083, 0.21713284048471915, 0.05716027741787973, 0.16850849782790947, 0.31422909839189084, 0.054302263546985745, 0.2343571374133069, 0.1714427254020273], "std": [21.65185417886005, 70075.14437404645, 46.623050778770434, 138.8927820606075, 333.3671687687729, 10.411700758798842, 112444742.30416173, 0.45135105008302945, 0.07477089986783442, 0.20000045211083176, 0.05994709512338441, 8.340943676601865, 0.27127712884039307, 0.49206962131159826, 0.32393820662667727, 0.17535640681475026, 0.19840256506792886, 0.206597959364603, 215.90980436865328, 223.6938623978402, 0.2049917965317542, 0.30290492736879715, 0.26507070287211687, 186.3537217786501, 49.81550590467409, 68.8922730412834, 6.962899822938947, 8.322473481208718, 0.47911945627130226, 0.0012057216657705649, 0.46652267197064756, 0.8570014461906231, 0.3153675864625374, 0.28723367966000857, 
0.271100039653078, 0.3134354914270541, 0.3372107166070688, 0.29578305477334943, 0.18071947703591806, 0.4755325142084713, 0.4740477164149472, 0.1805227390800372, 0.3957619729670082, 0.2794775584414953, 0.4755325142084713, 0.36387975936434375, 0.2999955539014646, 0.3378770441859857, 0.34088679887143486, 0.3528708710157629, 0.3038189815388037, 0.2961981188534845, 0.3379186095056718, 0.4083210715101615, 0.4417439743053282, 0.46070917250642185, 0.41230164679802983, 0.2321530397678419, 0.37432435596831565, 0.4642169579437471, 0.2266174854606215, 0.423604423344658, 0.37690254787912686]}
|
inference_example.ipynb
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# CYB005 Baseline Classifier — Inference Example\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"End-to-end demo: load the trained XGBoost and PyTorch MLP models from the Hugging Face repo and predict the **threat-actor capability tier** of a ransomware campaign from a per-timestep telemetry record.\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"**Models predict one of 4 tiers:** `lone_actor`, `organised_syndicate`, `raas_affiliate`, `nation_state_nexus`.\n",
|
| 12 |
+
"\n",
|
| 13 |
+
"**This is a baseline reference model**, not a production threat-attribution system. See the model card for full metrics and limitations."
|
| 14 |
+
]
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"cell_type": "markdown",
|
| 18 |
+
"metadata": {},
|
| 19 |
+
"source": [
|
| 20 |
+
"## 1. Install dependencies"
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"cell_type": "code",
|
| 25 |
+
"execution_count": null,
|
| 26 |
+
"metadata": {},
|
| 27 |
+
"outputs": [],
|
| 28 |
+
"source": [
|
| 29 |
+
"%pip install --quiet xgboost torch safetensors pandas numpy huggingface_hub"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"cell_type": "markdown",
|
| 34 |
+
"metadata": {},
|
| 35 |
+
"source": [
|
| 36 |
+
"## 2. Download model artifacts from Hugging Face"
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"cell_type": "code",
|
| 41 |
+
"execution_count": null,
|
| 42 |
+
"metadata": {},
|
| 43 |
+
"outputs": [],
|
| 44 |
+
"source": [
|
| 45 |
+
"from huggingface_hub import hf_hub_download\n",
|
| 46 |
+
"\n",
|
| 47 |
+
"REPO_ID = \"xpertsystems/cyb005-baseline-classifier\"\n",
|
| 48 |
+
"\n",
|
| 49 |
+
"files = {}\n",
|
| 50 |
+
"for name in [\"model_xgb.json\", \"model_mlp.safetensors\",\n",
|
| 51 |
+
" \"feature_engineering.py\", \"feature_meta.json\",\n",
|
| 52 |
+
" \"feature_scaler.json\"]:\n",
|
| 53 |
+
" files[name] = hf_hub_download(repo_id=REPO_ID, filename=name)\n",
|
| 54 |
+
" print(f\" downloaded: {name}\")"
|
| 55 |
+
]
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"cell_type": "code",
|
| 59 |
+
"execution_count": null,
|
| 60 |
+
"metadata": {},
|
| 61 |
+
"outputs": [],
|
| 62 |
+
"source": [
|
| 63 |
+
"import sys, os\n",
|
| 64 |
+
"fe_dir = os.path.dirname(files[\"feature_engineering.py\"])\n",
|
| 65 |
+
"if fe_dir not in sys.path:\n",
|
| 66 |
+
" sys.path.insert(0, fe_dir)\n",
|
| 67 |
+
"\n",
|
| 68 |
+
"from feature_engineering import (\n",
|
| 69 |
+
" transform_single, load_meta, INT_TO_LABEL, build_segment_lookup\n",
|
| 70 |
+
")"
|
| 71 |
+
]
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"cell_type": "markdown",
|
| 75 |
+
"metadata": {},
|
| 76 |
+
"source": [
|
| 77 |
+
"## 3. Load models and metadata"
|
| 78 |
+
]
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"cell_type": "code",
|
| 82 |
+
"execution_count": null,
|
| 83 |
+
"metadata": {},
|
| 84 |
+
"outputs": [],
|
| 85 |
+
"source": [
|
| 86 |
+
"import json\n",
|
| 87 |
+
"import numpy as np\n",
|
| 88 |
+
"import torch\n",
|
| 89 |
+
"import torch.nn as nn\n",
|
| 90 |
+
"import xgboost as xgb\n",
|
| 91 |
+
"from safetensors.torch import load_file\n",
|
| 92 |
+
"\n",
|
| 93 |
+
"meta = load_meta(files[\"feature_meta.json\"])\n",
|
| 94 |
+
"with open(files[\"feature_scaler.json\"]) as f:\n",
|
| 95 |
+
" scaler = json.load(f)\n",
|
| 96 |
+
"\n",
|
| 97 |
+
"N_FEATURES = len(meta[\"feature_names\"])\n",
|
| 98 |
+
"N_CLASSES = len(meta[\"int_to_label\"])\n",
|
| 99 |
+
"print(f\"feature count: {N_FEATURES}\")\n",
|
| 100 |
+
"print(f\"class count: {N_CLASSES}\")\n",
|
| 101 |
+
"print(f\"label classes: {list(meta['int_to_label'].values())}\")"
|
| 102 |
+
]
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"cell_type": "code",
|
| 106 |
+
"execution_count": null,
|
| 107 |
+
"metadata": {},
|
| 108 |
+
"outputs": [],
|
| 109 |
+
"source": [
|
| 110 |
+
"# XGBoost\n",
|
| 111 |
+
"xgb_model = xgb.XGBClassifier()\n",
|
| 112 |
+
"xgb_model.load_model(files[\"model_xgb.json\"])\n",
|
| 113 |
+
"\n",
|
| 114 |
+
"# MLP architecture (must match training)\n",
|
| 115 |
+
"class TierMLP(nn.Module):\n",
|
| 116 |
+
" def __init__(self, n_features, n_classes=4, hidden1=128, hidden2=64, dropout=0.3):\n",
|
| 117 |
+
" super().__init__()\n",
|
| 118 |
+
" self.net = nn.Sequential(\n",
|
| 119 |
+
" nn.Linear(n_features, hidden1),\n",
|
| 120 |
+
" nn.BatchNorm1d(hidden1),\n",
|
| 121 |
+
" nn.ReLU(),\n",
|
| 122 |
+
" nn.Dropout(dropout),\n",
|
| 123 |
+
" nn.Linear(hidden1, hidden2),\n",
|
| 124 |
+
" nn.BatchNorm1d(hidden2),\n",
|
| 125 |
+
" nn.ReLU(),\n",
|
| 126 |
+
" nn.Dropout(dropout),\n",
|
| 127 |
+
" nn.Linear(hidden2, n_classes),\n",
|
| 128 |
+
" )\n",
|
| 129 |
+
" def forward(self, x):\n",
|
| 130 |
+
" return self.net(x)\n",
|
| 131 |
+
"\n",
|
| 132 |
+
"mlp_model = TierMLP(N_FEATURES, n_classes=N_CLASSES)\n",
|
| 133 |
+
"mlp_model.load_state_dict(load_file(files[\"model_mlp.safetensors\"]))\n",
|
| 134 |
+
"mlp_model.eval()\n",
|
| 135 |
+
"print(\"models loaded\")"
|
| 136 |
+
]
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"cell_type": "markdown",
|
| 140 |
+
"metadata": {},
|
| 141 |
+
"source": [
|
| 142 |
+
"## 4. Build the segment lookup\n",
|
| 143 |
+
"\n",
|
| 144 |
+
"Per-segment topology features (SOC maturity, EDR coverage, backup tier, etc.) are pulled from `victim_topology.csv` and merged into each timestep record by `target_segment_id`."
|
| 145 |
+
]
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"cell_type": "code",
|
| 149 |
+
"execution_count": null,
|
| 150 |
+
"metadata": {},
|
| 151 |
+
"outputs": [],
|
| 152 |
+
"source": [
|
| 153 |
+
"from huggingface_hub import snapshot_download\n",
|
| 154 |
+
"\n",
|
| 155 |
+
"ds_path = snapshot_download(repo_id=\"xpertsystems/cyb005-sample\", repo_type=\"dataset\")\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"seg_lookup = build_segment_lookup(\n",
|
| 158 |
+
" os.path.join(ds_path, \"victim_topology.csv\")\n",
|
| 159 |
+
")\n",
|
| 160 |
+
"print(f\"loaded {len(seg_lookup)} segment profiles\")"
|
| 161 |
+
]
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"cell_type": "markdown",
|
| 165 |
+
"metadata": {},
|
| 166 |
+
"source": [
|
| 167 |
+
"## 5. Prediction helper"
|
| 168 |
+
]
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"cell_type": "code",
|
| 172 |
+
"execution_count": null,
|
| 173 |
+
"metadata": {},
|
| 174 |
+
"outputs": [],
|
| 175 |
+
"source": [
|
| 176 |
+
"MU = np.array(scaler[\"mean\"], dtype=np.float32)\n",
|
| 177 |
+
"SD = np.array(scaler[\"std\"], dtype=np.float32)\n",
|
| 178 |
+
"\n",
|
| 179 |
+
"def predict_tier(record: dict) -> dict:\n",
|
| 180 |
+
" \"\"\"Predict the threat-actor tier for one per-timestep telemetry record.\n",
|
| 181 |
+
"\n",
|
| 182 |
+
" Per-segment topology features are pulled automatically via\n",
|
| 183 |
+
" `target_segment_id` from the seg_lookup loaded above.\n",
|
| 184 |
+
" \"\"\"\n",
|
| 185 |
+
" seg_id = record.get(\"target_segment_id\")\n",
|
| 186 |
+
" seg_aggs = seg_lookup.get(seg_id, {})\n",
|
| 187 |
+
" X = transform_single(record, meta, segment_aggregates=seg_aggs)\n",
|
| 188 |
+
"\n",
|
| 189 |
+
" xgb_proba = xgb_model.predict_proba(X)[0]\n",
|
| 190 |
+
" xgb_label = INT_TO_LABEL[int(np.argmax(xgb_proba))]\n",
|
| 191 |
+
"\n",
|
| 192 |
+
" Xs = ((X - MU) / SD).astype(np.float32)\n",
|
| 193 |
+
" with torch.no_grad():\n",
|
| 194 |
+
" logits = mlp_model(torch.tensor(Xs))\n",
|
| 195 |
+
" mlp_proba = torch.softmax(logits, dim=1).numpy()[0]\n",
|
| 196 |
+
" mlp_label = INT_TO_LABEL[int(np.argmax(mlp_proba))]\n",
|
| 197 |
+
"\n",
|
| 198 |
+
" return {\n",
|
| 199 |
+
" \"xgboost\": {\n",
|
| 200 |
+
" \"label\": xgb_label,\n",
|
| 201 |
+
" \"probabilities\": {INT_TO_LABEL[i]: float(p) for i, p in enumerate(xgb_proba)},\n",
|
| 202 |
+
" },\n",
|
| 203 |
+
" \"mlp\": {\n",
|
| 204 |
+
" \"label\": mlp_label,\n",
|
| 205 |
+
" \"probabilities\": {INT_TO_LABEL[i]: float(p) for i, p in enumerate(mlp_proba)},\n",
|
| 206 |
+
" },\n",
|
| 207 |
+
" }"
|
| 208 |
+
]
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"cell_type": "markdown",
|
| 212 |
+
"metadata": {},
|
| 213 |
+
"source": [
|
| 214 |
+
"## 6. Run on an example record\n",
|
| 215 |
+
"\n",
|
| 216 |
+
"Real `encryption_detonation` event from the sample dataset: a nation-state-tier ransomware campaign at timestep 68, with a wiper component deployed and 36,586 files encrypted across 634 endpoints. Both models should lean toward `nation_state_nexus`."
|
| 217 |
+
]
|
| 218 |
+
},
|
| 219 |
+
{
|
| 220 |
+
"cell_type": "code",
|
| 221 |
+
"execution_count": null,
|
| 222 |
+
"metadata": {},
|
| 223 |
+
"outputs": [],
|
| 224 |
+
"source": [
|
| 225 |
+
"# Real timestep record from the sample dataset (true tier: nation_state_nexus)\n",
|
| 226 |
+
"example_record = {\n",
|
| 227 |
+
" \"timestep\": 68,\n",
|
| 228 |
+
" \"attack_phase\": \"encryption_detonation\",\n",
|
| 229 |
+
" \"files_encrypted_cumulative\": 36586,\n",
|
| 230 |
+
" \"encryption_throughput_mbps\": 244.913,\n",
|
| 231 |
+
" \"endpoints_compromised\": 634,\n",
|
| 232 |
+
" \"lateral_move_count\": 1498,\n",
|
| 233 |
+
" \"credential_harvest_count\": 17,\n",
|
| 234 |
+
" \"c2_bytes_exfiltrated\": 138747511.1,\n",
|
| 235 |
+
" \"defender_alert_score\": 1.0,\n",
|
| 236 |
+
" \"detection_outcome\": \"alert_generated\",\n",
|
| 237 |
+
" \"blast_radius_pct\": 0.4032,\n",
|
| 238 |
+
" \"living_off_land_score\": 0.35,\n",
|
| 239 |
+
" \"attribution_risk_score\": 0.0,\n",
|
| 240 |
+
" \"data_exfiltrated_gb\": 14.852,\n",
|
| 241 |
+
" \"wiper_flag\": 1,\n",
|
| 242 |
+
" \"double_extortion_flag\": 0,\n",
|
| 243 |
+
" \"ir_activated\": 0,\n",
|
| 244 |
+
" \"target_segment_id\": \"SEG00150\",\n",
|
| 245 |
+
"}\n",
|
| 246 |
+
"\n",
|
| 247 |
+
"result = predict_tier(example_record)\n",
|
| 248 |
+
"\n",
|
| 249 |
+
"print(f\"XGBoost -> {result['xgboost']['label']}\")\n",
|
| 250 |
+
"for lbl, p in sorted(result['xgboost']['probabilities'].items(), key=lambda x: -x[1]):\n",
|
| 251 |
+
" print(f\" P({lbl:25s}) = {p:.4f}\")\n",
|
| 252 |
+
"\n",
|
| 253 |
+
"print(f\"\\nMLP -> {result['mlp']['label']}\")\n",
|
| 254 |
+
"for lbl, p in sorted(result['mlp']['probabilities'].items(), key=lambda x: -x[1]):\n",
|
| 255 |
+
" print(f\" P({lbl:25s}) = {p:.4f}\")"
|
| 256 |
+
]
|
| 257 |
+
},
|
| 258 |
+
{
|
| 259 |
+
"cell_type": "markdown",
|
| 260 |
+
"metadata": {},
|
| 261 |
+
"source": [
|
| 262 |
+
"### When the two models disagree\n",
|
| 263 |
+
"\n",
|
| 264 |
+
"XGBoost and the MLP can disagree on borderline cases — `lone_actor` ↔ `nation_state_nexus` (low blast radius can look similar across both extremes), or `raas_affiliate` ↔ `organised_syndicate` (operational similarity). In threat-attribution workflows, disagreement is a useful triage signal for human analyst review."
|
| 265 |
+
]
|
| 266 |
+
},
|
| 267 |
+
{
|
| 268 |
+
"cell_type": "markdown",
|
| 269 |
+
"metadata": {},
|
| 270 |
+
"source": [
|
| 271 |
+
"## 7. Batch prediction on the sample dataset"
|
| 272 |
+
]
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"cell_type": "code",
|
| 276 |
+
"execution_count": null,
|
| 277 |
+
"metadata": {},
|
| 278 |
+
"outputs": [],
|
| 279 |
+
"source": [
|
| 280 |
+
"import pandas as pd\n",
|
| 281 |
+
"\n",
|
| 282 |
+
"timelines = pd.read_csv(f\"{ds_path}/attack_timelines.csv\")\n",
|
| 283 |
+
"\n",
|
| 284 |
+
"# Score the first 500 timesteps\n",
|
| 285 |
+
"sample = timelines.head(500).copy()\n",
|
| 286 |
+
"preds = [predict_tier(row.to_dict())[\"xgboost\"][\"label\"] for _, row in sample.iterrows()]\n",
|
| 287 |
+
"sample[\"xgb_pred\"] = preds\n",
|
| 288 |
+
"\n",
|
| 289 |
+
"ct = pd.crosstab(sample[\"actor_capability_tier\"], sample[\"xgb_pred\"],\n",
|
| 290 |
+
" rownames=[\"true\"], colnames=[\"pred\"])\n",
|
| 291 |
+
"print(\"Confusion on first 500 sample rows (XGBoost):\")\n",
|
| 292 |
+
"print(ct)\n",
|
| 293 |
+
"acc = (sample[\"actor_capability_tier\"] == sample[\"xgb_pred\"]).mean()\n",
|
| 294 |
+
"print(f\"\\nbatch accuracy on first 500 rows (in-distribution): {acc:.4f}\")\n",
|
| 295 |
+
"print(\"\\nNote: these rows include training-set campaigns. See validation_results.json\\n\"\n",
|
| 296 |
+
" \"for proper held-out test metrics from disjoint campaigns.\")"
|
| 297 |
+
]
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"cell_type": "markdown",
|
| 301 |
+
"metadata": {},
|
| 302 |
+
"source": [
|
| 303 |
+
"## 8. Next steps\n",
|
| 304 |
+
"\n",
|
| 305 |
+
"- See `validation_results.json` for held-out test metrics (75 disjoint campaigns, ~5,600 timesteps).\n",
|
| 306 |
+
"- See `multi_seed_results.json` for the across-10-seeds robustness picture (accuracy 0.603 ± 0.040, ROC-AUC 0.853 ± 0.031).\n",
|
| 307 |
+
"- See `ablation_results.json` for per-feature-group contribution. Behavioural features carry the most tier signal (−12pp accuracy when removed).\n",
|
| 308 |
+
"- The model card explains the leakage audit and the per-class tier-confusion patterns.\n",
|
| 309 |
+
"- For the full ~358k-row CYB005 dataset and commercial licensing, contact **pradeep@xpertsystems.ai**."
|
| 310 |
+
]
|
| 311 |
+
}
|
| 312 |
+
],
|
| 313 |
+
"metadata": {
|
| 314 |
+
"kernelspec": {
|
| 315 |
+
"display_name": "Python 3",
|
| 316 |
+
"language": "python",
|
| 317 |
+
"name": "python3"
|
| 318 |
+
},
|
| 319 |
+
"language_info": {
|
| 320 |
+
"name": "python",
|
| 321 |
+
"version": "3.10"
|
| 322 |
+
}
|
| 323 |
+
},
|
| 324 |
+
"nbformat": 4,
|
| 325 |
+
"nbformat_minor": 5
|
| 326 |
+
}
|
model_mlp.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0bec2554d1504c06a93a9d6ad1b6de3fa12d0a09eb6468e6b874cc778739840
|
| 3 |
+
size 71128
|
model_xgb.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
multi_seed_results.json
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
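"purpose": "Multi-seed evaluation across 10 random splits of the 500 ransomware campaigns. Reports XGBoost test metrics for each seed plus aggregate statistics across all seeds (mean and std for every metric, min and max for accuracy), giving a more robust performance picture than any single split.",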
|
| 3 |
+
"seeds_evaluated": [
|
| 4 |
+
42,
|
| 5 |
+
7,
|
| 6 |
+
13,
|
| 7 |
+
17,
|
| 8 |
+
23,
|
| 9 |
+
31,
|
| 10 |
+
45,
|
| 11 |
+
99,
|
| 12 |
+
123,
|
| 13 |
+
200
|
| 14 |
+
],
|
| 15 |
+
"per_seed": [
|
| 16 |
+
{
|
| 17 |
+
"seed": 42,
|
| 18 |
+
"test_n_classes": 4,
|
| 19 |
+
"accuracy": 0.6898452783211808,
|
| 20 |
+
"macro_f1": 0.6751447018282526,
|
| 21 |
+
"macro_roc_auc_ovr": 0.873606865711172
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"seed": 7,
|
| 25 |
+
"test_n_classes": 4,
|
| 26 |
+
"accuracy": 0.5936,
|
| 27 |
+
"macro_f1": 0.6058668770031597,
|
| 28 |
+
"macro_roc_auc_ovr": 0.8807958394340375
|
| 29 |
+
},
|
| 30 |
+
{
|
| 31 |
+
"seed": 13,
|
| 32 |
+
"test_n_classes": 4,
|
| 33 |
+
"accuracy": 0.6160412591143518,
|
| 34 |
+
"macro_f1": 0.6098050823090829,
|
| 35 |
+
"macro_roc_auc_ovr": 0.891446004502376
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"seed": 17,
|
| 39 |
+
"test_n_classes": 4,
|
| 40 |
+
"accuracy": 0.5668563300142248,
|
| 41 |
+
"macro_f1": 0.5260776400679491,
|
| 42 |
+
"macro_roc_auc_ovr": 0.8435537531292995
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"seed": 23,
|
| 46 |
+
"test_n_classes": 4,
|
| 47 |
+
"accuracy": 0.5331673483905388,
|
| 48 |
+
"macro_f1": 0.5092426374129808,
|
| 49 |
+
"macro_roc_auc_ovr": 0.8177927651119797
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"seed": 31,
|
| 53 |
+
"test_n_classes": 4,
|
| 54 |
+
"accuracy": 0.6072953736654805,
|
| 55 |
+
"macro_f1": 0.6146362246152752,
|
| 56 |
+
"macro_roc_auc_ovr": 0.8585576361068035
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
"seed": 45,
|
| 60 |
+
"test_n_classes": 4,
|
| 61 |
+
"accuracy": 0.5793777777777778,
|
| 62 |
+
"macro_f1": 0.5739793543388237,
|
| 63 |
+
"macro_roc_auc_ovr": 0.8200552847948792
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"seed": 99,
|
| 67 |
+
"test_n_classes": 4,
|
| 68 |
+
"accuracy": 0.6200640341515475,
|
| 69 |
+
"macro_f1": 0.6242476136431796,
|
| 70 |
+
"macro_roc_auc_ovr": 0.8679174384576391
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"seed": 123,
|
| 74 |
+
"test_n_classes": 4,
|
| 75 |
+
"accuracy": 0.6323372465314835,
|
| 76 |
+
"macro_f1": 0.6277596831292473,
|
| 77 |
+
"macro_roc_auc_ovr": 0.8854323799134519
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"seed": 200,
|
| 81 |
+
"test_n_classes": 4,
|
| 82 |
+
"accuracy": 0.587157595161864,
|
| 83 |
+
"macro_f1": 0.5653959696484754,
|
| 84 |
+
"macro_roc_auc_ovr": 0.7957473212581817
|
| 85 |
+
}
|
| 86 |
+
],
|
| 87 |
+
"aggregate": {
|
| 88 |
+
"accuracy_mean": 0.602574224312845,
|
| 89 |
+
"accuracy_std": 0.039951201198129296,
|
| 90 |
+
"accuracy_min": 0.5331673483905388,
|
| 91 |
+
"accuracy_max": 0.6898452783211808,
|
| 92 |
+
"macro_f1_mean": 0.5932155783996427,
|
| 93 |
+
"macro_f1_std": 0.04739799073577289,
|
| 94 |
+
"roc_auc_mean": 0.853490528841982,
|
| 95 |
+
"roc_auc_std": 0.031096980060089464
|
| 96 |
+
},
|
| 97 |
+
"published_artifact_seed": 42
|
| 98 |
+
}
|
validation_results.json
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "1.0.0",
|
| 3 |
+
"dataset": "xpertsystems/cyb005-sample",
|
| 4 |
+
"task": "4-class actor_capability_tier classification",
|
| 5 |
+
"baselines": {
|
| 6 |
+
"always_predict_majority_accuracy": 0.41348034856837984,
|
| 7 |
+
"majority_class": "organised_syndicate",
|
| 8 |
+
"random_guess_accuracy": 0.25
|
| 9 |
+
},
|
| 10 |
+
"split": {
|
| 11 |
+
"strategy": "group_aware (GroupShuffleSplit by campaign_id, nested)",
|
| 12 |
+
"rationale": "The 500 ransomware campaigns generate 37,489 timesteps in total (~75 per campaign). A random row-level split would leak per-campaign correlations into the test fold; the group-aware split keeps train/val/test campaigns disjoint.",
|
| 13 |
+
"campaigns_train": 350,
|
| 14 |
+
"campaigns_val": 75,
|
| 15 |
+
"campaigns_test": 75,
|
| 16 |
+
"timesteps_train": 26242,
|
| 17 |
+
"timesteps_val": 5624,
|
| 18 |
+
"timesteps_test": 5623,
|
| 19 |
+
"seed": 42
|
| 20 |
+
},
|
| 21 |
+
"n_features": 63,
|
| 22 |
+
"label_classes": [
|
| 23 |
+
"lone_actor",
|
| 24 |
+
"organised_syndicate",
|
| 25 |
+
"raas_affiliate",
|
| 26 |
+
"nation_state_nexus"
|
| 27 |
+
],
|
| 28 |
+
"class_distribution_train": {
|
| 29 |
+
"organised_syndicate": 10423,
|
| 30 |
+
"raas_affiliate": 7950,
|
| 31 |
+
"lone_actor": 4125,
|
| 32 |
+
"nation_state_nexus": 3744
|
| 33 |
+
},
|
| 34 |
+
"class_distribution_test": {
|
| 35 |
+
"organised_syndicate": 2325,
|
| 36 |
+
"raas_affiliate": 1725,
|
| 37 |
+
"nation_state_nexus": 823,
|
| 38 |
+
"lone_actor": 750
|
| 39 |
+
},
|
| 40 |
+
"leakage_excluded_features": [],
|
| 41 |
+
"leakage_audit_notes": "Three columns were audited as potential tier oracles: attribution_risk_score (mean 0.016-0.026 with overlapping ranges - not an oracle, kept); living_off_land_score (mean 0.05-0.20 with large overlap - real observable, kept); attack_phase (no oracle relationship to tier - kept). detection_outcome contains a recovery_in_progress value that is 1:1 with the attack_phase of the same name, but this is a phase-prediction leak, not a tier-prediction one. No features dropped for this task.",
|
| 42 |
+
"models": {
|
| 43 |
+
"xgboost": {
|
| 44 |
+
"architecture": "Gradient-boosted decision trees, multi:softprob, 4 classes",
|
| 45 |
+
"framework": "xgboost",
|
| 46 |
+
"test_metrics": {
|
| 47 |
+
"model": "xgboost",
|
| 48 |
+
"accuracy": 0.6898452783211808,
|
| 49 |
+
"macro_f1": 0.6751447018282526,
|
| 50 |
+
"weighted_f1": 0.6881356546405818,
|
| 51 |
+
"per_class_f1": {
|
| 52 |
+
"lone_actor": 0.6297297297297297,
|
| 53 |
+
"organised_syndicate": 0.7391393864525427,
|
| 54 |
+
"raas_affiliate": 0.6458906202260922,
|
| 55 |
+
"nation_state_nexus": 0.6858190709046454
|
| 56 |
+
},
|
| 57 |
+
"confusion_matrix": {
|
| 58 |
+
"labels": [
|
| 59 |
+
"lone_actor",
|
| 60 |
+
"organised_syndicate",
|
| 61 |
+
"raas_affiliate",
|
| 62 |
+
"nation_state_nexus"
|
| 63 |
+
],
|
| 64 |
+
"matrix": [
|
| 65 |
+
[
|
| 66 |
+
466,
|
| 67 |
+
67,
|
| 68 |
+
216,
|
| 69 |
+
1
|
| 70 |
+
],
|
| 71 |
+
[
|
| 72 |
+
83,
|
| 73 |
+
1795,
|
| 74 |
+
275,
|
| 75 |
+
172
|
| 76 |
+
],
|
| 77 |
+
[
|
| 78 |
+
156,
|
| 79 |
+
433,
|
| 80 |
+
1057,
|
| 81 |
+
79
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
25,
|
| 85 |
+
237,
|
| 86 |
+
0,
|
| 87 |
+
561
|
| 88 |
+
]
|
| 89 |
+
]
|
| 90 |
+
},
|
| 91 |
+
"macro_roc_auc_ovr": 0.873606865711172
|
| 92 |
+
}
|
| 93 |
+
},
|
| 94 |
+
"mlp": {
|
| 95 |
+
"architecture": "PyTorch MLP, 63 -> 128 -> 64 -> 4, BatchNorm1d + ReLU + Dropout, weighted cross-entropy loss",
|
| 96 |
+
"framework": "pytorch",
|
| 97 |
+
"test_metrics": {
|
| 98 |
+
"model": "mlp",
|
| 99 |
+
"accuracy": 0.5118264271741063,
|
| 100 |
+
"macro_f1": 0.512148917800585,
|
| 101 |
+
"weighted_f1": 0.5133102239521222,
|
| 102 |
+
"per_class_f1": {
|
| 103 |
+
"lone_actor": 0.427515633882888,
|
| 104 |
+
"organised_syndicate": 0.5204107187578262,
|
| 105 |
+
"raas_affiliate": 0.49878147847278637,
|
| 106 |
+
"nation_state_nexus": 0.6018878400888396
|
| 107 |
+
},
|
| 108 |
+
"confusion_matrix": {
|
| 109 |
+
"labels": [
|
| 110 |
+
"lone_actor",
|
| 111 |
+
"organised_syndicate",
|
| 112 |
+
"raas_affiliate",
|
| 113 |
+
"nation_state_nexus"
|
| 114 |
+
],
|
| 115 |
+
"matrix": [
|
| 116 |
+
[
|
| 117 |
+
376,
|
| 118 |
+
17,
|
| 119 |
+
280,
|
| 120 |
+
77
|
| 121 |
+
],
|
| 122 |
+
[
|
| 123 |
+
282,
|
| 124 |
+
1039,
|
| 125 |
+
745,
|
| 126 |
+
259
|
| 127 |
+
],
|
| 128 |
+
[
|
| 129 |
+
248,
|
| 130 |
+
456,
|
| 131 |
+
921,
|
| 132 |
+
100
|
| 133 |
+
],
|
| 134 |
+
[
|
| 135 |
+
103,
|
| 136 |
+
156,
|
| 137 |
+
22,
|
| 138 |
+
542
|
| 139 |
+
]
|
| 140 |
+
]
|
| 141 |
+
},
|
| 142 |
+
"macro_roc_auc_ovr": 0.8071564672462985
|
| 143 |
+
}
|
| 144 |
+
}
|
| 145 |
+
}
|
| 146 |
+
}
|