Initial release: XGBoost + MLP baseline on CYB001 sample
Browse files- README.md +344 -0
- ablation_results.json +85 -0
- feature_engineering.py +363 -0
- feature_meta.json +236 -0
- feature_scaler.json +1 -0
- inference_example.ipynb +343 -0
- model_mlp.safetensors +3 -0
- model_xgb.json +0 -0
- validation_results.json +109 -0
README.md
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: cc-by-nc-4.0
|
| 3 |
+
library_name: pytorch
|
| 4 |
+
tags:
|
| 5 |
+
- cybersecurity
|
| 6 |
+
- network-traffic
|
| 7 |
+
- intrusion-detection
|
| 8 |
+
- tabular-classification
|
| 9 |
+
- synthetic-data
|
| 10 |
+
- xgboost
|
| 11 |
+
- baseline
|
| 12 |
+
pipeline_tag: tabular-classification
|
| 13 |
+
base_model: []
|
| 14 |
+
datasets:
|
| 15 |
+
- xpertsystems/cyb001-sample
|
| 16 |
+
metrics:
|
| 17 |
+
- accuracy
|
| 18 |
+
- f1
|
| 19 |
+
model-index:
|
| 20 |
+
- name: cyb001-baseline-classifier
|
| 21 |
+
results:
|
| 22 |
+
- task:
|
| 23 |
+
type: tabular-classification
|
| 24 |
+
name: 3-class network flow classification
|
| 25 |
+
dataset:
|
| 26 |
+
type: xpertsystems/cyb001-sample
|
| 27 |
+
name: CYB001 Synthetic Network Traffic (Sample)
|
| 28 |
+
metrics:
|
| 29 |
+
- type: accuracy
|
| 30 |
+
value: 0.9980
|
| 31 |
+
name: Test accuracy (XGBoost)
|
| 32 |
+
- type: f1
|
| 33 |
+
value: 0.9961
|
| 34 |
+
name: Test macro-F1 (XGBoost)
|
| 35 |
+
- type: accuracy
|
| 36 |
+
value: 0.9932
|
| 37 |
+
name: Test accuracy (MLP)
|
| 38 |
+
- type: f1
|
| 39 |
+
value: 0.9869
|
| 40 |
+
name: Test macro-F1 (MLP)
|
| 41 |
+
---
|
| 42 |
+
|
| 43 |
+
# CYB001 Baseline Classifier
|
| 44 |
+
|
| 45 |
+
**Multi-class network flow classifier trained on the CYB001 synthetic
|
| 46 |
+
network traffic sample. Predicts `BENIGN`, `MALICIOUS`, or `AMBIGUOUS`
|
| 47 |
+
from per-flow features.**
|
| 48 |
+
|
| 49 |
+
> **Baseline reference, not for production use.** This model demonstrates
|
| 50 |
+
> that the [CYB001 sample dataset](https://huggingface.co/datasets/xpertsystems/cyb001-sample)
|
| 51 |
+
> is learnable end-to-end and gives prospective buyers a working starting
|
| 52 |
+
> point to evaluate against their own pipelines. It is not an intrusion
|
| 53 |
+
> detection system. See [Limitations](#limitations).
|
| 54 |
+
|
| 55 |
+
## Model overview
|
| 56 |
+
|
| 57 |
+
| Property | Value |
|
| 58 |
+
|---|---|
|
| 59 |
+
| Task | 3-class flow classification (BENIGN / MALICIOUS / AMBIGUOUS) |
|
| 60 |
+
| Training data | `xpertsystems/cyb001-sample` (9,770 flows, sample only) |
|
| 61 |
+
| Models | XGBoost + PyTorch MLP |
|
| 62 |
+
| Input features | 101 (after one-hot encoding) |
|
| 63 |
+
| License | CC-BY-NC-4.0 (matches dataset) |
|
| 64 |
+
| Status | Reference baseline |
|
| 65 |
+
|
| 66 |
+
Two model artifacts are published. They are designed to be used together — disagreement between them is itself a useful triage signal:
|
| 67 |
+
|
| 68 |
+
- `model_xgb.json` — gradient-boosted trees, primary recommendation
|
| 69 |
+
- `model_mlp.safetensors` — PyTorch MLP in SafeTensors format
|
| 70 |
+
|
| 71 |
+
## Quick start
|
| 72 |
+
|
| 73 |
+
```bash
|
| 74 |
+
pip install xgboost scikit-learn torch safetensors pandas huggingface_hub
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
```python
|
| 78 |
+
from huggingface_hub import hf_hub_download
|
| 79 |
+
import json, numpy as np, torch, xgboost as xgb
|
| 80 |
+
from safetensors.torch import load_file
|
| 81 |
+
|
| 82 |
+
REPO = "xpertsystems/cyb001-baseline-classifier"
|
| 83 |
+
|
| 84 |
+
# Download artifacts
|
| 85 |
+
paths = {n: hf_hub_download(REPO, n) for n in [
|
| 86 |
+
"model_xgb.json", "model_mlp.safetensors",
|
| 87 |
+
"feature_engineering.py", "feature_meta.json", "feature_scaler.json",
|
| 88 |
+
]}
|
| 89 |
+
|
| 90 |
+
# Make feature pipeline importable
|
| 91 |
+
import sys, os
|
| 92 |
+
sys.path.insert(0, os.path.dirname(paths["feature_engineering.py"]))
|
| 93 |
+
from feature_engineering import transform_single, load_meta, INT_TO_LABEL
|
| 94 |
+
|
| 95 |
+
meta = load_meta(paths["feature_meta.json"])
|
| 96 |
+
xgb_model = xgb.XGBClassifier(); xgb_model.load_model(paths["model_xgb.json"])
|
| 97 |
+
|
| 98 |
+
# Predict. `my_flow_record_dict` is a placeholder for your own flow record (see inference_example.ipynb for a full single-record example)
|
| 99 |
+
X = transform_single(my_flow_record_dict, meta)
|
| 100 |
+
proba = xgb_model.predict_proba(X)[0]
|
| 101 |
+
print(INT_TO_LABEL[int(np.argmax(proba))])
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
See [`inference_example.ipynb`](./inference_example.ipynb) for a full
|
| 105 |
+
copy-paste demo including the MLP load path and a batch run on 200 rows
|
| 106 |
+
from the public sample.
|
| 107 |
+
|
| 108 |
+
## Training data
|
| 109 |
+
|
| 110 |
+
Trained on the public sample of CYB001 (9,770 flows). Per-label counts for the train and test splits:
|
| 111 |
+
|
| 112 |
+
| Label | Train (n=6,838) | Test (n=1,466) | Test share |
|
| 113 |
+
|---|---:|---:|---:|
|
| 114 |
+
| BENIGN | 4,916 | 1,054 | 71.9% |
|
| 115 |
+
| MALICIOUS | 1,378 | 295 | 20.1% |
|
| 116 |
+
| AMBIGUOUS | 544 | 117 | 8.0% |
|
| 117 |
+
|
| 118 |
+
Split: 70 / 15 / 15 stratified by label, seed 42.
|
| 119 |
+
|
| 120 |
+
Class imbalance was addressed with `class_weight='balanced'` (XGBoost
|
| 121 |
+
`sample_weight`) and weighted cross-entropy (MLP). Stratified splitting
|
| 122 |
+
preserves the class proportions in each split.
|
| 123 |
+
|
| 124 |
+
### Dataset calibration anchors
|
| 125 |
+
|
| 126 |
+
The CYB001 sample is calibrated to 12 named industry signatures. The
|
| 127 |
+
features that surface most prominently in the baseline correspond to
|
| 128 |
+
these anchors (six of the twelve are shown here):
|
| 129 |
+
|
| 130 |
+
| Calibrated signature | Target | Observed (sample) | Feature(s) the model uses |
|
| 131 |
+
|---|---:|---:|---|
|
| 132 |
+
| `c2_beacon_regularity_score` | 0.78 | 0.77 | `iat_cv`, `inter_arrival_time_std` |
|
| 133 |
+
| `payload_entropy_benign_mean` | 4.80 | 4.86 | `payload_entropy_mean` |
|
| 134 |
+
| `fwd_bwd_byte_ratio_benign` | 1.34 | 1.41 | `fwd_bwd_byte_ratio` |
|
| 135 |
+
| `malicious_flow_rate` | 0.172 | 0.202 | (class prior) |
|
| 136 |
+
| `protocol_violation_rate` | 0.015 | 0.016 | `protocol_violation_flag`, `protocol_violation_count` |
|
| 137 |
+
| `scan_probe_density` | 0.043 | 0.045 | `tcp_flag_anomaly_score`, port features |
|
| 138 |
+
|
| 139 |
+
Full benchmark table in the [dataset card](https://huggingface.co/datasets/xpertsystems/cyb001-sample).
|
| 140 |
+
|
| 141 |
+
## Feature pipeline
|
| 142 |
+
|
| 143 |
+
The bundled `feature_engineering.py` is the canonical feature recipe.
|
| 144 |
+
The training script and the inference example both call into it.
|
| 145 |
+
|
| 146 |
+
**Three columns are deliberately excluded** because they leak the label:
|
| 147 |
+
|
| 148 |
+
- `traffic_category` — fully determines the label (every `attack_*`
|
| 149 |
+
category is 100% MALICIOUS, etc.).
|
| 150 |
+
- `attack_subcategory` — non-null if and only if the label is MALICIOUS.
|
| 151 |
+
- `attacker_capability_tier` — generator metadata labeled per flow
|
| 152 |
+
including benign flows; not a real-world observable at inference time.
|
| 153 |
+
|
| 154 |
+
**Five session-level features were kept** after a per-label leakage audit
|
| 155 |
+
(`payload_entropy_mean`, `retransmission_rate`, `protocol_violation_count`,
|
| 156 |
+
`c2_beacon_flag`, `session_risk_score`) because their distributions
|
| 157 |
+
overlap meaningfully across labels (i.e. they behave like detector
|
| 158 |
+
outputs, not oracles). **Three were dropped** (`exfil_volume_bytes`,
|
| 159 |
+
`scan_probe_count`, `lateral_move_flag`) because they are zero for all
|
| 160 |
+
non-MALICIOUS rows.
|
| 161 |
+
|
| 162 |
+
Engineered features (each encodes a stated domain hypothesis, see source
|
| 163 |
+
for the one-line rationale per feature):
|
| 164 |
+
|
| 165 |
+
- `iat_cv` — inter-arrival-time coefficient of variation. C2 beacon signature.
|
| 166 |
+
- `fwd_bwd_byte_ratio` — exfiltration signature.
|
| 167 |
+
- `bytes_per_packet_fwd`, `payload_density` — flow shape.
|
| 168 |
+
- `tcp_flag_anomaly_score` — RST/URG/FIN density. Scan and protocol-misuse signature.
|
| 169 |
+
- `hour_of_day`, `is_off_hours` — diurnal pattern. APT and insider tiers are off-peak biased in the dataset calibration.
|
| 170 |
+
- `is_well_known_dest_port`, `is_ephemeral_src_port` — port observables.
|
| 171 |
+
|
| 172 |
+
## Evaluation
|
| 173 |
+
|
| 174 |
+
### Test-set metrics (n = 1,466, stratified)
|
| 175 |
+
|
| 176 |
+
**XGBoost**
|
| 177 |
+
|
| 178 |
+
| Metric | Value |
|
| 179 |
+
|---|---:|
|
| 180 |
+
| Accuracy | 0.9980 |
|
| 181 |
+
| Macro-F1 | 0.9961 |
|
| 182 |
+
| Weighted-F1 | 0.9980 |
|
| 183 |
+
| Macro ROC-AUC (OvR) | ≈ 1.00 |
|
| 184 |
+
|
| 185 |
+
| Class | F1 | Support |
|
| 186 |
+
|---|---:|---:|
|
| 187 |
+
| BENIGN | 0.9986 | 1,054 |
|
| 188 |
+
| MALICIOUS | 0.9983 | 295 |
|
| 189 |
+
| AMBIGUOUS | 0.9915 | 117 |
|
| 190 |
+
|
| 191 |
+
**MLP**
|
| 192 |
+
|
| 193 |
+
| Metric | Value |
|
| 194 |
+
|---|---:|
|
| 195 |
+
| Accuracy | 0.9932 |
|
| 196 |
+
| Macro-F1 | 0.9869 |
|
| 197 |
+
| Weighted-F1 | 0.9932 |
|
| 198 |
+
|
| 199 |
+
| Class | F1 | Support |
|
| 200 |
+
|---|---:|---:|
|
| 201 |
+
| BENIGN | 0.9962 | 1,054 |
|
| 202 |
+
| MALICIOUS | 0.9899 | 295 |
|
| 203 |
+
| AMBIGUOUS | 0.9746 | 117 |
|
| 204 |
+
|
| 205 |
+
Confusion matrices and per-class precision/recall are in
|
| 206 |
+
[`validation_results.json`](./validation_results.json).
|
| 207 |
+
|
| 208 |
+
### Ablation: contribution of session-level features
|
| 209 |
+
|
| 210 |
+
To check whether the model is genuinely reading the flow-level signal or
|
| 211 |
+
leaning on session aggregates, the same XGBoost configuration was trained
|
| 212 |
+
with all five session-aggregate features removed:
|
| 213 |
+
|
| 214 |
+
| Configuration | Accuracy | Macro-F1 | AMBIGUOUS F1 |
|
| 215 |
+
|---|---:|---:|---:|
|
| 216 |
+
| Full feature set (published) | 0.9980 | 0.9961 | 0.991 |
|
| 217 |
+
| Flow-only (session aggregates dropped) | 0.9884 | 0.9776 | 0.957 |
|
| 218 |
+
|
| 219 |
+
The session join contributes about **+1.0 pp** of accuracy and **+0.02**
|
| 220 |
+
macro-F1. The model is not session-dominated; the flow-level features
|
| 221 |
+
carry the bulk of the signal. The full numbers for both configurations
|
| 222 |
+
are in [`ablation_results.json`](./ablation_results.json).
|
| 223 |
+
|
| 224 |
+
### Architecture
|
| 225 |
+
|
| 226 |
+
**XGBoost:** multi-class gradient boosting (`multi:softprob`, 3 classes),
|
| 227 |
+
`hist` tree method, class-balanced sample weights, early stopping on
|
| 228 |
+
validation macro-F1.
|
| 229 |
+
|
| 230 |
+
**MLP:** `n_features → 128 → 64 → 3`, each hidden layer followed by
|
| 231 |
+
`BatchNorm1d` → `ReLU` → `Dropout(0.3)`, weighted cross-entropy loss,
|
| 232 |
+
AdamW optimizer, early stopping on validation macro-F1.
|
| 233 |
+
|
| 234 |
+
Training hyperparameters (learning rate, batch size, n_estimators,
|
| 235 |
+
early-stopping patience, weight decay, class-weighting strategy) are
|
| 236 |
+
held internally by XpertSystems and are not part of this release.
|
| 237 |
+
|
| 238 |
+
## Limitations
|
| 239 |
+
|
| 240 |
+
**This is a baseline reference, not an intrusion detection system.**
|
| 241 |
+
|
| 242 |
+
1. **Performance is inflated by synthetic structure.** The numbers above
|
| 243 |
+
reflect performance on calibrated synthetic data where the BENIGN and
|
| 244 |
+
attack categories sit on distinct statistical signatures by
|
| 245 |
+
construction. A real production IDS facing live traffic must contend
|
| 246 |
+
with concept drift, adversarial evasion, encrypted-traffic ambiguity,
|
| 247 |
+
and a much fatter long tail of benign behaviour. Expect substantial
|
| 248 |
+
degradation when transferring to real CICIDS-style datasets or
|
| 249 |
+
in-the-wild traffic.
|
| 250 |
+
|
| 251 |
+
2. **Sample size for `AMBIGUOUS` is small.** Only 117 test examples;
|
| 252 |
+
the per-class F1 has wide confidence bands. The full CYB001 product
|
| 253 |
+
(~62k AMBIGUOUS flows out of ~500k) supports more reliable estimation.
|
| 254 |
+
|
| 255 |
+
3. **Trained on the public 1/60th sample only.** The full product
|
| 256 |
+
contains additional traffic categories, longer sequences, and
|
| 257 |
+
richer adversary behaviour. A model trained on the full dataset
|
| 258 |
+
would perform differently — likely lower headline accuracy with
|
| 259 |
+
better calibration and generalisation. The intent of this release
|
| 260 |
+
is reference, not state-of-the-art.
|
| 261 |
+
|
| 262 |
+
4. **Topology features are static labels, not signals.** Fields like
|
| 263 |
+
`defender_architecture` and `firewall_policy` are descriptive
|
| 264 |
+
categorical attributes of the network segment, not learned defender
|
| 265 |
+
responses. They help the model condition on context but do not
|
| 266 |
+
simulate real adversarial dynamics.
|
| 267 |
+
|
| 268 |
+
5. **MLP brittleness on OOD inputs.** With ~7k training rows, the MLP
|
| 269 |
+
can produce confidently-wrong predictions on hand-crafted records
|
| 270 |
+
whose feature combinations are far from the training manifold. The
|
| 271 |
+
inference notebook demonstrates this. XGBoost is more robust here.
|
| 272 |
+
In practice, use both and treat disagreement as a signal for review.
|
| 273 |
+
|
| 274 |
+
6. **Class imbalance handling is straightforward.** Class-balanced
|
| 275 |
+
weights work for this sample but production-scale rare-class
|
| 276 |
+
detection (e.g. APT C2 at < 0.1% of traffic) needs more careful
|
| 277 |
+
threshold calibration, ranking metrics, and likely calibrated
|
| 278 |
+
probabilities rather than argmax classification.
|
| 279 |
+
|
| 280 |
+
## Intended use
|
| 281 |
+
|
| 282 |
+
- **Evaluating fit** of the CYB001 dataset for your IDS / NDR research
|
| 283 |
+
- **Baseline reference** for new model architectures on synthetic
|
| 284 |
+
network traffic
|
| 285 |
+
- **Teaching and demo** for tabular classification on flow-level features
|
| 286 |
+
- **Feature engineering reference** for CICFlowMeter-compatible fields
|
| 287 |
+
|
| 288 |
+
## Out-of-scope use
|
| 289 |
+
|
| 290 |
+
- Production intrusion detection on real network traffic
|
| 291 |
+
- Forensic attribution of real attacks
|
| 292 |
+
- Adversarial robustness evaluation (the dataset is not adversarially
|
| 293 |
+
generated)
|
| 294 |
+
- Any safety-critical decision
|
| 295 |
+
|
| 296 |
+
## Reproducibility
|
| 297 |
+
|
| 298 |
+
Outputs above were produced with `seed = 42`, stratified 70/15/15 split,
|
| 299 |
+
on the published sample (`xpertsystems/cyb001-sample`, version 1.0.0,
|
| 300 |
+
generated 2026-05-16). The feature pipeline in `feature_engineering.py`
|
| 301 |
+
is deterministic and the trained weights in this repo correspond exactly
|
| 302 |
+
to the metrics above.
|
| 303 |
+
|
| 304 |
+
The training script itself is private to XpertSystems. The published
|
| 305 |
+
artifacts contain the feature pipeline, model weights, scaler, metadata,
|
| 306 |
+
and validation results — sufficient to reproduce inference but not
|
| 307 |
+
training.
|
| 308 |
+
|
| 309 |
+
## Files in this repo
|
| 310 |
+
|
| 311 |
+
| File | Purpose |
|
| 312 |
+
|---|---|
|
| 313 |
+
| `model_xgb.json` | XGBoost weights |
|
| 314 |
+
| `model_mlp.safetensors` | PyTorch MLP weights |
|
| 315 |
+
| `feature_engineering.py` | Feature pipeline (load → engineer → encode) |
|
| 316 |
+
| `feature_meta.json` | Feature column order + categorical levels |
|
| 317 |
+
| `feature_scaler.json` | MLP input mean/std (XGBoost ignores) |
|
| 318 |
+
| `validation_results.json` | Per-class metrics, confusion matrix, architecture |
|
| 319 |
+
| `ablation_results.json` | Flow-only vs full feature set comparison |
|
| 320 |
+
| `inference_example.ipynb` | End-to-end inference demo notebook |
|
| 321 |
+
| `README.md` | This file |
|
| 322 |
+
|
| 323 |
+
## Contact and full product
|
| 324 |
+
|
| 325 |
+
The full **CYB001** dataset contains ~685,000 rows across four files
|
| 326 |
+
with calibrated A+ benchmark validation. The full XpertSystems.ai
|
| 327 |
+
synthetic data catalogue spans 41 SKUs across Cybersecurity, Healthcare,
|
| 328 |
+
Insurance & Risk, Oil & Gas, and Materials & Energy.
|
| 329 |
+
|
| 330 |
+
- 📧 **pradeep@xpertsystems.ai**
|
| 331 |
+
- 🌐 **https://xpertsystems.ai**
|
| 332 |
+
- 🗂 Dataset: https://huggingface.co/datasets/xpertsystems/cyb001-sample
|
| 333 |
+
|
| 334 |
+
## Citation
|
| 335 |
+
|
| 336 |
+
```bibtex
|
| 337 |
+
@misc{xpertsystems_cyb001_baseline_2026,
|
| 338 |
+
title = {CYB001 Baseline Classifier: XGBoost and MLP for Synthetic Network Flow Classification},
|
| 339 |
+
author = {XpertSystems.ai},
|
| 340 |
+
year = {2026},
|
| 341 |
+
url = {https://huggingface.co/xpertsystems/cyb001-baseline-classifier},
|
| 342 |
+
note = {Baseline reference model trained on xpertsystems/cyb001-sample}
|
| 343 |
+
}
|
| 344 |
+
```
|
ablation_results.json
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"purpose": "Quantify how much the session-aggregate features contribute to the headline number. Trained with identical architecture on the same split, with session features dropped.",
|
| 3 |
+
"session_features_dropped": [
|
| 4 |
+
"payload_entropy_mean",
|
| 5 |
+
"retransmission_rate",
|
| 6 |
+
"protocol_violation_count",
|
| 7 |
+
"c2_beacon_flag",
|
| 8 |
+
"session_risk_score"
|
| 9 |
+
],
|
| 10 |
+
"n_features_full": 101,
|
| 11 |
+
"n_features_flow_only": 96,
|
| 12 |
+
"full_model_metrics": {
|
| 13 |
+
"model": "xgboost",
|
| 14 |
+
"accuracy": 0.9979536152796725,
|
| 15 |
+
"macro_f1": 0.9961123729105247,
|
| 16 |
+
"weighted_f1": 0.9979537067605843,
|
| 17 |
+
"per_class_f1": {
|
| 18 |
+
"BENIGN": 0.9985761746559089,
|
| 19 |
+
"MALICIOUS": 0.9983079526226735,
|
| 20 |
+
"AMBIGUOUS": 0.9914529914529915
|
| 21 |
+
},
|
| 22 |
+
"confusion_matrix": {
|
| 23 |
+
"labels": [
|
| 24 |
+
"BENIGN",
|
| 25 |
+
"MALICIOUS",
|
| 26 |
+
"AMBIGUOUS"
|
| 27 |
+
],
|
| 28 |
+
"matrix": [
|
| 29 |
+
[
|
| 30 |
+
1052,
|
| 31 |
+
1,
|
| 32 |
+
1
|
| 33 |
+
],
|
| 34 |
+
[
|
| 35 |
+
0,
|
| 36 |
+
295,
|
| 37 |
+
0
|
| 38 |
+
],
|
| 39 |
+
[
|
| 40 |
+
1,
|
| 41 |
+
0,
|
| 42 |
+
116
|
| 43 |
+
]
|
| 44 |
+
]
|
| 45 |
+
},
|
| 46 |
+
"macro_roc_auc_ovr": 0.9999888611978185
|
| 47 |
+
},
|
| 48 |
+
"flow_only_model_metrics": {
|
| 49 |
+
"model": "xgboost_flow_only",
|
| 50 |
+
"accuracy": 0.9884038199181446,
|
| 51 |
+
"macro_f1": 0.9776308066176851,
|
| 52 |
+
"weighted_f1": 0.9883464558152856,
|
| 53 |
+
"per_class_f1": {
|
| 54 |
+
"BENIGN": 0.9933774834437086,
|
| 55 |
+
"MALICIOUS": 0.9829931972789115,
|
| 56 |
+
"AMBIGUOUS": 0.9565217391304348
|
| 57 |
+
},
|
| 58 |
+
"confusion_matrix": {
|
| 59 |
+
"labels": [
|
| 60 |
+
"BENIGN",
|
| 61 |
+
"MALICIOUS",
|
| 62 |
+
"AMBIGUOUS"
|
| 63 |
+
],
|
| 64 |
+
"matrix": [
|
| 65 |
+
[
|
| 66 |
+
1050,
|
| 67 |
+
2,
|
| 68 |
+
2
|
| 69 |
+
],
|
| 70 |
+
[
|
| 71 |
+
5,
|
| 72 |
+
289,
|
| 73 |
+
1
|
| 74 |
+
],
|
| 75 |
+
[
|
| 76 |
+
5,
|
| 77 |
+
2,
|
| 78 |
+
110
|
| 79 |
+
]
|
| 80 |
+
]
|
| 81 |
+
},
|
| 82 |
+
"macro_roc_auc_ovr": 0.9988745635051176
|
| 83 |
+
},
|
| 84 |
+
"interpretation": "Removing session aggregates costs roughly 1 percentage point of accuracy. The model is not session-dominated; the flow-level features carry the bulk of the signal."
|
| 85 |
+
}
|
feature_engineering.py
ADDED
|
@@ -0,0 +1,363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
feature_engineering.py
|
| 3 |
+
======================
|
| 4 |
+
|
| 5 |
+
Feature pipeline for the CYB001 baseline classifier.
|
| 6 |
+
|
| 7 |
+
This module produces a flow-level feature matrix and label vector from the
|
| 8 |
+
four CSV files distributed with the CYB001 sample dataset on Hugging Face:
|
| 9 |
+
|
| 10 |
+
network_flows.csv (primary, one row per flow)
|
| 11 |
+
session_summary.csv (one row per session, joined on session_id)
|
| 12 |
+
network_topology.csv (one row per network segment, joined on segment_id)
|
| 13 |
+
flow_events.csv (one row per security event - NOT used for v1
|
| 14 |
+
features; events lose temporal granularity if
|
| 15 |
+
aggregated naively. Reserved for future work.)
|
| 16 |
+
|
| 17 |
+
The pipeline is deliberately written to be read end-to-end. Every dropped
|
| 18 |
+
column is dropped with a one-line explanation. Every engineered feature
|
| 19 |
+
sits next to a one-sentence motivation. If you are evaluating the CYB001
|
| 20 |
+
product, this file is the feature recipe; what the model "sees" is exactly
|
| 21 |
+
what this file emits.
|
| 22 |
+
|
| 23 |
+
Public API
|
| 24 |
+
----------
|
| 25 |
+
build_features(flows_path, sessions_path, topology_path) -> (X, y, meta)
|
| 26 |
+
|
| 27 |
+
X : pd.DataFrame - feature matrix, all numeric, no NaNs
|
| 28 |
+
y : pd.Series - integer-encoded label (0=BENIGN, 1=MALICIOUS, 2=AMBIGUOUS)
|
| 29 |
+
meta : dict - {feature_names, label_encoder, categorical_levels}
|
| 30 |
+
|
| 31 |
+
The same `meta` dict is used at inference time so a new flow record gets
|
| 32 |
+
encoded identically to training.
|
| 33 |
+
|
| 34 |
+
transform_single(record, meta) -> np.ndarray
|
| 35 |
+
|
| 36 |
+
Encode a single flow record (dict or 1-row DataFrame) for inference.
|
| 37 |
+
|
| 38 |
+
License
|
| 39 |
+
-------
|
| 40 |
+
This file ships with the public model on Hugging Face under CC-BY-NC-4.0,
|
| 41 |
+
matching the dataset license. See README.md.
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
from __future__ import annotations
|
| 45 |
+
|
| 46 |
+
import json
|
| 47 |
+
from pathlib import Path
|
| 48 |
+
from typing import Any
|
| 49 |
+
|
| 50 |
+
import numpy as np
|
| 51 |
+
import pandas as pd
|
| 52 |
+
|
| 53 |
+
# ---------------------------------------------------------------------------
|
| 54 |
+
# Constants - what we keep, what we drop, and why
|
| 55 |
+
# ---------------------------------------------------------------------------
|
| 56 |
+
|
| 57 |
+
LABEL_ORDER = ["BENIGN", "MALICIOUS", "AMBIGUOUS"] # index 0, 1, 2
|
| 58 |
+
LABEL_TO_INT = {lbl: i for i, lbl in enumerate(LABEL_ORDER)}
|
| 59 |
+
INT_TO_LABEL = {i: lbl for lbl, i in LABEL_TO_INT.items()}
|
| 60 |
+
|
| 61 |
+
# Columns dropped from network_flows.csv because they are ground-truth
|
| 62 |
+
# generator metadata, not observables a real IDS would have at inference time.
|
| 63 |
+
# Including any of these gives perfect or near-perfect accuracy that does
|
| 64 |
+
# not reflect real-world performance.
|
| 65 |
+
LEAKY_FLOW_COLUMNS = [
|
| 66 |
+
"traffic_category", # 100% deterministic of label (attack_*/benign_*/ambiguous_*)
|
| 67 |
+
"attack_subcategory", # null iff label != MALICIOUS
|
| 68 |
+
"attacker_capability_tier", # labeled per flow including benign - generator metadata
|
| 69 |
+
]
|
| 70 |
+
|
| 71 |
+
# Identifier / non-feature columns
|
| 72 |
+
ID_COLUMNS = [
|
| 73 |
+
"flow_id", "session_id",
|
| 74 |
+
"source_ip_hash", "destination_ip_hash", # SHA-256 pseudonyms, not useful as features
|
| 75 |
+
"flow_start_timestamp", # consumed by is_off_hours engineered feature
|
| 76 |
+
]
|
| 77 |
+
|
| 78 |
+
# Direct numeric features from network_flows.csv (pass-through)
|
| 79 |
+
DIRECT_NUMERIC_FLOW_FEATURES = [
|
| 80 |
+
"source_port", "dest_port",
|
| 81 |
+
"flow_duration_ms",
|
| 82 |
+
"total_fwd_packets", "total_bwd_packets",
|
| 83 |
+
"total_bytes_fwd", "total_bytes_bwd",
|
| 84 |
+
"fwd_packet_len_mean", "fwd_packet_len_std",
|
| 85 |
+
"bwd_packet_len_mean", "bwd_packet_len_std",
|
| 86 |
+
"flow_bytes_per_sec", "flow_packets_per_sec",
|
| 87 |
+
"inter_arrival_time_mean", "inter_arrival_time_std",
|
| 88 |
+
"tcp_flag_syn_count", "tcp_flag_ack_count", "tcp_flag_fin_count",
|
| 89 |
+
"tcp_flag_rst_count", "tcp_flag_psh_count", "tcp_flag_urg_count",
|
| 90 |
+
"retransmission_flag", "fragmentation_flag", "protocol_violation_flag",
|
| 91 |
+
]
|
| 92 |
+
|
| 93 |
+
# Session-level numeric features (joined on session_id).
|
| 94 |
+
# Selected after a per-label leakage audit:
|
| 95 |
+
# KEEP: payload_entropy_mean, retransmission_rate, protocol_violation_count,
|
| 96 |
+
# c2_beacon_flag, session_risk_score (overlapping distributions across labels)
|
| 97 |
+
# DROP: exfil_volume_bytes, scan_probe_count, lateral_move_flag
|
| 98 |
+
# (zero for all BENIGN/AMBIGUOUS - generator oracles, not detector outputs)
|
| 99 |
+
SESSION_FEATURES_KEEP = [
|
| 100 |
+
"payload_entropy_mean",
|
| 101 |
+
"retransmission_rate",
|
| 102 |
+
"protocol_violation_count",
|
| 103 |
+
"c2_beacon_flag",
|
| 104 |
+
"session_risk_score",
|
| 105 |
+
]
|
| 106 |
+
|
| 107 |
+
# Topology-level numeric features (joined on segment_id)
|
| 108 |
+
TOPOLOGY_NUMERIC_FEATURES = [
|
| 109 |
+
"trust_level", "avg_concurrent_flows", "bandwidth_mbps",
|
| 110 |
+
"nat_enabled", "ids_coverage", "diurnal_peak_factor",
|
| 111 |
+
"feature_space_dim", "alert_threshold",
|
| 112 |
+
"retraining_cadence_days", "ensemble_size", "device_count",
|
| 113 |
+
]
|
| 114 |
+
|
| 115 |
+
# Categorical columns that get one-hot encoded
|
| 116 |
+
CATEGORICAL_FEATURES = [
|
| 117 |
+
("protocol", "flows"), # TCP / UDP / HTTPS / DNS / SMTP / SSH / FTP / NTP
|
| 118 |
+
("flow_lifecycle_phase", "flows"), # initiation / handshake / transfer / ...
|
| 119 |
+
("source_device_type", "flows"), # workstation / server / iot / mobile / cloud / ot
|
| 120 |
+
("dest_device_type", "flows"),
|
| 121 |
+
("segment_type", "topology"), # corporate_lan / dmz / cloud_workload / ...
|
| 122 |
+
("firewall_policy", "topology"),
|
| 123 |
+
("qos_policy", "topology"),
|
| 124 |
+
("defender_architecture","topology"),
|
| 125 |
+
]
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# ---------------------------------------------------------------------------
|
| 129 |
+
# Engineered features
|
| 130 |
+
# ---------------------------------------------------------------------------
|
| 131 |
+
|
| 132 |
+
def _safe_divide(num: pd.Series, denom: pd.Series, fill: float = 0.0) -> pd.Series:
|
| 133 |
+
"""Element-wise num / denom; zero denominators and any NaN/inf result (including NaN inputs) become `fill`."""
|
| 134 |
+
out = num / denom.replace(0, np.nan)
|
| 135 |
+
return out.replace([np.inf, -np.inf], np.nan).fillna(fill)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _add_engineered_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `df` with nine hand-written feature columns appended.

    The columns are fixed formulas, not learned transforms, so the function
    doubles as human-readable documentation of what the models are pointed
    at: `iat_cv`, `fwd_bwd_byte_ratio`, `bytes_per_packet_fwd`,
    `tcp_flag_anomaly_score`, `payload_density`, `hour_of_day`,
    `is_off_hours`, `is_well_known_dest_port`, `is_ephemeral_src_port`.

    The input frame is not modified. It must already contain the raw flow
    columns read below (packet/byte counters, inter-arrival stats, TCP flag
    counts, `flow_start_timestamp`, `source_port`, `dest_port`).
    """
    out = df.copy()

    # --- timing regularity -------------------------------------------------
    # Coefficient of variation of inter-arrival times (std / mean). A low
    # value means evenly spaced packets, the hypothesised C2-beacon signature.
    out["iat_cv"] = _safe_divide(
        out["inter_arrival_time_std"], out["inter_arrival_time_mean"]
    )

    # --- direction asymmetry -----------------------------------------------
    # Forward bytes over backward bytes; values well above 1 mean an
    # upload-heavy flow, the hypothesised exfiltration signature.
    out["fwd_bwd_byte_ratio"] = _safe_divide(
        out["total_bytes_fwd"], out["total_bytes_bwd"]
    )

    # --- packet sizing -----------------------------------------------------
    # Mean forward payload per packet; flows with zero forward packets get 0.
    fwd_pkts = out["total_fwd_packets"].replace(0, np.nan)
    out["bytes_per_packet_fwd"] = (out["total_bytes_fwd"] / fwd_pkts).fillna(0)

    # Packet count over both directions, with 0 masked to NaN so that the two
    # ratios below fall back to 0 instead of dividing by zero.
    all_pkts = (out["total_fwd_packets"] + out["total_bwd_packets"]).replace(
        0, np.nan
    )

    # Share of packets carrying RST, URG or FIN flags (scan/probe or protocol
    # misuse hypothesis).
    unusual_flags = (
        out["tcp_flag_rst_count"]
        + out["tcp_flag_urg_count"]
        + out["tcp_flag_fin_count"]
    )
    out["tcp_flag_anomaly_score"] = (unusual_flags / all_pkts).fillna(0)

    # Bytes per packet expressed as a fraction of a 1500-byte MTU. Low density
    # on many packets is the beaconing / keep-alive hypothesis.
    all_bytes = out["total_bytes_fwd"] + out["total_bytes_bwd"]
    out["payload_density"] = (all_bytes / (all_pkts * 1500)).fillna(0)

    # --- time of day -------------------------------------------------------
    # Unparseable timestamps become NaT and are assigned hour 12 (midday), so
    # they never count as off-hours.
    started = pd.to_datetime(out["flow_start_timestamp"], errors="coerce")
    hour_of_day = started.dt.hour.fillna(12).astype(int)
    out["hour_of_day"] = hour_of_day
    # Off-hours means hour 23 or hours 0-5, i.e. anything outside 06..22.
    out["is_off_hours"] = (~hour_of_day.between(6, 22)).astype(int)

    # --- port classes ------------------------------------------------------
    # Well-known destination ports are < 1024; ephemeral source ports are
    # >= 49152.
    out["is_well_known_dest_port"] = out["dest_port"].lt(1024).astype(int)
    out["is_ephemeral_src_port"] = out["source_port"].ge(49152).astype(int)

    return out
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
# ---------------------------------------------------------------------------
|
| 191 |
+
# Public API
|
| 192 |
+
# ---------------------------------------------------------------------------
|
| 193 |
+
|
| 194 |
+
def build_features(
    flows_path: str | Path,
    sessions_path: str | Path,
    topology_path: str | Path,
) -> tuple[pd.DataFrame, pd.Series, dict[str, Any]]:
    """
    Load the three CSVs, join them, drop leaky columns, engineer features,
    one-hot encode categoricals, and return (X, y, meta).

    Args:
        flows_path: CSV of flow records. Must provide the `session_id` and
            `segment_id` join keys and the `label` column.
        sessions_path: CSV of per-session aggregates keyed by `session_id`.
        topology_path: CSV of per-segment attributes keyed by `segment_id`.

    Returns:
        X: float/int feature matrix, numeric columns first, then one one-hot
            block per entry of CATEGORICAL_FEATURES. Contains no NaN.
        y: integer class codes obtained through LABEL_TO_INT.
        meta: column order, numeric feature list, categorical level sets and
            the label mappings, i.e. what `transform_single` needs at
            inference time to encode a new record identically.

    Raises:
        ValueError: if any row's `label` is missing or not a key of
            LABEL_TO_INT.
    """
    flows = pd.read_csv(flows_path)
    sessions = pd.read_csv(sessions_path)
    topology = pd.read_csv(topology_path)

    # Drop columns that leak the label (see LEAKY_FLOW_COLUMNS for rationale)
    flows = flows.drop(columns=LEAKY_FLOW_COLUMNS, errors="ignore")

    # Join session-level aggregates.
    # NOTE(review): a left join repeats a flow row once per matching
    # right-hand row, so this assumes session_id is unique in the session
    # table (and segment_id in the topology table below) - confirm upstream.
    df = flows.merge(
        sessions[["session_id"] + SESSION_FEATURES_KEEP],
        on="session_id", how="left",
    )

    # Join topology features (numeric + categorical)
    topo_cols = ["segment_id"] + TOPOLOGY_NUMERIC_FEATURES + [
        col for col, src in CATEGORICAL_FEATURES if src == "topology"
    ]
    df = df.merge(topology[topo_cols], on="segment_id", how="left")

    # Extract labels before adding features. Series.map leaves NaN for any
    # value that is not a key of LABEL_TO_INT; fail with a message that names
    # the offending labels instead of an opaque NaN-to-int cast error.
    encoded = df["label"].map(LABEL_TO_INT)
    unmapped = encoded.isna()
    if unmapped.any():
        offenders = sorted(df.loc[unmapped, "label"].astype(str).unique())
        raise ValueError(
            f"{int(unmapped.sum())} row(s) have a label outside "
            f"{list(LABEL_TO_INT)}: {offenders}"
        )
    y = encoded.astype(int)

    # Engineered features
    df = _add_engineered_features(df)

    # Assemble feature columns
    numeric_features = (
        DIRECT_NUMERIC_FLOW_FEATURES
        + SESSION_FEATURES_KEEP
        + TOPOLOGY_NUMERIC_FEATURES
        + [
            "iat_cv", "fwd_bwd_byte_ratio", "bytes_per_packet_fwd",
            "tcp_flag_anomaly_score", "payload_density",
            "hour_of_day", "is_off_hours",
            "is_well_known_dest_port", "is_ephemeral_src_port",
        ]
    )

    X_numeric = df[numeric_features].astype(float)

    # One-hot encode categoricals. Record the level set in `meta` so we can
    # reproduce the same columns at inference time even if a new record
    # contains an unseen level (it will encode to all-zero, which is the
    # correct fallback for one-hot).
    categorical_levels: dict[str, list[str]] = {}
    one_hot_blocks: list[pd.DataFrame] = []
    for col, _src in CATEGORICAL_FEATURES:
        levels = sorted(df[col].dropna().unique().tolist())
        categorical_levels[col] = levels
        # Pinning the categories makes get_dummies emit one column per level
        # in sorted order, regardless of the order values appear in the data.
        block = pd.get_dummies(
            df[col].astype("category").cat.set_categories(levels),
            prefix=col, dummy_na=False,
        ).astype(int)
        one_hot_blocks.append(block)

    # Reset every index so the blocks are glued together by row position.
    X = pd.concat(
        [X_numeric.reset_index(drop=True)]
        + [b.reset_index(drop=True) for b in one_hot_blocks],
        axis=1,
    )

    # Final NaN sweep (defensive - session join can introduce NaN if a
    # session_id is missing from session_summary.csv).
    X = X.fillna(0.0)

    meta = {
        "feature_names": X.columns.tolist(),
        "numeric_features": numeric_features,
        "categorical_levels": categorical_levels,
        "label_to_int": LABEL_TO_INT,
        "int_to_label": INT_TO_LABEL,
    }

    return X, y, meta
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def transform_single(record: dict | pd.DataFrame, meta: dict[str, Any]) -> np.ndarray:
    """
    Encode one flow record (or a small batch of them) for inference.

    `record` must contain the same columns as network_flows.csv (minus the
    leaky columns), plus the joined session and topology fields. If you only
    have the flow row, you must look up the matching session_summary row and
    network_topology row and merge them into `record` before calling this.

    Args:
        record: a dict describing one flow, or a DataFrame with one row per
            flow. The DataFrame's own index is ignored, so a slice of a
            larger frame (e.g. `df.iloc[[57]]`) is encoded correctly.
        meta: the dict produced by `build_features` / `load_meta`; supplies
            `numeric_features`, `categorical_levels` and `feature_names`.

    Returns:
        A float32 array of shape (n_rows, n_features), i.e. (1, n_features)
        for a dict, with columns in `meta["feature_names"]` order, ready for
        model.predict_proba.
    """
    if isinstance(record, dict):
        df = pd.DataFrame([record])
    else:
        # Drop the caller's index. The numeric block below is rebuilt on a
        # fresh 0..n-1 index, and pd.concat(axis=1) aligns on index labels;
        # keeping a non-default index here would outer-join the blocks into
        # extra, half-zero rows.
        df = record.reset_index(drop=True)

    df = _add_engineered_features(df)

    # Numeric features in fixed order; a column missing from the record is
    # encoded as 0.0.
    numeric = pd.DataFrame({
        col: df.get(col, pd.Series([0.0] * len(df))).astype(float).values
        for col in meta["numeric_features"]
    })

    # One-hot blocks in fixed order, using the levels seen at fit time. A
    # missing column or an unseen level encodes to an all-zero block.
    blocks: list[pd.DataFrame] = [numeric]
    for col, levels in meta["categorical_levels"].items():
        val = df.get(col, pd.Series([None] * len(df)))
        block = pd.get_dummies(
            val.astype("category").cat.set_categories(levels),
            prefix=col, dummy_na=False,
        ).astype(int)
        # Guarantee every expected level column exists, in fit-time order.
        expected = [f"{col}_{lvl}" for lvl in levels]
        blocks.append(block.reindex(columns=expected, fill_value=0))

    X = pd.concat(blocks, axis=1).fillna(0.0)

    # Reorder to match training column order exactly
    X = X.reindex(columns=meta["feature_names"], fill_value=0.0)
    return X.values.astype(np.float32)
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def save_meta(meta: dict[str, Any], path: str | Path) -> None:
|
| 329 |
+
"""Persist meta to JSON for inference-time reuse."""
|
| 330 |
+
serializable = {
|
| 331 |
+
"feature_names": meta["feature_names"],
|
| 332 |
+
"numeric_features": meta["numeric_features"],
|
| 333 |
+
"categorical_levels": meta["categorical_levels"],
|
| 334 |
+
"label_to_int": meta["label_to_int"],
|
| 335 |
+
"int_to_label": {str(k): v for k, v in meta["int_to_label"].items()},
|
| 336 |
+
}
|
| 337 |
+
with open(path, "w") as f:
|
| 338 |
+
json.dump(serializable, f, indent=2)
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def load_meta(path: str | Path) -> dict[str, Any]:
|
| 342 |
+
"""Load meta from JSON."""
|
| 343 |
+
with open(path) as f:
|
| 344 |
+
meta = json.load(f)
|
| 345 |
+
meta["int_to_label"] = {int(k): v for k, v in meta["int_to_label"].items()}
|
| 346 |
+
return meta
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
if __name__ == "__main__":
    # Smoke test: build the feature matrix from the three CSVs found in the
    # directory given as the first CLI argument (or the default upload
    # directory) and print a few sanity checks.
    import sys

    cli_args = sys.argv[1:]
    base = Path(cli_args[0]) if cli_args else Path("/mnt/user-data/uploads")
    flows_csv, sessions_csv, topology_csv = (
        base / csv_name
        for csv_name in (
            "network_flows.csv",
            "session_summary.csv",
            "network_topology.csv",
        )
    )

    X, y, meta = build_features(flows_csv, sessions_csv, topology_csv)

    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    print(f"n features: {len(meta['feature_names'])}")
    print(f"label distribution:\n{y.map(INT_TO_LABEL).value_counts()}")
    print(f"X dtypes unique: {X.dtypes.unique()}")
    print(f"X has NaN: {X.isnull().any().any()}")
|
feature_meta.json
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"feature_names": [
|
| 3 |
+
"source_port",
|
| 4 |
+
"dest_port",
|
| 5 |
+
"flow_duration_ms",
|
| 6 |
+
"total_fwd_packets",
|
| 7 |
+
"total_bwd_packets",
|
| 8 |
+
"total_bytes_fwd",
|
| 9 |
+
"total_bytes_bwd",
|
| 10 |
+
"fwd_packet_len_mean",
|
| 11 |
+
"fwd_packet_len_std",
|
| 12 |
+
"bwd_packet_len_mean",
|
| 13 |
+
"bwd_packet_len_std",
|
| 14 |
+
"flow_bytes_per_sec",
|
| 15 |
+
"flow_packets_per_sec",
|
| 16 |
+
"inter_arrival_time_mean",
|
| 17 |
+
"inter_arrival_time_std",
|
| 18 |
+
"tcp_flag_syn_count",
|
| 19 |
+
"tcp_flag_ack_count",
|
| 20 |
+
"tcp_flag_fin_count",
|
| 21 |
+
"tcp_flag_rst_count",
|
| 22 |
+
"tcp_flag_psh_count",
|
| 23 |
+
"tcp_flag_urg_count",
|
| 24 |
+
"retransmission_flag",
|
| 25 |
+
"fragmentation_flag",
|
| 26 |
+
"protocol_violation_flag",
|
| 27 |
+
"payload_entropy_mean",
|
| 28 |
+
"retransmission_rate",
|
| 29 |
+
"protocol_violation_count",
|
| 30 |
+
"c2_beacon_flag",
|
| 31 |
+
"session_risk_score",
|
| 32 |
+
"trust_level",
|
| 33 |
+
"avg_concurrent_flows",
|
| 34 |
+
"bandwidth_mbps",
|
| 35 |
+
"nat_enabled",
|
| 36 |
+
"ids_coverage",
|
| 37 |
+
"diurnal_peak_factor",
|
| 38 |
+
"feature_space_dim",
|
| 39 |
+
"alert_threshold",
|
| 40 |
+
"retraining_cadence_days",
|
| 41 |
+
"ensemble_size",
|
| 42 |
+
"device_count",
|
| 43 |
+
"iat_cv",
|
| 44 |
+
"fwd_bwd_byte_ratio",
|
| 45 |
+
"bytes_per_packet_fwd",
|
| 46 |
+
"tcp_flag_anomaly_score",
|
| 47 |
+
"payload_density",
|
| 48 |
+
"hour_of_day",
|
| 49 |
+
"is_off_hours",
|
| 50 |
+
"is_well_known_dest_port",
|
| 51 |
+
"is_ephemeral_src_port",
|
| 52 |
+
"protocol_DNS",
|
| 53 |
+
"protocol_FTP",
|
| 54 |
+
"protocol_HTTPS",
|
| 55 |
+
"protocol_NTP",
|
| 56 |
+
"protocol_SMTP",
|
| 57 |
+
"protocol_SSH",
|
| 58 |
+
"protocol_TCP",
|
| 59 |
+
"protocol_UDP",
|
| 60 |
+
"flow_lifecycle_phase_connection_initiation",
|
| 61 |
+
"flow_lifecycle_phase_connection_teardown",
|
| 62 |
+
"flow_lifecycle_phase_data_transfer",
|
| 63 |
+
"flow_lifecycle_phase_protocol_handshake",
|
| 64 |
+
"flow_lifecycle_phase_session_maintenance",
|
| 65 |
+
"source_device_type_cloud_service",
|
| 66 |
+
"source_device_type_iot_device",
|
| 67 |
+
"source_device_type_mobile_endpoint",
|
| 68 |
+
"source_device_type_ot_controller",
|
| 69 |
+
"source_device_type_server",
|
| 70 |
+
"source_device_type_workstation",
|
| 71 |
+
"dest_device_type_cloud_service",
|
| 72 |
+
"dest_device_type_iot_device",
|
| 73 |
+
"dest_device_type_mobile_endpoint",
|
| 74 |
+
"dest_device_type_ot_controller",
|
| 75 |
+
"dest_device_type_server",
|
| 76 |
+
"dest_device_type_workstation",
|
| 77 |
+
"segment_type_cloud_workload",
|
| 78 |
+
"segment_type_corporate_lan",
|
| 79 |
+
"segment_type_data_centre_spine",
|
| 80 |
+
"segment_type_dmz_perimeter",
|
| 81 |
+
"segment_type_endpoint_fleet",
|
| 82 |
+
"segment_type_guest_wifi",
|
| 83 |
+
"segment_type_ot_ics_control_network",
|
| 84 |
+
"segment_type_soc_management_plane",
|
| 85 |
+
"segment_type_zero_trust_segment",
|
| 86 |
+
"firewall_policy_default_deny",
|
| 87 |
+
"firewall_policy_open_permissive",
|
| 88 |
+
"firewall_policy_stateful_inspection",
|
| 89 |
+
"firewall_policy_strict_allowlist",
|
| 90 |
+
"firewall_policy_zone_based",
|
| 91 |
+
"qos_policy_best_effort",
|
| 92 |
+
"qos_policy_dscp_expedited",
|
| 93 |
+
"qos_policy_none",
|
| 94 |
+
"qos_policy_priority_queue",
|
| 95 |
+
"qos_policy_weighted_fair_queue",
|
| 96 |
+
"defender_architecture_autoencoder_anomaly",
|
| 97 |
+
"defender_architecture_ensemble_stacked",
|
| 98 |
+
"defender_architecture_gradient_boosted_tree",
|
| 99 |
+
"defender_architecture_isolation_forest",
|
| 100 |
+
"defender_architecture_lstm_behavioural",
|
| 101 |
+
"defender_architecture_neural_network_dense",
|
| 102 |
+
"defender_architecture_rule_based_threshold",
|
| 103 |
+
"defender_architecture_transformer_sequence"
|
| 104 |
+
],
|
| 105 |
+
"numeric_features": [
|
| 106 |
+
"source_port",
|
| 107 |
+
"dest_port",
|
| 108 |
+
"flow_duration_ms",
|
| 109 |
+
"total_fwd_packets",
|
| 110 |
+
"total_bwd_packets",
|
| 111 |
+
"total_bytes_fwd",
|
| 112 |
+
"total_bytes_bwd",
|
| 113 |
+
"fwd_packet_len_mean",
|
| 114 |
+
"fwd_packet_len_std",
|
| 115 |
+
"bwd_packet_len_mean",
|
| 116 |
+
"bwd_packet_len_std",
|
| 117 |
+
"flow_bytes_per_sec",
|
| 118 |
+
"flow_packets_per_sec",
|
| 119 |
+
"inter_arrival_time_mean",
|
| 120 |
+
"inter_arrival_time_std",
|
| 121 |
+
"tcp_flag_syn_count",
|
| 122 |
+
"tcp_flag_ack_count",
|
| 123 |
+
"tcp_flag_fin_count",
|
| 124 |
+
"tcp_flag_rst_count",
|
| 125 |
+
"tcp_flag_psh_count",
|
| 126 |
+
"tcp_flag_urg_count",
|
| 127 |
+
"retransmission_flag",
|
| 128 |
+
"fragmentation_flag",
|
| 129 |
+
"protocol_violation_flag",
|
| 130 |
+
"payload_entropy_mean",
|
| 131 |
+
"retransmission_rate",
|
| 132 |
+
"protocol_violation_count",
|
| 133 |
+
"c2_beacon_flag",
|
| 134 |
+
"session_risk_score",
|
| 135 |
+
"trust_level",
|
| 136 |
+
"avg_concurrent_flows",
|
| 137 |
+
"bandwidth_mbps",
|
| 138 |
+
"nat_enabled",
|
| 139 |
+
"ids_coverage",
|
| 140 |
+
"diurnal_peak_factor",
|
| 141 |
+
"feature_space_dim",
|
| 142 |
+
"alert_threshold",
|
| 143 |
+
"retraining_cadence_days",
|
| 144 |
+
"ensemble_size",
|
| 145 |
+
"device_count",
|
| 146 |
+
"iat_cv",
|
| 147 |
+
"fwd_bwd_byte_ratio",
|
| 148 |
+
"bytes_per_packet_fwd",
|
| 149 |
+
"tcp_flag_anomaly_score",
|
| 150 |
+
"payload_density",
|
| 151 |
+
"hour_of_day",
|
| 152 |
+
"is_off_hours",
|
| 153 |
+
"is_well_known_dest_port",
|
| 154 |
+
"is_ephemeral_src_port"
|
| 155 |
+
],
|
| 156 |
+
"categorical_levels": {
|
| 157 |
+
"protocol": [
|
| 158 |
+
"DNS",
|
| 159 |
+
"FTP",
|
| 160 |
+
"HTTPS",
|
| 161 |
+
"NTP",
|
| 162 |
+
"SMTP",
|
| 163 |
+
"SSH",
|
| 164 |
+
"TCP",
|
| 165 |
+
"UDP"
|
| 166 |
+
],
|
| 167 |
+
"flow_lifecycle_phase": [
|
| 168 |
+
"connection_initiation",
|
| 169 |
+
"connection_teardown",
|
| 170 |
+
"data_transfer",
|
| 171 |
+
"protocol_handshake",
|
| 172 |
+
"session_maintenance"
|
| 173 |
+
],
|
| 174 |
+
"source_device_type": [
|
| 175 |
+
"cloud_service",
|
| 176 |
+
"iot_device",
|
| 177 |
+
"mobile_endpoint",
|
| 178 |
+
"ot_controller",
|
| 179 |
+
"server",
|
| 180 |
+
"workstation"
|
| 181 |
+
],
|
| 182 |
+
"dest_device_type": [
|
| 183 |
+
"cloud_service",
|
| 184 |
+
"iot_device",
|
| 185 |
+
"mobile_endpoint",
|
| 186 |
+
"ot_controller",
|
| 187 |
+
"server",
|
| 188 |
+
"workstation"
|
| 189 |
+
],
|
| 190 |
+
"segment_type": [
|
| 191 |
+
"cloud_workload",
|
| 192 |
+
"corporate_lan",
|
| 193 |
+
"data_centre_spine",
|
| 194 |
+
"dmz_perimeter",
|
| 195 |
+
"endpoint_fleet",
|
| 196 |
+
"guest_wifi",
|
| 197 |
+
"ot_ics_control_network",
|
| 198 |
+
"soc_management_plane",
|
| 199 |
+
"zero_trust_segment"
|
| 200 |
+
],
|
| 201 |
+
"firewall_policy": [
|
| 202 |
+
"default_deny",
|
| 203 |
+
"open_permissive",
|
| 204 |
+
"stateful_inspection",
|
| 205 |
+
"strict_allowlist",
|
| 206 |
+
"zone_based"
|
| 207 |
+
],
|
| 208 |
+
"qos_policy": [
|
| 209 |
+
"best_effort",
|
| 210 |
+
"dscp_expedited",
|
| 211 |
+
"none",
|
| 212 |
+
"priority_queue",
|
| 213 |
+
"weighted_fair_queue"
|
| 214 |
+
],
|
| 215 |
+
"defender_architecture": [
|
| 216 |
+
"autoencoder_anomaly",
|
| 217 |
+
"ensemble_stacked",
|
| 218 |
+
"gradient_boosted_tree",
|
| 219 |
+
"isolation_forest",
|
| 220 |
+
"lstm_behavioural",
|
| 221 |
+
"neural_network_dense",
|
| 222 |
+
"rule_based_threshold",
|
| 223 |
+
"transformer_sequence"
|
| 224 |
+
]
|
| 225 |
+
},
|
| 226 |
+
"label_to_int": {
|
| 227 |
+
"BENIGN": 0,
|
| 228 |
+
"MALICIOUS": 1,
|
| 229 |
+
"AMBIGUOUS": 2
|
| 230 |
+
},
|
| 231 |
+
"int_to_label": {
|
| 232 |
+
"0": "BENIGN",
|
| 233 |
+
"1": "MALICIOUS",
|
| 234 |
+
"2": "AMBIGUOUS"
|
| 235 |
+
}
|
| 236 |
+
}
|
feature_scaler.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"mean": [33246.74978063761, 3092.6561860193037, 3568.162913132495, 40.45100906697865, 37.033050599590524, 27141.939894706054, 18373.229745539633, 681.214243930974, 309.9934191284001, 498.80652237496344, 259.9314126937701, 42512.715467973096, 88.85309593448376, 6202.614864434045, 1564.1459230769228, 0.30622989178122256, 5.673588768645803, 0.02749341912840012, 0.13688212927756654, 0.4839134249780638, 0.006580871599883006, 0.0337818075460661, 0.020181339572974553, 0.04591985960807254, 4.918210324656333, 0.03811861655454812, 0.12445159403334308, 0.03568294823047675, 0.2893867358876865, 0.5161035682948231, 109.0, 884.3067198011114, 0.5624451594033343, 0.7456058496636443, 1.3842245978356242, 70.62840011699328, 0.6860095349517403, 47.037876572097105, 4.353758408891489, 975.8917812225797, 0.5939957658598679, 4.050707090716325, 681.214243930974, 0.007334989923027464, 0.3969841932563371, 12.341620356829482, 0.2244808423515648, 0.8253875402164376, 0.2535829189821585, 0.08028663351857268, 0.0987130739982451, 0.18909037730330505, 0.08730622989178122, 0.10266159695817491, 0.03231939163498099, 0.2244808423515648, 0.18514185434337527, 0.11011991810470897, 0.11085112606025153, 0.4928341620356829, 0.1440479672418836, 0.14214682655747293, 0.1830944720678561, 0.1506288388417666, 0.1646680315881837, 0.15750219362386664, 0.1858730622989178, 0.15823340157940918, 0.16276689090377303, 0.1975723895875987, 0.14360924246855805, 0.15691722725943258, 0.14229306814858145, 0.19684118163205616, 0.0962269669494004, 0.09812810763381105, 0.09213220239836209, 0.09534951740274934, 0.1130447499268792, 0.11801696402456859, 0.1414156186019304, 0.10485522082480257, 0.14083065223749636, 0.19684118163205616, 0.20605440187189236, 0.1715413863702837, 0.23062298917812227, 0.1949400409476455, 0.12035682948230476, 0.2298917812225797, 0.1971336648142732, 0.23866627668909038, 0.213951447791752, 0.0985668324071366, 0.12708394267329629, 0.20108218777420298, 0.15545481134834746, 0.03860778005264697, 
0.18207078093009652, 0.10836501901140684, 0.08876864580286634], "std": [18749.233853911337, 9865.164126780137, 4190.513917198199, 70.31297496929587, 55.68108028369978, 52722.78701229442, 33694.402383674795, 335.5644018194022, 49.512342840830556, 270.8822397388281, 40.15262463355536, 244466.99124808173, 651.6831675402408, 42653.08093220378, 11932.405984612855, 1.9458350117686587, 12.533888245900245, 0.1635281068930336, 0.5531404520192984, 1.6803909327338207, 0.08086111508275948, 0.18068030090744233, 0.14063052768856918, 0.20932662052918952, 1.0708323782832943, 0.026009753855312713, 0.39246443174561263, 0.18551201658587205, 0.2398908556875923, 0.21862773256336868, 1.0, 863.9054990654234, 0.49612155514020645, 0.11670762081450198, 0.18533417502330754, 28.409299682046896, 0.16902662104519858, 23.189955831779056, 2.311340124434154, 542.2224222337586, 0.18671542343860886, 16.074926614793874, 335.5644018194022, 0.05411044984657537, 0.16080068264645078, 6.561931141230041, 0.4172704837070426, 0.37966304603421225, 0.4350934458689722, 0.2717563065620617, 0.2982981995627756, 0.3916090317893234, 0.2823039264898729, 0.30353857668308043, 0.1768598962792337, 0.4172704837070426, 0.3884410045076719, 0.31306206184383634, 0.3139706515776878, 0.4999852087822069, 0.3511640419097188, 0.3492262042384573, 0.3867722366583929, 0.3577128801235108, 0.37090779150009945, 0.36430023473877865, 0.3890326466363804, 0.3649864021965326, 0.3691798504061366, 0.3981968465890802, 0.3507187137731729, 0.363749310369518, 0.3493760176096391, 0.39764035792703506, 0.29492381708363086, 0.2975095397646019, 0.28923363165412663, 0.2937185783775046, 0.3166706484526433, 0.3226518008451023, 0.34847525057745316, 0.3063891835921014, 0.3478722137010598, 0.39764035792703506, 0.40449958390284957, 0.37700891937335396, 0.42126236286042335, 0.3961835126103755, 0.3254021329010349, 0.4207938269197238, 0.3978632080793417, 0.42629949768833464, 0.4101229373439798, 0.2981013378512827, 0.3330913383277822, 0.40083866880603014, 
0.3623642030345733, 0.19267238579308466, 0.38593107324064563, 0.3108635937093521, 0.28443031546987846]}
|
inference_example.ipynb
ADDED
|
@@ -0,0 +1,343 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# CYB001 Baseline Classifier — Inference Example\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"End-to-end demo: load the trained XGBoost and PyTorch MLP models from the Hugging Face repo and predict on a new flow record.\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"**Models predict one of three labels:** `BENIGN`, `MALICIOUS`, or `AMBIGUOUS`.\n",
|
| 12 |
+
"\n",
|
| 13 |
+
"**This is a baseline reference model**, not a production IDS. See the model card for full limitations."
|
| 14 |
+
]
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"cell_type": "markdown",
|
| 18 |
+
"metadata": {},
|
| 19 |
+
"source": [
|
| 20 |
+
"## 1. Install dependencies"
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"cell_type": "code",
|
| 25 |
+
"execution_count": null,
|
| 26 |
+
"metadata": {},
|
| 27 |
+
"outputs": [],
|
| 28 |
+
"source": [
|
| 29 |
+
"%pip install --quiet xgboost torch safetensors pandas numpy huggingface_hub"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"cell_type": "markdown",
|
| 34 |
+
"metadata": {},
|
| 35 |
+
"source": [
|
| 36 |
+
"## 2. Download model artifacts from Hugging Face\n",
|
| 37 |
+
"\n",
|
| 38 |
+
"Five files are needed:\n",
|
| 39 |
+
"- `model_xgb.json` — XGBoost weights\n",
|
| 40 |
+
"- `model_mlp.safetensors` — PyTorch MLP weights\n",
|
| 41 |
+
"- `feature_engineering.py` — feature pipeline (must match the one used at training)\n",
|
| 42 |
+
"- `feature_meta.json` — feature column order + categorical levels\n",
|
| 43 |
+
"- `feature_scaler.json` — MLP input standardization (mean / std)"
|
| 44 |
+
]
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"cell_type": "code",
|
| 48 |
+
"execution_count": null,
|
| 49 |
+
"metadata": {},
|
| 50 |
+
"outputs": [],
|
| 51 |
+
"source": [
|
| 52 |
+
"from huggingface_hub import hf_hub_download\n",
|
| 53 |
+
"\n",
|
| 54 |
+
"REPO_ID = \"xpertsystems/cyb001-baseline-classifier\"\n",
|
| 55 |
+
"\n",
|
| 56 |
+
"files = {}\n",
|
| 57 |
+
"for name in [\"model_xgb.json\", \"model_mlp.safetensors\",\n",
|
| 58 |
+
" \"feature_engineering.py\", \"feature_meta.json\",\n",
|
| 59 |
+
" \"feature_scaler.json\"]:\n",
|
| 60 |
+
" files[name] = hf_hub_download(repo_id=REPO_ID, filename=name)\n",
|
| 61 |
+
" print(f\" downloaded: {name}\")"
|
| 62 |
+
]
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"cell_type": "code",
|
| 66 |
+
"execution_count": null,
|
| 67 |
+
"metadata": {},
|
| 68 |
+
"outputs": [],
|
| 69 |
+
"source": [
|
| 70 |
+
"# Make feature_engineering.py importable\n",
|
| 71 |
+
"import sys, shutil, os\n",
|
| 72 |
+
"fe_dir = os.path.dirname(files[\"feature_engineering.py\"])\n",
|
| 73 |
+
"if fe_dir not in sys.path:\n",
|
| 74 |
+
" sys.path.insert(0, fe_dir)\n",
|
| 75 |
+
"\n",
|
| 76 |
+
"from feature_engineering import transform_single, load_meta, INT_TO_LABEL"
|
| 77 |
+
]
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"cell_type": "markdown",
|
| 81 |
+
"metadata": {},
|
| 82 |
+
"source": [
|
| 83 |
+
"## 3. Load models and metadata"
|
| 84 |
+
]
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"cell_type": "code",
|
| 88 |
+
"execution_count": null,
|
| 89 |
+
"metadata": {},
|
| 90 |
+
"outputs": [],
|
| 91 |
+
"source": [
|
| 92 |
+
"import json\n",
|
| 93 |
+
"import numpy as np\n",
|
| 94 |
+
"import torch\n",
|
| 95 |
+
"import torch.nn as nn\n",
|
| 96 |
+
"import xgboost as xgb\n",
|
| 97 |
+
"from safetensors.torch import load_file\n",
|
| 98 |
+
"\n",
|
| 99 |
+
"# --- Metadata ---\n",
|
| 100 |
+
"meta = load_meta(files[\"feature_meta.json\"])\n",
|
| 101 |
+
"with open(files[\"feature_scaler.json\"]) as f:\n",
|
| 102 |
+
" scaler = json.load(f)\n",
|
| 103 |
+
"\n",
|
| 104 |
+
"N_FEATURES = len(meta[\"feature_names\"])\n",
|
| 105 |
+
"print(f\"feature count: {N_FEATURES}\")\n",
|
| 106 |
+
"print(f\"label classes: {list(meta['int_to_label'].values())}\")"
|
| 107 |
+
]
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"cell_type": "code",
|
| 111 |
+
"execution_count": null,
|
| 112 |
+
"metadata": {},
|
| 113 |
+
"outputs": [],
|
| 114 |
+
"source": [
|
| 115 |
+
"# --- XGBoost ---\n",
|
| 116 |
+
"xgb_model = xgb.XGBClassifier()\n",
|
| 117 |
+
"xgb_model.load_model(files[\"model_xgb.json\"])\n",
|
| 118 |
+
"\n",
|
| 119 |
+
"# --- MLP architecture (must match training) ---\n",
|
| 120 |
+
"class FlowMLP(nn.Module):\n",
|
| 121 |
+
" def __init__(self, n_features, n_classes=3, hidden1=128, hidden2=64, dropout=0.3):\n",
|
| 122 |
+
" super().__init__()\n",
|
| 123 |
+
" self.net = nn.Sequential(\n",
|
| 124 |
+
" nn.Linear(n_features, hidden1),\n",
|
| 125 |
+
" nn.BatchNorm1d(hidden1),\n",
|
| 126 |
+
" nn.ReLU(),\n",
|
| 127 |
+
" nn.Dropout(dropout),\n",
|
| 128 |
+
" nn.Linear(hidden1, hidden2),\n",
|
| 129 |
+
" nn.BatchNorm1d(hidden2),\n",
|
| 130 |
+
" nn.ReLU(),\n",
|
| 131 |
+
" nn.Dropout(dropout),\n",
|
| 132 |
+
" nn.Linear(hidden2, n_classes),\n",
|
| 133 |
+
" )\n",
|
| 134 |
+
" def forward(self, x):\n",
|
| 135 |
+
" return self.net(x)\n",
|
| 136 |
+
"\n",
|
| 137 |
+
"mlp_model = FlowMLP(N_FEATURES)\n",
|
| 138 |
+
"mlp_model.load_state_dict(load_file(files[\"model_mlp.safetensors\"]))\n",
|
| 139 |
+
"mlp_model.eval()\n",
|
| 140 |
+
"print(\"models loaded\")"
|
| 141 |
+
]
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"cell_type": "markdown",
|
| 145 |
+
"metadata": {},
|
| 146 |
+
"source": [
|
| 147 |
+
"## 4. Define a prediction function"
|
| 148 |
+
]
|
| 149 |
+
},
|
| 150 |
+
{
|
| 151 |
+
"cell_type": "code",
|
| 152 |
+
"execution_count": null,
|
| 153 |
+
"metadata": {},
|
| 154 |
+
"outputs": [],
|
| 155 |
+
"source": [
|
| 156 |
+
"MU = np.array(scaler[\"mean\"], dtype=np.float32)\n",
|
| 157 |
+
"SD = np.array(scaler[\"std\"], dtype=np.float32)\n",
|
| 158 |
+
"\n",
|
| 159 |
+
"def predict_flow(record: dict) -> dict:\n",
|
| 160 |
+
" \"\"\"\n",
|
| 161 |
+
" Predict the label for one flow record. `record` is a dict containing\n",
|
| 162 |
+
" the fields described in the model card's 'Input schema' section.\n",
|
| 163 |
+
"\n",
|
| 164 |
+
" Returns a dict with both models' predictions and per-class probabilities.\n",
|
| 165 |
+
" \"\"\"\n",
|
| 166 |
+
" X = transform_single(record, meta)\n",
|
| 167 |
+
"\n",
|
| 168 |
+
" # XGBoost\n",
|
| 169 |
+
" xgb_proba = xgb_model.predict_proba(X)[0]\n",
|
| 170 |
+
" xgb_label = INT_TO_LABEL[int(np.argmax(xgb_proba))]\n",
|
| 171 |
+
"\n",
|
| 172 |
+
" # MLP\n",
|
| 173 |
+
" Xs = ((X - MU) / SD).astype(np.float32)\n",
|
| 174 |
+
" with torch.no_grad():\n",
|
| 175 |
+
" logits = mlp_model(torch.tensor(Xs))\n",
|
| 176 |
+
" mlp_proba = torch.softmax(logits, dim=1).numpy()[0]\n",
|
| 177 |
+
" mlp_label = INT_TO_LABEL[int(np.argmax(mlp_proba))]\n",
|
| 178 |
+
"\n",
|
| 179 |
+
" return {\n",
|
| 180 |
+
" \"xgboost\": {\n",
|
| 181 |
+
" \"label\": xgb_label,\n",
|
| 182 |
+
" \"probabilities\": {INT_TO_LABEL[i]: float(p) for i, p in enumerate(xgb_proba)},\n",
|
| 183 |
+
" },\n",
|
| 184 |
+
" \"mlp\": {\n",
|
| 185 |
+
" \"label\": mlp_label,\n",
|
| 186 |
+
" \"probabilities\": {INT_TO_LABEL[i]: float(p) for i, p in enumerate(mlp_proba)},\n",
|
| 187 |
+
" },\n",
|
| 188 |
+
" }"
|
| 189 |
+
]
|
| 190 |
+
},
|
| 191 |
+
{
|
| 192 |
+
"cell_type": "markdown",
|
| 193 |
+
"metadata": {},
|
| 194 |
+
"source": [
|
| 195 |
+
"## 5. Run on an example record\n",
|
| 196 |
+
"\n",
|
| 197 |
+
"The fields below are the union of the `network_flows.csv` columns, the subset of `session_summary.csv` columns joined on `session_id`, and the `network_topology.csv` columns joined on `segment_id`. In a real deployment you would assemble these by joining a new flow against your session-summary store and your topology lookup.\n",
|
| 198 |
+
"\n",
|
| 199 |
+
"This example is a real `BENIGN` HTTPS flow lifted from the sample dataset (workstation → cloud service, port 443). Both models should predict `BENIGN`."
|
| 200 |
+
]
|
| 201 |
+
},
|
| 202 |
+
{
|
| 203 |
+
"cell_type": "code",
|
| 204 |
+
"execution_count": null,
|
| 205 |
+
"metadata": {},
|
| 206 |
+
"outputs": [],
|
| 207 |
+
"source": [
|
| 208 |
+
"# A real BENIGN HTTPS flow from the sample dataset.\n",
|
| 209 |
+
"# Workstation -> cloud service, port 443, mid-day. Both models should\n",
|
| 210 |
+
"# agree on BENIGN. If you hand-construct records, expect occasional\n",
|
| 211 |
+
"# disagreement between XGBoost and MLP on out-of-distribution inputs -\n",
|
| 212 |
+
"# disagreement is itself a useful signal; see note below.\n",
|
| 213 |
+
"example_record = {\n",
|
| 214 |
+
" # ---- flow-level fields ----\n",
|
| 215 |
+
" \"source_port\": 52789, \"dest_port\": 443, \"protocol\": \"HTTPS\",\n",
|
| 216 |
+
" \"flow_start_timestamp\": \"2024-01-20 13:27:58.967\",\n",
|
| 217 |
+
" \"flow_duration_ms\": 535,\n",
|
| 218 |
+
" \"total_fwd_packets\": 37, \"total_bwd_packets\": 30,\n",
|
| 219 |
+
" \"total_bytes_fwd\": 17020, \"total_bytes_bwd\": 23310,\n",
|
| 220 |
+
" \"fwd_packet_len_mean\": 460, \"fwd_packet_len_std\": 296,\n",
|
| 221 |
+
" \"bwd_packet_len_mean\": 777, \"bwd_packet_len_std\": 226,\n",
|
| 222 |
+
" \"flow_bytes_per_sec\": 75383.18, \"flow_packets_per_sec\": 125.23,\n",
|
| 223 |
+
" \"inter_arrival_time_mean\": 20.618, \"inter_arrival_time_std\": 8.457,\n",
|
| 224 |
+
" \"tcp_flag_syn_count\": 0, \"tcp_flag_ack_count\": 0, \"tcp_flag_fin_count\": 0,\n",
|
| 225 |
+
" \"tcp_flag_rst_count\": 0, \"tcp_flag_psh_count\": 0, \"tcp_flag_urg_count\": 0,\n",
|
| 226 |
+
" \"flow_lifecycle_phase\": \"protocol_handshake\",\n",
|
| 227 |
+
" \"source_device_type\": \"workstation\", \"dest_device_type\": \"cloud_service\",\n",
|
| 228 |
+
" \"retransmission_flag\": 0, \"fragmentation_flag\": 0, \"protocol_violation_flag\": 0,\n",
|
| 229 |
+
"\n",
|
| 230 |
+
" # ---- session-level fields (from session_summary.csv join) ----\n",
|
| 231 |
+
" \"payload_entropy_mean\": 3.6328,\n",
|
| 232 |
+
" \"retransmission_rate\": 0.0631,\n",
|
| 233 |
+
" \"protocol_violation_count\": 0,\n",
|
| 234 |
+
" \"c2_beacon_flag\": 0,\n",
|
| 235 |
+
" \"session_risk_score\": 0.1866,\n",
|
| 236 |
+
"\n",
|
| 237 |
+
" # ---- topology fields (from network_topology.csv join) ----\n",
|
| 238 |
+
" \"segment_type\": \"corporate_lan\",\n",
|
| 239 |
+
" \"trust_level\": 0.6027, \"avg_concurrent_flows\": 109, \"bandwidth_mbps\": 671.0,\n",
|
| 240 |
+
" \"nat_enabled\": 1, \"ids_coverage\": 0.8253, \"diurnal_peak_factor\": 1.6239,\n",
|
| 241 |
+
" \"feature_space_dim\": 107, \"alert_threshold\": 0.3089,\n",
|
| 242 |
+
" \"retraining_cadence_days\": 39, \"ensemble_size\": 1, \"device_count\": 302,\n",
|
| 243 |
+
" \"firewall_policy\": \"zone_based\", \"qos_policy\": \"best_effort\",\n",
|
| 244 |
+
" \"defender_architecture\": \"lstm_behavioural\",\n",
|
| 245 |
+
"}\n",
|
| 246 |
+
"\n",
|
| 247 |
+
"result = predict_flow(example_record)\n",
|
| 248 |
+
"\n",
|
| 249 |
+
"print(f\"XGBoost -> {result['xgboost']['label']}\")\n",
|
| 250 |
+
"for lbl, p in result['xgboost']['probabilities'].items():\n",
|
| 251 |
+
" print(f\" P({lbl}) = {p:.4f}\")\n",
|
| 252 |
+
"\n",
|
| 253 |
+
"print(f\"\\nMLP -> {result['mlp']['label']}\")\n",
|
| 254 |
+
"for lbl, p in result['mlp']['probabilities'].items():\n",
|
| 255 |
+
" print(f\" P({lbl}) = {p:.4f}\")"
|
| 256 |
+
]
|
| 257 |
+
},
|
| 258 |
+
{
|
| 259 |
+
"cell_type": "markdown",
|
| 260 |
+
"metadata": {},
|
| 261 |
+
"source": [
|
| 262 |
+
"### Note: when the two models disagree\n",
|
| 263 |
+
"\n",
|
| 264 |
+
"XGBoost and the MLP can disagree on out-of-distribution records — particularly hand-crafted inputs whose feature combinations don't lie on the training-data manifold. The MLP, with BatchNorm and only ~7k training rows, has narrower competence than the tree ensemble. Disagreement is itself a useful triage signal: in a production pipeline you would surface those flows for human review rather than auto-act on either prediction.\n",
|
| 265 |
+
"\n",
|
| 266 |
+
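"On in-distribution records (e.g. real rows from the sample CSV, such as those scored in section 6 below) the two models agree on >99% of cases. Note that section 6 tabulates only the XGBoost predictions; compare `out['mlp']['label']` with `out['xgboost']['label']` inside its loop if you want to measure agreement yourself."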
|
| 267 |
+
]
|
| 268 |
+
},
|
| 269 |
+
{
|
| 270 |
+
"cell_type": "markdown",
|
| 271 |
+
"metadata": {},
|
| 272 |
+
"source": [
|
| 273 |
+
"## 6. Batch prediction on the sample dataset"
|
| 274 |
+
]
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"cell_type": "code",
|
| 278 |
+
"execution_count": null,
|
| 279 |
+
"metadata": {},
|
| 280 |
+
"outputs": [],
|
| 281 |
+
"source": [
|
| 282 |
+
"from huggingface_hub import snapshot_download\n",
|
| 283 |
+
"import pandas as pd\n",
|
| 284 |
+
"\n",
|
| 285 |
+
"# Pull the sample dataset CSVs\n",
|
| 286 |
+
"ds_path = snapshot_download(repo_id=\"xpertsystems/cyb001-sample\", repo_type=\"dataset\")\n",
|
| 287 |
+
"\n",
|
| 288 |
+
"flows = pd.read_csv(f\"{ds_path}/network_flows.csv\")\n",
|
| 289 |
+
"sessions = pd.read_csv(f\"{ds_path}/session_summary.csv\")\n",
|
| 290 |
+
"topology = pd.read_csv(f\"{ds_path}/network_topology.csv\")\n",
|
| 291 |
+
"\n",
|
| 292 |
+
"# Drop leaky columns the model was never trained on\n",
|
| 293 |
+
"flows = flows.drop(columns=[\"traffic_category\", \"attack_subcategory\",\n",
|
| 294 |
+
" \"attacker_capability_tier\"], errors=\"ignore\")\n",
|
| 295 |
+
"\n",
|
| 296 |
+
"# Build the same enriched frame the training pipeline used\n",
|
| 297 |
+
"enriched = flows.merge(\n",
|
| 298 |
+
" sessions[[\"session_id\", \"payload_entropy_mean\", \"retransmission_rate\",\n",
|
| 299 |
+
" \"protocol_violation_count\", \"c2_beacon_flag\", \"session_risk_score\"]],\n",
|
| 300 |
+
" on=\"session_id\", how=\"left\",\n",
|
| 301 |
+
").merge(topology, on=\"segment_id\", how=\"left\")\n",
|
| 302 |
+
"\n",
|
| 303 |
+
"# Score the first 200 rows\n",
|
| 304 |
+
"sample = enriched.head(200).copy()\n",
|
| 305 |
+
"preds = []\n",
|
| 306 |
+
"for _, row in sample.iterrows():\n",
|
| 307 |
+
" out = predict_flow(row.to_dict())\n",
|
| 308 |
+
" preds.append(out[\"xgboost\"][\"label\"])\n",
|
| 309 |
+
"\n",
|
| 310 |
+
"sample[\"xgb_pred\"] = preds\n",
|
| 311 |
+
"\n",
|
| 312 |
+
"# Confusion vs ground-truth label\n",
|
| 313 |
+
"ct = pd.crosstab(sample[\"label\"], sample[\"xgb_pred\"], rownames=[\"true\"], colnames=[\"pred\"])\n",
|
| 314 |
+
"print(\"Confusion on first 200 sample rows (XGBoost):\")\n",
|
| 315 |
+
"print(ct)"
|
| 316 |
+
]
|
| 317 |
+
},
|
| 318 |
+
{
|
| 319 |
+
"cell_type": "markdown",
|
| 320 |
+
"metadata": {},
|
| 321 |
+
"source": [
|
| 322 |
+
"## 7. Next steps\n",
|
| 323 |
+
"\n",
|
| 324 |
+
"- See `validation_results.json` for full test-set metrics and architecture details.\n",
|
| 325 |
+
"- The high accuracy is a property of calibrated synthetic data — see the model card's **Limitations** section before extrapolating to production traffic.\n",
|
| 326 |
+
"- For the full 685k-row CYB001 dataset and commercial licensing, contact **pradeep@xpertsystems.ai**."
|
| 327 |
+
]
|
| 328 |
+
}
|
| 329 |
+
],
|
| 330 |
+
"metadata": {
|
| 331 |
+
"kernelspec": {
|
| 332 |
+
"display_name": "Python 3",
|
| 333 |
+
"language": "python",
|
| 334 |
+
"name": "python3"
|
| 335 |
+
},
|
| 336 |
+
"language_info": {
|
| 337 |
+
"name": "python",
|
| 338 |
+
"version": "3.10"
|
| 339 |
+
}
|
| 340 |
+
},
|
| 341 |
+
"nbformat": 4,
|
| 342 |
+
"nbformat_minor": 5
|
| 343 |
+
}
|
model_mlp.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b39cd3df3edad09a9b7e41e9adb97f81c8abab37aa5e2c511e53602c74868c0
|
| 3 |
+
size 90324
|
model_xgb.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
validation_results.json
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "1.0.0",
|
| 3 |
+
"dataset": "xpertsystems/cyb001-sample",
|
| 4 |
+
"split": {
|
| 5 |
+
"train": 6838,
|
| 6 |
+
"validation": 1466,
|
| 7 |
+
"test": 1466,
|
| 8 |
+
"strategy": "stratified",
|
| 9 |
+
"seed": 42
|
| 10 |
+
},
|
| 11 |
+
"n_features": 101,
|
| 12 |
+
"label_classes": [
|
| 13 |
+
"BENIGN",
|
| 14 |
+
"MALICIOUS",
|
| 15 |
+
"AMBIGUOUS"
|
| 16 |
+
],
|
| 17 |
+
"class_distribution_train": {
|
| 18 |
+
"BENIGN": 4915,
|
| 19 |
+
"MALICIOUS": 1379,
|
| 20 |
+
"AMBIGUOUS": 544
|
| 21 |
+
},
|
| 22 |
+
"class_distribution_test": {
|
| 23 |
+
"BENIGN": 1054,
|
| 24 |
+
"MALICIOUS": 295,
|
| 25 |
+
"AMBIGUOUS": 117
|
| 26 |
+
},
|
| 27 |
+
"models": {
|
| 28 |
+
"xgboost": {
|
| 29 |
+
"architecture": "Gradient-boosted decision trees, multi:softprob, 3 classes",
|
| 30 |
+
"framework": "xgboost",
|
| 31 |
+
"test_metrics": {
|
| 32 |
+
"model": "xgboost",
|
| 33 |
+
"accuracy": 0.9979536152796725,
|
| 34 |
+
"macro_f1": 0.9961123729105247,
|
| 35 |
+
"weighted_f1": 0.9979537067605843,
|
| 36 |
+
"per_class_f1": {
|
| 37 |
+
"BENIGN": 0.9985761746559089,
|
| 38 |
+
"MALICIOUS": 0.9983079526226735,
|
| 39 |
+
"AMBIGUOUS": 0.9914529914529915
|
| 40 |
+
},
|
| 41 |
+
"confusion_matrix": {
|
| 42 |
+
"labels": [
|
| 43 |
+
"BENIGN",
|
| 44 |
+
"MALICIOUS",
|
| 45 |
+
"AMBIGUOUS"
|
| 46 |
+
],
|
| 47 |
+
"matrix": [
|
| 48 |
+
[
|
| 49 |
+
1052,
|
| 50 |
+
1,
|
| 51 |
+
1
|
| 52 |
+
],
|
| 53 |
+
[
|
| 54 |
+
0,
|
| 55 |
+
295,
|
| 56 |
+
0
|
| 57 |
+
],
|
| 58 |
+
[
|
| 59 |
+
1,
|
| 60 |
+
0,
|
| 61 |
+
116
|
| 62 |
+
]
|
| 63 |
+
]
|
| 64 |
+
},
|
| 65 |
+
"macro_roc_auc_ovr": 0.9999888611978185
|
| 66 |
+
}
|
| 67 |
+
},
|
| 68 |
+
"mlp": {
|
| 69 |
+
"architecture": "PyTorch MLP, 101 -> 128 -> 64 -> 3, BatchNorm1d + ReLU + Dropout, weighted cross-entropy loss",
|
| 70 |
+
"framework": "pytorch",
|
| 71 |
+
"test_metrics": {
|
| 72 |
+
"model": "mlp",
|
| 73 |
+
"accuracy": 0.9931787175989086,
|
| 74 |
+
"macro_f1": 0.9868796182274947,
|
| 75 |
+
"weighted_f1": 0.9931977860171972,
|
| 76 |
+
"per_class_f1": {
|
| 77 |
+
"BENIGN": 0.9961977186311787,
|
| 78 |
+
"MALICIOUS": 0.9898648648648649,
|
| 79 |
+
"AMBIGUOUS": 0.9745762711864406
|
| 80 |
+
},
|
| 81 |
+
"confusion_matrix": {
|
| 82 |
+
"labels": [
|
| 83 |
+
"BENIGN",
|
| 84 |
+
"MALICIOUS",
|
| 85 |
+
"AMBIGUOUS"
|
| 86 |
+
],
|
| 87 |
+
"matrix": [
|
| 88 |
+
[
|
| 89 |
+
1048,
|
| 90 |
+
2,
|
| 91 |
+
4
|
| 92 |
+
],
|
| 93 |
+
[
|
| 94 |
+
2,
|
| 95 |
+
293,
|
| 96 |
+
0
|
| 97 |
+
],
|
| 98 |
+
[
|
| 99 |
+
0,
|
| 100 |
+
2,
|
| 101 |
+
115
|
| 102 |
+
]
|
| 103 |
+
]
|
| 104 |
+
},
|
| 105 |
+
"macro_roc_auc_ovr": 0.9995571752214697
|
| 106 |
+
}
|
| 107 |
+
}
|
| 108 |
+
}
|
| 109 |
+
}
|