Reynier committed on
Commit
df826c3
·
verified ·
1 Parent(s): 4a34b48

Upload model.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. model.py +130 -0
model.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ DGA-Logit: TF-IDF + Lexical Features + Logistic Regression for DGA detection.
3
+ Trained on 54 DGA families (~845K samples).
4
+ artifacts.joblib contains: {'model': LR, 'vectorizer': TF-IDF, 'scaler': StandardScaler}
5
+ """
6
+ import re
7
+ import math
8
+ import numpy as np
9
+ import pandas as pd
10
+ from collections import Counter
11
+ from urllib.parse import urlparse
12
+
13
+ VOWELS = set('aeiou')
14
+ CONSONANTS = set('bcdfghjklmnpqrstvwxyz')
15
+ DOMAIN_RE = re.compile(r'^[a-z0-9.-]+$')
16
+ LETTER_RE = re.compile(r'[a-z]')
17
+ DIGIT_RE = re.compile(r'[0-9]')
18
+
19
+
20
+ def _hostname_from_url(value: str) -> str:
21
+ parsed = urlparse(value)
22
+ if parsed.netloc:
23
+ return parsed.netloc
24
+ if parsed.scheme and parsed.path:
25
+ return parsed.path
26
+ return value
27
+
28
+
29
+ def normalize_domain(value: str) -> str:
30
+ domain = str(value).strip().lower()
31
+ domain = _hostname_from_url(domain)
32
+ domain = domain.split('@')[-1].split('/')[0].split(':')[0].rstrip('.')
33
+ domain = re.sub(r'\s+', '', domain)
34
+ if not DOMAIN_RE.match(domain):
35
+ domain = re.sub(r'[^a-z0-9.-]', '', domain)
36
+ return domain
37
+
38
+
39
+ def shannon_entropy(value: str) -> float:
40
+ if not value:
41
+ return 0.0
42
+ counts = Counter(value)
43
+ total = len(value)
44
+ return -sum((n / total) * math.log2(n / total) for n in counts.values())
45
+
46
+
47
+ def _max_run(value: str, matcher) -> int:
48
+ best = current = 0
49
+ for ch in value:
50
+ if matcher.match(ch):
51
+ current += 1
52
+ best = max(best, current)
53
+ else:
54
+ current = 0
55
+ return best
56
+
57
+
58
+ def _split_parts(domain: str):
59
+ parts = [p for p in domain.split('.') if p]
60
+ if not parts:
61
+ return '', ''
62
+ sld = parts[-2] if len(parts) >= 2 else parts[-1]
63
+ return sld, parts[-1]
64
+
65
+
66
+ def _extract_lexical_features(domains: pd.Series) -> np.ndarray:
67
+ rows = []
68
+ for value in domains:
69
+ domain = normalize_domain(value)
70
+ sld, tld = _split_parts(domain)
71
+ letters = [c for c in domain if c.isalpha()]
72
+ digits = [c for c in domain if c.isdigit()]
73
+ chars = [c for c in domain if c.isalnum()]
74
+ vowel_count = sum(1 for c in letters if c in VOWELS)
75
+ consonant_count = sum(1 for c in letters if c in CONSONANTS)
76
+ length = max(len(domain), 1)
77
+ rows.append([
78
+ len(domain),
79
+ len(sld),
80
+ len(tld),
81
+ max(domain.count('.') - 1, 0),
82
+ len(digits) / length,
83
+ vowel_count / length,
84
+ consonant_count / length,
85
+ (len(set(chars)) / max(len(chars), 1)) if chars else 0.0,
86
+ domain.count('-'),
87
+ domain.count('.'),
88
+ _max_run(domain, DIGIT_RE),
89
+ _max_run(domain, LETTER_RE),
90
+ shannon_entropy(domain),
91
+ float(domain[:1].isdigit()),
92
+ float(domain[-1:].isdigit()),
93
+ ])
94
+ return np.asarray(rows, dtype=float)
95
+
96
+
97
+ def load_model(artifacts_path: str):
98
+ """Load artifacts dict from joblib file."""
99
+ import joblib
100
+ return joblib.load(artifacts_path)
101
+
102
+
103
+ def predict(artifacts, domains):
104
+ """
105
+ Predict DGA vs legit for a list of domain strings.
106
+ artifacts: dict with keys 'model', 'vectorizer', 'scaler'
107
+ Returns list of dicts: [{"domain": ..., "label": "dga"/"legit", "score": float}]
108
+ """
109
+ from scipy import sparse
110
+
111
+ if isinstance(domains, str):
112
+ domains = [domains]
113
+
114
+ model = artifacts['model']
115
+ vectorizer = artifacts['vectorizer']
116
+ scaler = artifacts['scaler']
117
+
118
+ series = pd.Series(domains)
119
+ domains_norm = series.map(normalize_domain)
120
+ X_tfidf = vectorizer.transform(domains_norm)
121
+ X_lex = scaler.transform(_extract_lexical_features(series))
122
+ X = sparse.hstack([X_tfidf, sparse.csr_matrix(X_lex)], format='csr')
123
+
124
+ scores = model.predict_proba(X)[:, 1]
125
+ preds = (scores >= 0.5).astype(int)
126
+
127
+ return [
128
+ {"domain": d, "label": "dga" if p == 1 else "legit", "score": round(float(s), 4)}
129
+ for d, p, s in zip(domains, preds, scores)
130
+ ]