devorbit committed on
Commit
26e1c2e
·
0 Parent(s):

Initial deployment - secrets removed

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .dockerignore +18 -0
  2. .gitattributes +3 -0
  3. .gitignore +11 -0
  4. DEPLOY_GUIDE.md +103 -0
  5. Dockerfile +49 -0
  6. MurshidBackend_Colab.ipynb +967 -0
  7. MurshidBackend_Colab_Report.md +545 -0
  8. Needed/murshid_label_columns.json +22 -0
  9. Needed/murshid_logreg_pipeline_manual_oof_pcatuned.joblib +3 -0
  10. Needed/murshid_logreg_thresholds_manual_oof_pcatuned.npy +3 -0
  11. Needed/murshid_query_template_structure_clean_shared.xlsx +3 -0
  12. README.md +39 -0
  13. murshid_backend/README.md +156 -0
  14. murshid_backend/TECHNICAL_REPORT.md +322 -0
  15. murshid_backend/alembic.ini +38 -0
  16. murshid_backend/alembic/env.py +52 -0
  17. murshid_backend/alembic/script.py.mako +25 -0
  18. murshid_backend/alembic/versions/0001_initial_schema.py +87 -0
  19. murshid_backend/app/__init__.py +1 -0
  20. murshid_backend/app/api/__init__.py +1 -0
  21. murshid_backend/app/api/routes/__init__.py +1 -0
  22. murshid_backend/app/api/routes/db_viewer.py +122 -0
  23. murshid_backend/app/api/routes/health.py +73 -0
  24. murshid_backend/app/api/routes/queries.py +78 -0
  25. murshid_backend/app/api/routes/rules.py +100 -0
  26. murshid_backend/app/api/routes/stats.py +43 -0
  27. murshid_backend/app/config.py +29 -0
  28. murshid_backend/app/db/__init__.py +1 -0
  29. murshid_backend/app/db/base.py +5 -0
  30. murshid_backend/app/db/session.py +25 -0
  31. murshid_backend/app/main.py +60 -0
  32. murshid_backend/app/ml/__init__.py +1 -0
  33. murshid_backend/app/ml/embedder.py +116 -0
  34. murshid_backend/app/ml/logistic_model.py +111 -0
  35. murshid_backend/app/ml/pipeline.py +225 -0
  36. murshid_backend/app/ml/sanitizer.py +32 -0
  37. murshid_backend/app/ml/summarizer.py +262 -0
  38. murshid_backend/app/ml/svm_model.py +101 -0
  39. murshid_backend/app/models/__init__.py +16 -0
  40. murshid_backend/app/models/mapping_job.py +40 -0
  41. murshid_backend/app/models/query_template.py +27 -0
  42. murshid_backend/app/models/rule.py +27 -0
  43. murshid_backend/app/models/rule_technique_mapping.py +31 -0
  44. murshid_backend/app/models/technique.py +24 -0
  45. murshid_backend/app/models/user.py +30 -0
  46. murshid_backend/app/repositories/__init__.py +1 -0
  47. murshid_backend/app/repositories/job_repo.py +44 -0
  48. murshid_backend/app/repositories/rule_repo.py +71 -0
  49. murshid_backend/app/repositories/template_repo.py +94 -0
  50. murshid_backend/app/schemas/__init__.py +1 -0
.dockerignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__
2
+ *.pyc
3
+ *.pyo
4
+ .venv
5
+ venv
6
+ **/.env
7
+ .env.local
8
+ *.db
9
+ *.log
10
+ .git
11
+ .gitignore
12
+ *.zip
13
+ MurshidBackend_Colab.ipynb
14
+ MurshidBackend_Colab_Report.md
15
+ interface_pictures/
16
+ murshid_backend/.venv
17
+ murshid_backend/__pycache__
18
+ murshid_backend/TECHNICAL_REPORT.md
.gitattributes ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.joblib filter=lfs diff=lfs merge=lfs -text
2
+ *.npy filter=lfs diff=lfs merge=lfs -text
3
+ *.xlsx filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.pyc
3
+ *.pyo
4
+ .venv/
5
+ venv/
6
+ *.db
7
+ *.log
8
+ **/.env
9
+ .env.local
10
+ murshid_backend_for_drive.zip
11
+ interface_pictures/
DEPLOY_GUIDE.md ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚀 دليل النشر على Hugging Face Spaces
2
+
3
+ ## المتطلبات
4
+ - حساب على [Hugging Face](https://huggingface.co/) (مجاني)
5
+ - [Git](https://git-scm.com/) مثبّت على جهازك
6
+
7
+ ---
8
+
9
+ ## الخطوات
10
+
11
+ ### 1. إنشاء Space جديد
12
+
13
+ 1. اذهب إلى: https://huggingface.co/new-space
14
+ 2. **Space name**: `murshid`
15
+ 3. **SDK**: اختر **Docker**
16
+ 4. **Visibility**: Public (مجاني) أو Private
17
+ 5. اضغط **Create Space**
18
+
19
+ ### 2. رفع المشروع
20
+
21
+ ```powershell
22
+ cd d:\murishd
23
+
24
+ # تهيئة Git (إذا لم يكن موجوداً)
25
+ git init
26
+
27
+ # إضافة الـ remote (غيّر YOUR_USERNAME باسم حسابك)
28
+ git remote add space https://huggingface.co/spaces/YOUR_USERNAME/murshid
29
+
30
+ # إضافة الملفات والرفع
31
+ git add .
32
+ git commit -m "Initial deployment"
33
+ git push space main
34
+ ```
35
+
36
+ > ⚠️ إذا طلب كلمة مرور، استخدم **Access Token** من:
37
+ > https://huggingface.co/settings/tokens
38
+
39
+ ### 3. إعداد المتغيرات البيئية (Secrets)
40
+
41
+ اذهب إلى إعدادات الـ Space: `Settings → Variables and secrets`
42
+
43
+ أضف هذه المتغيرات:
44
+
45
+ | الاسم | القيمة | النوع |
46
+ |-------|--------|-------|
47
+ | `MURSHID_DB_URL` | `sqlite:////app/data/murshid.db` | Variable |
48
+ | `MURSHID_MODELS_DIR` | `/app/Needed` | Variable |
49
+ | `MURSHID_SKIP_LLM` | `true` | Variable |
50
+ | `SECRET_KEY` | (اختر كلمة سر عشوائية) | **Secret** |
51
+ | `HF_TOKEN` | (اختياري — لو تبغى Llama) | **Secret** |
52
+
53
+ ### 4. انتظر البناء
54
+
55
+ - HF Spaces يبني الـ Docker image تلقائياً
56
+ - يأخذ **3-5 دقائق** للبناء الأول
57
+ - بعد النجاح، الرابط يكون:
58
+ ```
59
+ https://YOUR_USERNAME-murshid.hf.space
60
+ ```
61
+
62
+ ---
63
+
64
+ ## الروابط بعد النشر
65
+
66
+ | الرابط | الوصف |
67
+ |--------|-------|
68
+ | `https://YOUR_USERNAME-murshid.hf.space` | الواجهة الرئيسية |
69
+ | `https://YOUR_USERNAME-murshid.hf.space/docs` | توثيق Swagger |
70
+ | `https://YOUR_USERNAME-murshid.hf.space/health` | فحص الحالة |
71
+
72
+ ---
73
+
74
+ ## ملاحظات
75
+
76
+ ### الوضع الحالي (LITE mode)
77
+ - المشروع ينشر بوضع **LITE** (بدون torch/SecureBERT+)
78
+ - تحليل القواعد يعمل لكن بدقة أقل (embeddings عشوائية)
79
+ - مناسب لاختبار الواجهة والـ API
80
+
81
+ ### للترقية إلى LOCAL mode (SecureBERT+ بدون Llama)
82
+ عدّل `Dockerfile` وأزل التعليق من سطر torch:
83
+ ```dockerfile
84
+ RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu transformers sentencepiece
85
+ ```
86
+ > ⚠️ هذا يزيد حجم الصورة ~800MB ويحتاج ذاكرة أكثر
87
+
88
+ ### للترقية إلى FULL mode (مع Llama 3)
89
+ - غيّر الـ Space إلى **GPU (T4)** من الإعدادات ($0.60/ساعة)
90
+ - عدّل `MURSHID_SKIP_LLM=false`
91
+ - أضف `HF_TOKEN` في الـ Secrets
92
+ - استخدم `requirements.txt` الكامل بدل `requirements_light.txt`
93
+
94
+ ---
95
+
96
+ ## استكشاف الأخطاء
97
+
98
+ | المشكلة | الحل |
99
+ |---------|------|
100
+ | Build فشل | تحقق من الـ Logs في تبويب الـ Space |
101
+ | 502 Bad Gateway | انتظر دقيقة — الخادم يبدأ |
102
+ | DB خطأ | تحقق من `MURSHID_DB_URL` في المتغيرات |
103
+ | Frontend لا يتصل | الـ BASE URL أصبح تلقائي (`window.location.origin`) |
Dockerfile ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

# System build dependencies: build-essential plus libxml2/libxslt headers
# (needed to compile lxml and other C extensions from requirements).
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential libxml2-dev libxslt1-dev \
    && rm -rf /var/lib/apt/lists/*

# Non-root user with uid 1000 (Hugging Face Spaces requirement).
RUN useradd -m -u 1000 appuser

WORKDIR /app

# Install Python dependencies first so this layer stays cached across
# application-code changes.
COPY murshid_backend/requirements_light.txt ./requirements.txt
RUN pip install --no-cache-dir -r requirements.txt \
    && pip install --no-cache-dir openpyxl aiofiles scikit-learn

# Optional: install CPU-only torch for LOCAL mode (SecureBERT+ embeddings).
# Uncomment the next line if you want LOCAL mode (adds ~800MB to the image):
# RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu transformers sentencepiece

# Application code, model artifacts, and the static frontend.
COPY murshid_backend/ ./murshid_backend/
COPY Needed/ ./Needed/
COPY murshid_frontend/ ./murshid_frontend/

# Writable directory for the SQLite database (image filesystem is
# read-only for appuser elsewhere).
RUN mkdir -p /app/data && chown -R appuser:appuser /app

# Runtime configuration, consolidated into one layer.
# SECURITY: SECRET_KEY here is a deliberate non-secret placeholder — never
# bake a real secret into the image. Override it at deploy time via the
# Space's "Variables and secrets" settings (see DEPLOY_GUIDE.md, step 3).
ENV MURSHID_DB_URL=sqlite:////app/data/murshid.db \
    MURSHID_MODELS_DIR=/app/Needed \
    MURSHID_SKIP_LLM=true \
    SECRET_KEY=change-me \
    PORT=7860

# Entry point script: runs DB migrations, imports templates, then starts
# the server (presumably uvicorn on $PORT — confirm against start.sh).
COPY start.sh ./start.sh
RUN chmod +x start.sh

USER appuser

EXPOSE 7860

CMD ["./start.sh"]
MurshidBackend_Colab.ipynb ADDED
@@ -0,0 +1,967 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# 🛡️ Murshid Backend — Full Mode on Colab\n",
8
+ "\n",
9
+ "**مُرشِد | From Alerts to Guidance: MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts**\n",
10
+ "\n",
11
+ "---\n",
12
+ "\n",
13
+ "## 📁 الملفات المطلوبة على Google Drive\n",
14
+ "\n",
15
+ "```\n",
16
+ "MyDrive/\n",
17
+ "├── murshid_backend_for_drive.zip ← ارفعيه ثم شغّلي الخلية 2b لاستخراجه\n",
18
+ "│ أو\n",
19
+ "├── murshid_backend/ ← إذا استخرجته مسبقاً\n",
20
+ "│ ├── app/\n",
21
+ "│ ├── alembic/\n",
22
+ "│ ├── scripts/\n",
23
+ "│ ├── alembic.ini\n",
24
+ "│ └── requirements.txt\n",
25
+ "│\n",
26
+ "└── Needed/\n",
27
+ " ├── murshid_logreg_pipeline_manual_oof_pcatuned.joblib\n",
28
+ " ├── murshid_logreg_thresholds_manual_oof_pcatuned.npy\n",
29
+ " ├── murshid_label_columns.json\n",
30
+ " └── murshid_query_template_structure_clean_shared.xlsx\n",
31
+ "```\n",
32
+ "\n",
33
+ "## تعليمات التشغيل\n",
34
+ "\n",
35
+ "### المتطلبات قبل التشغيل\n",
36
+ "1. ✅ **GPU مُفعَّل:** `Runtime → Change runtime type → T4 GPU`\n",
37
+ "2. ✅ **Google Drive مُتَّصل** (يحتوي مجلد `Needed` بملفات النماذج)\n",
38
+ "3. ✅ **مجلد `murshid_backend`** على Drive أو رفعه يدوياً\n",
39
+ "\n",
40
+ "### الملفات المطلوبة في Google Drive\n",
41
+ "```\n",
42
+ "MyDrive/\n",
43
+ "├── Needed/\n",
44
+ "│ ├── murshid_logreg_pipeline_manual_oof_pcatuned.joblib\n",
45
+ "│ ├── murshid_logreg_thresholds_manual_oof_pcatuned.npy\n",
46
+ "│ ├── murshid_label_columns.json\n",
47
+ "│ └── murshid_query_template_structure_clean_shared.xlsx\n",
48
+ "└── murshid_backend/ ← مجلد الباكند كاملاً\n",
49
+ "```\n",
50
+ "\n",
51
+ "### ترتيب التشغيل\n",
52
+ "**شغّلي الخلايا بالترتيب من الأعلى للأسفل — لا تتخطّي أي خلية**\n"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "markdown",
57
+ "metadata": {},
58
+ "source": [
59
+ "---\n",
60
+ "## الخلية 1: التحقق من GPU\n"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": null,
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": [
69
+ "import torch\n",
70
+ "\n",
71
+ "print('CUDA available:', torch.cuda.is_available())\n",
72
+ "if torch.cuda.is_available():\n",
73
+ " print('GPU:', torch.cuda.get_device_name(0))\n",
74
+ " print('Memory:', round(torch.cuda.get_device_properties(0).total_memory / 1e9, 1), 'GB')\n",
75
+ "else:\n",
76
+ " print('⚠️ لا يوجد GPU — غيّري Runtime إلى T4 من القائمة أعلاه')"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "markdown",
81
+ "metadata": {},
82
+ "source": [
83
+ "---\n",
84
+ "## الخلية 2: تحميل Google Drive\n"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "markdown",
89
+ "metadata": {},
90
+ "source": [
91
+ "---\n",
92
+ "## الخلية 3: تجهيز الباكند في /content\n",
93
+ "\n",
94
+ "> تقوم هذه الخلية تلقائياً بـ:\n",
95
+ "> 1. استخراج ZIP من Drive (إذا كان ZIP موجوداً ولم يُستخرج بعد)\n",
96
+ "> 2. نسخ مجلد `murshid_backend` إلى `/content` (أسرع للقراءة)\n",
97
+ "> 3. ضبط Python path\n"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "print('(هذه الخلية فارغة — الكود انتقل إلى الخلية 3 أدناه)')\n",
107
+ "\n"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "metadata": {},
114
+ "outputs": [],
115
+ "source": [
116
+ "from google.colab import drive\n",
117
+ "import os\n",
118
+ "\n",
119
+ "drive.mount('/content/drive')\n",
120
+ "\n",
121
+ "# ✏️ عدّلي هذا المسار إذا كان مجلدك مختلفاً\n",
122
+ "NEEDED_PATH = '/content/drive/MyDrive/Needed'\n",
123
+ "BACKEND_PATH = '/content/drive/MyDrive/murshid_backend'\n",
124
+ "ZIP_PATH = '/content/drive/MyDrive/murshid_backend_for_drive.zip'\n",
125
+ "\n",
126
+ "print('=' * 55)\n",
127
+ "print('📂 Checking Google Drive files...')\n",
128
+ "print('=' * 55)\n",
129
+ "\n",
130
+ "# ── التحقق من ملفات Needed ────────────────────────���───────────\n",
131
+ "print('\\n📁 Needed/ (model files):')\n",
132
+ "required_files = {\n",
133
+ " 'murshid_logreg_pipeline_manual_oof_pcatuned.joblib': 'LogReg model',\n",
134
+ " 'murshid_logreg_thresholds_manual_oof_pcatuned.npy': 'LogReg thresholds',\n",
135
+ " 'murshid_label_columns.json': 'Technique names',\n",
136
+ "}\n",
137
+ "\n",
138
+ "models_ok = True\n",
139
+ "for fname, desc in required_files.items():\n",
140
+ " path = f'{NEEDED_PATH}/{fname}'\n",
141
+ " exists = os.path.isfile(path)\n",
142
+ " size = f'{os.path.getsize(path)/1024:.0f} KB' if exists else ''\n",
143
+ " status = '✅' if exists else '❌'\n",
144
+ " print(f' {status} {fname} {size}')\n",
145
+ " if not exists:\n",
146
+ " models_ok = False\n",
147
+ "\n",
148
+ "excel_path = f'{NEEDED_PATH}/murshid_query_template_structure_clean_shared.xlsx'\n",
149
+ "excel_ok = os.path.isfile(excel_path)\n",
150
+ "print(f' {\"✅\" if excel_ok else \"⚠️ \"} murshid_query_template_structure_clean_shared.xlsx (optional)')\n",
151
+ "\n",
152
+ "# ── التحقق من الباكند ─────────────────────────────────────────\n",
153
+ "print('\\n📁 murshid_backend/ (backend code):')\n",
154
+ "backend_ok = os.path.isdir(BACKEND_PATH)\n",
155
+ "zip_ok = os.path.isfile(ZIP_PATH)\n",
156
+ "\n",
157
+ "if backend_ok:\n",
158
+ " fcount = sum(len(f) for _, _, f in os.walk(BACKEND_PATH))\n",
159
+ " print(f' ✅ murshid_backend/ ({fcount} files)')\n",
160
+ "elif zip_ok:\n",
161
+ " zsize = f'{os.path.getsize(ZIP_PATH)/1024:.0f} KB'\n",
162
+ " print(f' 📦 murshid_backend_for_drive.zip ({zsize}) — سيُستخرج تلقائياً في الخلية 3')\n",
163
+ "else:\n",
164
+ " print(f' ❌ murshid_backend/ غير موجود')\n",
165
+ " print(f' ❌ murshid_backend_for_drive.zip غير موجود')\n",
166
+ " print(f'\\n ⚠️ ارفعي murshid_backend_for_drive.zip إلى:')\n",
167
+ " print(f' Google Drive → My Drive')\n",
168
+ "\n",
169
+ "# ── ملخص ──────────────────────────────────────────────────────\n",
170
+ "print('\\n' + '=' * 55)\n",
171
+ "if models_ok and (backend_ok or zip_ok):\n",
172
+ " print('✅ كل شيء جاهز — تابعي تشغيل الخلايا')\n",
173
+ "elif not models_ok:\n",
174
+ " print('❌ ملفات النماذج مفقودة من Needed/ — يجب رفعها أولاً')\n",
175
+ "else:\n",
176
+ " print('❌ ملفات الباكند مفقودة — ارفعي ZIP أولاً')"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "markdown",
181
+ "metadata": {},
182
+ "source": [
183
+ "---\n",
184
+ "## الخلية 3: نسخ الباكند إلى /content\n",
185
+ "\n",
186
+ "> نسخ الملفات من Drive إلى `/content` لتسريع القراءة\n"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": null,
192
+ "metadata": {},
193
+ "outputs": [],
194
+ "source": [
195
+ "import shutil, os, zipfile, sys\n",
196
+ "\n",
197
+ "DRIVE_BASE = '/content/drive/MyDrive'\n",
198
+ "ZIP_PATH = f'{DRIVE_BASE}/murshid_backend_for_drive.zip'\n",
199
+ "BACKEND_DRIVE= f'{DRIVE_BASE}/murshid_backend'\n",
200
+ "BACKEND_LOCAL= '/content/murshid_backend'\n",
201
+ "\n",
202
+ "# ── الخطوة 1: استخراج ZIP من Drive إذا لزم ────────────────────\n",
203
+ "if not os.path.isdir(BACKEND_DRIVE):\n",
204
+ " if os.path.isfile(ZIP_PATH):\n",
205
+ " print(f'📦 ZIP found — extracting to Drive...')\n",
206
+ " with zipfile.ZipFile(ZIP_PATH, 'r') as z:\n",
207
+ " z.extractall(DRIVE_BASE)\n",
208
+ " print(f'✅ Extracted to {BACKEND_DRIVE}')\n",
209
+ " else:\n",
210
+ " print('❌ ERROR: مجلد murshid_backend غير موجود على Drive')\n",
211
+ " print(f' المطلوب: {BACKEND_DRIVE}')\n",
212
+ " print(f' أو رفع: {ZIP_PATH}')\n",
213
+ " raise FileNotFoundError(f'Backend not found. Upload murshid_backend_for_drive.zip to Google Drive MyDrive.')\n",
214
+ "else:\n",
215
+ " print(f'✅ murshid_backend found on Drive: {BACKEND_DRIVE}')\n",
216
+ "\n",
217
+ "# ── الخطوة 2: نسخ إلى /content (أسرع بكثير من Drive أثناء التشغيل) ─\n",
218
+ "if os.path.exists(BACKEND_LOCAL):\n",
219
+ " shutil.rmtree(BACKEND_LOCAL)\n",
220
+ "\n",
221
+ "shutil.copytree(\n",
222
+ " BACKEND_DRIVE,\n",
223
+ " BACKEND_LOCAL,\n",
224
+ " ignore=shutil.ignore_patterns('__pycache__', '*.pyc', '.venv', '*.db', '*.log')\n",
225
+ ")\n",
226
+ "\n",
227
+ "# ── الخطوة 3: إضافة للـ Python path ──────────────────────────\n",
228
+ "if BACKEND_LOCAL not in sys.path:\n",
229
+ " sys.path.insert(0, BACKEND_LOCAL)\n",
230
+ "\n",
231
+ "os.chdir(BACKEND_LOCAL)\n",
232
+ "\n",
233
+ "# ── تحقق ─────────────────────────────────────────────────────\n",
234
+ "file_count = sum(len(files) for _, _, files in os.walk(BACKEND_LOCAL))\n",
235
+ "print(f'✅ Backend ready at {BACKEND_LOCAL} ({file_count} files)')\n",
236
+ "print(f'✅ Working dir: {os.getcwd()}')\n",
237
+ "\n",
238
+ "# عرض الهيكل\n",
239
+ "print('\\nStructure:')\n",
240
+ "for item in sorted(os.listdir(BACKEND_LOCAL)):\n",
241
+ " full = os.path.join(BACKEND_LOCAL, item)\n",
242
+ " if os.path.isdir(full):\n",
243
+ " sub_count = len(os.listdir(full))\n",
244
+ " print(f' 📁 {item}/ ({sub_count} items)')\n",
245
+ " else:\n",
246
+ " size = os.path.getsize(full)\n",
247
+ " print(f' 📄 {item} ({size:,} bytes)')"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "markdown",
252
+ "metadata": {},
253
+ "source": [
254
+ "---\n",
255
+ "## الخلية 4: تثبيت المتطلبات\n"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": null,
261
+ "metadata": {},
262
+ "outputs": [],
263
+ "source": [
264
+ "print('📦 Installing requirements...')\n",
265
+ "\n",
266
+ "# ── الحزم الأساسية للباكند ──────────────────────────────────────\n",
267
+ "!pip install -q \\\n",
268
+ " fastapi==0.115.0 \\\n",
269
+ " \"uvicorn[standard]==0.32.0\" \\\n",
270
+ " pydantic==2.9.0 \\\n",
271
+ " pydantic-settings==2.6.0 \\\n",
272
+ " python-dotenv==1.0.0 \\\n",
273
+ " sqlalchemy==2.0.0 \\\n",
274
+ " alembic==1.13.0 \\\n",
275
+ " aiofiles \\\n",
276
+ " scikit-learn==1.6.1 \\\n",
277
+ " joblib \\\n",
278
+ " lxml \\\n",
279
+ " openpyxl \\\n",
280
+ " nest-asyncio \\\n",
281
+ " pyngrok\n",
282
+ "\n",
283
+ "# ── bitsandbytes: مطلوب لتحميل LLaMA بـ 4-bit على GPU ─────────\n",
284
+ "print('📦 Installing bitsandbytes (required for LLaMA 4-bit)...')\n",
285
+ "!pip install -q -U \"bitsandbytes>=0.46.1\"\n",
286
+ "\n",
287
+ "# ── accelerate: مطلوب لـ device_map=\"auto\" ────────────────────\n",
288
+ "!pip install -q -U accelerate\n",
289
+ "\n",
290
+ "# ── تحقق من التثبيت ──────────────────────────────────────────\n",
291
+ "import importlib\n",
292
+ "for pkg in ['bitsandbytes', 'accelerate', 'fastapi', 'sklearn']:\n",
293
+ " try:\n",
294
+ " mod = importlib.import_module(pkg if pkg != 'sklearn' else 'sklearn')\n",
295
+ " ver = getattr(mod, '__version__', '?')\n",
296
+ " print(f' ✅ {pkg}=={ver}')\n",
297
+ " except ImportError:\n",
298
+ " print(f' ❌ {pkg} — فشل التثبيت')\n",
299
+ "\n",
300
+ "print('\\n✅ All requirements installed')"
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "markdown",
305
+ "metadata": {},
306
+ "source": [
307
+ "---\n",
308
+ "## الخلية 5: إعداد ملف .env\n"
309
+ ]
310
+ },
311
+ {
312
+ "cell_type": "code",
313
+ "execution_count": null,
314
+ "metadata": {},
315
+ "outputs": [],
316
+ "source": [
317
+ "import os\n",
318
+ "\n",
319
+ "# ✏️ ضعي HF Token هنا إذا لم تُضيفيه عبر Colab Secrets\n",
320
+ "HF_TOKEN = os.environ.get('HF_TOKEN', 'ادخل التوكن')\n",
321
+ "\n",
322
+ "env_content = f\"\"\"# Auto-generated .env for Colab FULL mode\n",
323
+ "MURSHID_DB_URL=sqlite:////content/murshid.db\n",
324
+ "MURSHID_MODELS_DIR={NEEDED_PATH}\n",
325
+ "HF_TOKEN={HF_TOKEN}\n",
326
+ "MURSHID_SKIP_LLM=false\n",
327
+ "SECRET_KEY=murshid_colab_2026\n",
328
+ "LLAMA_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct\n",
329
+ "EMBED_MODEL_ID=ehsanaghaei/SecureBERT_Plus\n",
330
+ "LOGREG_JOBLIB=murshid_logreg_pipeline_manual_oof_pcatuned.joblib\n",
331
+ "LOGREG_THRESHOLDS_NPY=murshid_logreg_thresholds_manual_oof_pcatuned.npy\n",
332
+ "LABEL_COLUMNS_JSON=murshid_label_columns.json\n",
333
+ "\"\"\"\n",
334
+ "\n",
335
+ "env_path = '/content/murshid_backend/.env'\n",
336
+ "with open(env_path, 'w') as f:\n",
337
+ " f.write(env_content)\n",
338
+ "\n",
339
+ "print('✅ .env created at', env_path)\n",
340
+ "print('\\nContents:')\n",
341
+ "with open(env_path) as f:\n",
342
+ " for line in f:\n",
343
+ " if 'TOKEN' in line or 'SECRET' in line:\n",
344
+ " key = line.split('=')[0]\n",
345
+ " print(f' {key}=****')\n",
346
+ " else:\n",
347
+ " print(' ', line.rstrip())"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "markdown",
352
+ "metadata": {},
353
+ "source": [
354
+ "---\n",
355
+ "## الخلية 6: تهجير قاعدة البيانات (Alembic)\n"
356
+ ]
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": null,
361
+ "metadata": {},
362
+ "outputs": [],
363
+ "source": [
364
+ "import subprocess, os\n",
365
+ "\n",
366
+ "os.chdir('/content/murshid_backend')\n",
367
+ "\n",
368
+ "result = subprocess.run(\n",
369
+ " ['python', '-m', 'alembic', 'upgrade', 'head'],\n",
370
+ " capture_output=True, text=True\n",
371
+ ")\n",
372
+ "\n",
373
+ "print(result.stdout)\n",
374
+ "if result.stderr:\n",
375
+ " print(result.stderr)\n",
376
+ "\n",
377
+ "import os\n",
378
+ "db_exists = os.path.isfile('/content/murshid.db')\n",
379
+ "print('✅ Database ready:', '/content/murshid.db' if db_exists else '❌ لم يُنشأ')"
380
+ ]
381
+ },
382
+ {
383
+ "cell_type": "markdown",
384
+ "metadata": {},
385
+ "source": [
386
+ "---\n",
387
+ "## الخلية 7: استيراد قوالب WQL من Excel\n"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": null,
393
+ "metadata": {},
394
+ "outputs": [],
395
+ "source": [
396
+ "import sys\n",
397
+ "sys.path.insert(0, '/content/murshid_backend')\n",
398
+ "os.chdir('/content/murshid_backend')\n",
399
+ "\n",
400
+ "excel_path = f'{NEEDED_PATH}/murshid_query_template_structure_clean_shared.xlsx'\n",
401
+ "\n",
402
+ "if os.path.isfile(excel_path):\n",
403
+ " from app.db.session import SessionLocal\n",
404
+ " from scripts.import_excel_templates import run as import_excel\n",
405
+ "\n",
406
+ " db = SessionLocal()\n",
407
+ " try:\n",
408
+ " result = import_excel(db, replace=False)\n",
409
+ " print('✅ Excel import result:')\n",
410
+ " for k, v in result.items():\n",
411
+ " print(f' {k}: {v}')\n",
412
+ " finally:\n",
413
+ " db.close()\n",
414
+ "else:\n",
415
+ " print(f'⚠️ Excel file not found at: {excel_path}')\n",
416
+ " print(' يمكنك المتابعة — القوالب ستُضاف لاحقاً يدوياً')"
417
+ ]
418
+ },
419
+ {
420
+ "cell_type": "markdown",
421
+ "metadata": {},
422
+ "source": [
423
+ "---\n",
424
+ "## الخلية 8: تشغيل FastAPI + ngrok\n",
425
+ "\n",
426
+ "> ⏳ هذه الخلية تأخذ **5-10 دقائق** لتحميل LLaMA (4.5GB) و SecureBERT+\n",
427
+ "\n",
428
+ "> 🔑 **الرابط العام سيظهر في النهاية** — انسخيه للفرونت\n"
429
+ ]
430
+ },
431
+ {
432
+ "cell_type": "code",
433
+ "execution_count": null,
434
+ "metadata": {},
435
+ "outputs": [],
436
+ "source": [
437
+ "import subprocess, time, os, sys, urllib.request\n",
438
+ "import nest_asyncio\n",
439
+ "nest_asyncio.apply()\n",
440
+ "\n",
441
+ "os.chdir('/content/murshid_backend')\n",
442
+ "\n",
443
+ "# ─── التحقق من bitsandbytes قبل تشغيل الخادم ─────────────────\n",
444
+ "try:\n",
445
+ " import bitsandbytes as bnb\n",
446
+ " print(f'✅ bitsandbytes {bnb.__version__}')\n",
447
+ "except ImportError:\n",
448
+ " print('❌ bitsandbytes غير مثبّت — شغّلي الخلية 4 أولاً')\n",
449
+ " raise\n",
450
+ "\n",
451
+ "# ─── تشغيل uvicorn ───────────────────────────────────────────\n",
452
+ "log_path = '/content/murshid_server.log'\n",
453
+ "log_file = open(log_path, 'w')\n",
454
+ "\n",
455
+ "server_proc = subprocess.Popen(\n",
456
+ " [\n",
457
+ " 'python', '-m', 'uvicorn', 'app.main:app',\n",
458
+ " '--host', '0.0.0.0',\n",
459
+ " '--port', '8000',\n",
460
+ " '--log-level', 'info'\n",
461
+ " ],\n",
462
+ " cwd='/content/murshid_backend',\n",
463
+ " stdout=log_file,\n",
464
+ " stderr=subprocess.STDOUT\n",
465
+ ")\n",
466
+ "\n",
467
+ "print('⏳ Loading LLaMA 3 8B + SecureBERT+...')\n",
468
+ "print(' جاري التحميل — انتظري حتى تظهر الرسالة النهائية')\n",
469
+ "\n",
470
+ "# ─── انتظار ذكي مع عرض اللوج ────────────────────────────────\n",
471
+ "started = False\n",
472
+ "last_log_size = 0\n",
473
+ "\n",
474
+ "for i in range(180): # 15 دقيقة كحد أقصى\n",
475
+ " time.sleep(5)\n",
476
+ "\n",
477
+ " # تحقق إذا بدأ الخادم\n",
478
+ " try:\n",
479
+ " resp = urllib.request.urlopen('http://localhost:8000/health', timeout=3)\n",
480
+ " if resp.status == 200:\n",
481
+ " started = True\n",
482
+ " break\n",
483
+ " except Exception:\n",
484
+ " pass\n",
485
+ "\n",
486
+ " # عرض اللوج الجديد كل 30 ثانية\n",
487
+ " if i % 6 == 0:\n",
488
+ " elapsed = (i + 1) * 5\n",
489
+ " log_file.flush()\n",
490
+ " try:\n",
491
+ " with open(log_path) as f:\n",
492
+ " log_content = f.read()\n",
493
+ " new_content = log_content[last_log_size:]\n",
494
+ " last_log_size = len(log_content)\n",
495
+ "\n",
496
+ " # تحقق من خطأ مبكر\n",
497
+ " if 'ERROR' in new_content or 'ImportError' in new_content:\n",
498
+ " print(f'\\n❌ خطأ في الخادم عند {elapsed}s:')\n",
499
+ " # عرض آخر 1000 حرف من اللوج\n",
500
+ " print(log_content[-1500:])\n",
501
+ " server_proc.terminate()\n",
502
+ " log_file.close()\n",
503
+ " raise RuntimeError('Server failed to start. See log above.')\n",
504
+ "\n",
505
+ " # عرض ما تم تحميله\n",
506
+ " if 'Loaded' in new_content or 'loaded' in new_content or 'Application' in new_content:\n",
507
+ " for line in new_content.strip().split('\\n'):\n",
508
+ " if any(k in line for k in ['INFO', 'Loaded', 'loaded', 'Application', 'WARNING']):\n",
509
+ " print(f' {line.strip()}')\n",
510
+ " else:\n",
511
+ " mins = elapsed // 60\n",
512
+ " secs = elapsed % 60\n",
513
+ " print(f' ⏳ {mins}m {secs}s — يجري تحميل النماذج...')\n",
514
+ " except RuntimeError:\n",
515
+ " raise\n",
516
+ " except Exception:\n",
517
+ " print(f' ⏳ {elapsed}s elapsed...')\n",
518
+ "\n",
519
+ "log_file.flush()\n",
520
+ "log_file.close()\n",
521
+ "\n",
522
+ "if not started:\n",
523
+ " print('\\n❌ Server did not start after 15 minutes.')\n",
524
+ " print('─── آخر سطور اللوج ───')\n",
525
+ " with open(log_path) as f:\n",
526
+ " print(f.read()[-3000:])\n",
527
+ "else:\n",
528
+ " print('\\n✅ Server started successfully!')\n",
529
+ "\n",
530
+ " # ─── Cloudflare Tunnel (مجاني — بدون حساب) ──────────────────\n",
531
+ " import subprocess, re, threading, time\n",
532
+ "\n",
533
+ " # تثبيت cloudflared\n",
534
+ " subprocess.run(\n",
535
+ " ['wget', '-q', 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64',\n",
536
+ " '-O', '/usr/local/bin/cloudflared'],\n",
537
+ " check=True\n",
538
+ " )\n",
539
+ " subprocess.run(['chmod', '+x', '/usr/local/bin/cloudflared'], check=True)\n",
540
+ " print('✅ cloudflared installed')\n",
541
+ "\n",
542
+ " # تشغيل النفق\n",
543
+ " cf_log = open('/content/cloudflared.log', 'w')\n",
544
+ " cf_proc = subprocess.Popen(\n",
545
+ " ['cloudflared', 'tunnel', '--url', 'http://localhost:8000'],\n",
546
+ " stdout=cf_log, stderr=subprocess.STDOUT\n",
547
+ " )\n",
548
+ "\n",
549
+ " # انتظار ظهور الرابط في اللوج\n",
550
+ " public_url = None\n",
551
+ " for _ in range(30):\n",
552
+ " time.sleep(2)\n",
553
+ " cf_log.flush()\n",
554
+ " try:\n",
555
+ " with open('/content/cloudflared.log') as f:\n",
556
+ " content = f.read()\n",
557
+ " match = re.search(r'https://[a-z0-9\\-]+\\.trycloudflare\\.com', content)\n",
558
+ " if match:\n",
559
+ " public_url = match.group(0)\n",
560
+ " break\n",
561
+ " except Exception:\n",
562
+ " pass\n",
563
+ "\n",
564
+ " if public_url:\n",
565
+ " print('\\n' + '='*60)\n",
566
+ " print('🌐 PUBLIC URL (الرابط العام — Cloudflare):')\n",
567
+ " print(f' {public_url}')\n",
568
+ " print('='*60)\n",
569
+ " print(f'📖 Swagger: {public_url}/docs')\n",
570
+ " print(f'💚 Health: {public_url}/health')\n",
571
+ " print(f'🗄️ DB Summary: {public_url}/api/db/summary')\n",
572
+ " print('='*60)\n",
573
+ " print('\\n📋 انسخي هذا السطر والصقيه في الفرونت (index.html):')\n",
574
+ " print(f\" const BASE = '{public_url}';\")\n",
575
+ " else:\n",
576
+ " print('⚠️ Cloudflare tunnel URL not found, check /content/cloudflared.log')\n",
577
+ " with open('/content/cloudflared.log') as f:\n",
578
+ " print(f.read()[-1000:])"
579
+ ]
580
+ },
581
+ {
582
+ "cell_type": "code",
583
+ "execution_count": null,
584
+ "metadata": {},
585
+ "outputs": [],
586
+ "source": [
587
+ "# ─── تشغيل Cloudflare Tunnel بشكل منفصل (إذا فشل مع الخلية 8) ─\n",
588
+ "# شغّلي هذه الخلية فقط إذا كان الخادم يعمل لكن الـ tunnel فشل\n",
589
+ "\n",
590
+ "import subprocess, re, time, os\n",
591
+ "\n",
592
+ "# تثبيت cloudflared إذا لم يُثبَّت\n",
593
+ "if not os.path.isfile('/usr/local/bin/cloudflared'):\n",
594
+ " subprocess.run(\n",
595
+ " ['wget', '-q',\n",
596
+ " 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64',\n",
597
+ " '-O', '/usr/local/bin/cloudflared'],\n",
598
+ " check=True\n",
599
+ " )\n",
600
+ " subprocess.run(['chmod', '+x', '/usr/local/bin/cloudflared'], check=True)\n",
601
+ " print('✅ cloudflared installed')\n",
602
+ "else:\n",
603
+ " print('✅ cloudflared already installed')\n",
604
+ "\n",
605
+ "# تشغيل النفق\n",
606
+ "cf_log_path = '/content/cloudflared.log'\n",
607
+ "cf_log = open(cf_log_path, 'w')\n",
608
+ "cf_proc = subprocess.Popen(\n",
609
+ " ['cloudflared', 'tunnel', '--url', 'http://localhost:8000'],\n",
610
+ " stdout=cf_log, stderr=subprocess.STDOUT\n",
611
+ ")\n",
612
+ "\n",
613
+ "print('⏳ Opening Cloudflare tunnel...')\n",
614
+ "\n",
615
+ "public_url = None\n",
616
+ "for _ in range(30):\n",
617
+ " time.sleep(2)\n",
618
+ " cf_log.flush()\n",
619
+ " try:\n",
620
+ " with open(cf_log_path) as f:\n",
621
+ " content = f.read()\n",
622
+ " match = re.search(r'https://[a-z0-9\\-]+\\.trycloudflare\\.com', content)\n",
623
+ " if match:\n",
624
+ " public_url = match.group(0)\n",
625
+ " break\n",
626
+ " except Exception:\n",
627
+ " pass\n",
628
+ "\n",
629
+ "if public_url:\n",
630
+ " print('\\n' + '='*60)\n",
631
+ " print(f'🌐 PUBLIC URL: {public_url}')\n",
632
+ " print(f'📖 Swagger: {public_url}/docs')\n",
633
+ " print(f'💚 Health: {public_url}/health')\n",
634
+ " print('='*60)\n",
635
+ " print('\\n📋 الصقي هذا السطر في index.html:')\n",
636
+ " print(f\" const BASE = '{public_url}';\")\n",
637
+ "else:\n",
638
+ " print('❌ لم يُعثر على URL. اللوج:')\n",
639
+ " with open(cf_log_path) as f:\n",
640
+ " print(f.read())\n"
641
+ ]
642
+ },
643
+ {
644
+ "cell_type": "markdown",
645
+ "metadata": {},
646
+ "source": [
647
+ "---\n",
648
+ "## الخلية 9: ربط الفرونت بـ Cloudflare URL\n",
649
+ "\n",
650
+ "بعد تشغيل الخلية السابقة، ستظهر رسالة مثل:\n",
651
+ "```\n",
652
+ "🌐 PUBLIC URL: https://xxxx-xxxx.trycloudflare.com\n",
653
+ "```\n",
654
+ "\n",
655
+ "**الخلية أدناه تُحدّث الفرونت تلقائياً** — أو يمكنك التعديل يدوياً في `index.html`:\n",
656
+ "```javascript\n",
657
+ "const BASE = 'https://xxxx-xxxx.trycloudflare.com';\n",
658
+ "```\n"
659
+ ]
660
+ },
661
+ {
662
+ "cell_type": "code",
663
+ "execution_count": null,
664
+ "metadata": {},
665
+ "outputs": [],
666
+ "source": [
667
+ "import subprocess, re, time, os\n",
668
+ "\n",
669
+ "# ── الخطوة 1: تثبيت cloudflared ──────────────────────────────\n",
670
+ "if not os.path.isfile('/usr/local/bin/cloudflared'):\n",
671
+ " subprocess.run([\n",
672
+ " 'wget', '-q',\n",
673
+ " 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64',\n",
674
+ " '-O', '/usr/local/bin/cloudflared'\n",
675
+ " ], check=True)\n",
676
+ " subprocess.run(['chmod', '+x', '/usr/local/bin/cloudflared'], check=True)\n",
677
+ " print('✅ cloudflared installed')\n",
678
+ "else:\n",
679
+ " print('✅ cloudflared ready')\n",
680
+ "\n",
681
+ "# ── الخطوة 2: تشغيل النفق ────────────────────────────────────\n",
682
+ "cf_log_path = '/content/cf.log'\n",
683
+ "cf_log = open(cf_log_path, 'w')\n",
684
+ "subprocess.Popen(\n",
685
+ " ['cloudflared', 'tunnel', '--url', 'http://localhost:8000'],\n",
686
+ " stdout=cf_log, stderr=subprocess.STDOUT\n",
687
+ ")\n",
688
+ "\n",
689
+ "print('⏳ Opening Cloudflare tunnel...')\n",
690
+ "\n",
691
+ "# ── الخطوة 3: انتظار الرابط ───────────────────────────────────\n",
692
+ "public_url = None\n",
693
+ "for _ in range(30):\n",
694
+ " time.sleep(2)\n",
695
+ " cf_log.flush()\n",
696
+ " with open(cf_log_path) as f:\n",
697
+ " content = f.read()\n",
698
+ " match = re.search(r'https://[a-z0-9\\-]+\\.trycloudflare\\.com', content)\n",
699
+ " if match:\n",
700
+ " public_url = match.group(0)\n",
701
+ " break\n",
702
+ "\n",
703
+ "if not public_url:\n",
704
+ " print('❌ Tunnel failed. Log:')\n",
705
+ " with open(cf_log_path) as f: print(f.read())\n",
706
+ "else:\n",
707
+ " # ── الخطوة 4: تحديث index.html تلقائياً ─────────────────\n",
708
+ " frontend_path = '/content/drive/MyDrive/murshid_frontend/index.html'\n",
709
+ "\n",
710
+ " if os.path.isfile(frontend_path):\n",
711
+ " with open(frontend_path, 'r', encoding='utf-8') as f:\n",
712
+ " html = f.read()\n",
713
+ " html_updated = re.sub(r\"const BASE = '[^']*';\",\n",
714
+ " f\"const BASE = '{public_url}';\", html)\n",
715
+ " with open(frontend_path, 'w', encoding='utf-8') as f:\n",
716
+ " f.write(html_updated)\n",
717
+ " print(f'✅ index.html updated automatically')\n",
718
+ " else:\n",
719
+ " print(f'⚠️ index.html not found — عدّليه يدوياً')\n",
720
+ "\n",
721
+ " print('\\n' + '='*60)\n",
722
+ " print(f'🌐 PUBLIC URL: {public_url}')\n",
723
+ " print(f'📖 Swagger: {public_url}/docs')\n",
724
+ " print(f'💚 Health: {public_url}/health')\n",
725
+ " print(f'🖥️ Frontend: {public_url}/index.html')\n",
726
+ " print('='*60)\n",
727
+ " print(f\"\\n📋 const BASE = '{public_url}';\")"
728
+ ]
729
+ },
730
+ {
731
+ "cell_type": "markdown",
732
+ "metadata": {},
733
+ "source": [
734
+ "---\n",
735
+ "## الخلية 10: اختبار الـ API\n"
736
+ ]
737
+ },
738
+ {
739
+ "cell_type": "code",
740
+ "execution_count": null,
741
+ "metadata": {},
742
+ "outputs": [],
743
+ "source": [
744
+ "import urllib.request, json\n",
745
+ "\n",
746
+ "# ─── Health Check ────────────────────────────────────────────\n",
747
+ "with urllib.request.urlopen('http://localhost:8000/health') as r:\n",
748
+ " health = json.load(r)\n",
749
+ "\n",
750
+ "print('=== Health Check ===')\n",
751
+ "print(f\" status: {health['status']}\")\n",
752
+ "print(f\" pipeline_mode: {health['pipeline_mode']}\")\n",
753
+ "print(f\" llama_loaded: {health['components']['llama_loaded']}\")\n",
754
+ "print(f\" embedder_loaded: {health['components']['embedder_loaded']}\")\n",
755
+ "print(f\" logreg_loaded: {health['components']['logreg_loaded']}\")\n",
756
+ "print(f\" cuda_available: {health['components']['cuda_available']}\")\n",
757
+ "\n",
758
+ "mode = health.get('pipeline_mode', 'unknown')\n",
759
+ "if mode == 'full':\n",
760
+ " print('\\n✅ FULL mode — نتائج مطابقة 100% للدفتر')\n",
761
+ "elif mode == 'local':\n",
762
+ " print('\\n⚠️ LOCAL mode — LLaMA لم يُحمَّل، تحققي من MURSHID_SKIP_LLM=false')\n",
763
+ "else:\n",
764
+ " print('\\n❌ LITE mode — تحققي من تثبيت torch والنماذج')"
765
+ ]
766
+ },
767
+ {
768
+ "cell_type": "code",
769
+ "execution_count": null,
770
+ "metadata": {},
771
+ "outputs": [],
772
+ "source": [
773
+ "# ─── تحليل قاعدة اختبار ──────────────────────────────────────\n",
774
+ "import urllib.request, json\n",
775
+ "\n",
776
+ "test_rule = '''<rule id=\"18205\" level=\"5\">\n",
777
+ " <if_sid>18201</if_sid>\n",
778
+ " <id>^634$|^4730$</id>\n",
779
+ " <description>Windows: Security Enabled Global Group Deleted</description>\n",
780
+ " <mitre><id>T1484</id></mitre>\n",
781
+ " <group>group_deleted,win_group_deleted</group>\n",
782
+ "</rule>'''\n",
783
+ "\n",
784
+ "payload = json.dumps({'rule_xml': test_rule}).encode()\n",
785
+ "req = urllib.request.Request(\n",
786
+ " 'http://localhost:8000/rules/analyze',\n",
787
+ " data=payload,\n",
788
+ " headers={'Content-Type': 'application/json'},\n",
789
+ " method='POST'\n",
790
+ ")\n",
791
+ "\n",
792
+ "with urllib.request.urlopen(req) as r:\n",
793
+ " result = json.load(r)\n",
794
+ "\n",
795
+ "print('=== Analyze Result ===')\n",
796
+ "print(f\" rule_id: {result['rule_id']}\")\n",
797
+ "print(f\" pipeline_mode: {result['pipeline_mode']}\")\n",
798
+ "print(f\" summary: {result['summary']}\")\n",
799
+ "print(f\"\\n TOP 5 Techniques:\")\n",
800
+ "print(f\" {'Technique':<15} {'Conf%':>8} {'Proba':>8} {'Thr':>6} {'Gap':>8} {'Pred':>6}\")\n",
801
+ "print(f\" {'-'*55}\")\n",
802
+ "for r in result['all_results'][:5]:\n",
803
+ " pred = '✅' if r['predicted'] else ' '\n",
804
+ " print(f\" {pred} {r['technique_id']:<13} {r['confidence_percent']:>7.2f}%\"\n",
805
+ " f\" {r['proba']:>8.4f} {r['threshold']:>6.2f} {r['gap']:>+8.4f}\")\n",
806
+ "\n",
807
+ "print(f\"\\n Detected: {len(result['detected'])} technique(s)\")\n",
808
+ "for d in result['detected']:\n",
809
+ " print(f\" ✅ {d['technique_id']} — {d['confidence_percent']}%\")"
810
+ ]
811
+ },
812
+ {
813
+ "cell_type": "code",
814
+ "execution_count": null,
815
+ "metadata": {},
816
+ "outputs": [],
817
+ "source": [
818
+ "# ─── قوالب WQL للتقنية المكتشفة ──────────────────────────────\n",
819
+ "if result['detected']:\n",
820
+ " top_technique = result['detected'][0]['technique_id']\n",
821
+ "\n",
822
+ " with urllib.request.urlopen(f'http://localhost:8000/queries/{top_technique}') as r:\n",
823
+ " queries = json.load(r)\n",
824
+ "\n",
825
+ " print(f'=== WQL Templates for {top_technique} ===')\n",
826
+ " for i, q in enumerate(queries, 1):\n",
827
+ " print(f\"\\n [{i}] {q.get('purpose', 'N/A')}\")\n",
828
+ " print(f\" Query: {q['wql_query'][:120]}...\")\n",
829
+ " print(f\" Note: {q.get('note', 'N/A')}\")"
830
+ ]
831
+ },
832
+ {
833
+ "cell_type": "markdown",
834
+ "metadata": {},
835
+ "source": [
836
+ "---\n",
837
+ "## الخلية 11: تصدير النتائج (اختياري)\n",
838
+ "\n",
839
+ "لحفظ النتائج بصيغة JSON لاستخدامها لاحقاً على الجهاز المحلي\n"
840
+ ]
841
+ },
842
+ {
843
+ "cell_type": "code",
844
+ "execution_count": null,
845
+ "metadata": {},
846
+ "outputs": [],
847
+ "source": [
848
+ "# ─── تحليل قائمة من القواعد وتصديرها ───────────────────────\n",
849
+ "import urllib.request, json, os\n",
850
+ "\n",
851
+ "# ✏️ أضيفي Rule IDs التي تريدين تحليلها\n",
852
+ "# يمكنك قراءتها من ملف\n",
853
+ "test_ids_path = f'{NEEDED_PATH}/test_rule_ids.json'\n",
854
+ "\n",
855
+ "if os.path.isfile(test_ids_path):\n",
856
+ " with open(test_ids_path) as f:\n",
857
+ " rule_ids = json.load(f)\n",
858
+ " print(f'Loaded {len(rule_ids)} rule IDs from test_rule_ids.json')\n",
859
+ "else:\n",
860
+ " # قواعد تجريبية\n",
861
+ " rule_ids = ['18205']\n",
862
+ " print('Using default test rule')\n",
863
+ "\n",
864
+ "print(f'Processing {len(rule_ids)} rules...')\n",
865
+ "\n",
866
+ "export_results = []\n",
867
+ "\n",
868
+ "for rule_id in rule_ids:\n",
869
+ " try:\n",
870
+ " with urllib.request.urlopen(f'http://localhost:8000/results/{rule_id}') as r:\n",
871
+ " data = json.load(r)\n",
872
+ " data['source'] = 'colab_full_mode'\n",
873
+ " export_results.append(data)\n",
874
+ " detected = len(data.get('detected', []))\n",
875
+ " top = data['mappings'][0] if data['mappings'] else {}\n",
876
+ " print(f\" ✅ {rule_id}: {top.get('technique_id','?')} ({top.get('confidence_percent','?')}%) — {detected} detected\")\n",
877
+ " except Exception as e:\n",
878
+ " print(f\" ⚠️ {rule_id}: {e}\")\n",
879
+ "\n",
880
+ "# حفظ النتائج\n",
881
+ "export_path = f'{NEEDED_PATH}/murshid_full_results.json'\n",
882
+ "with open(export_path, 'w', encoding='utf-8') as f:\n",
883
+ " json.dump(export_results, f, ensure_ascii=False, indent=2)\n",
884
+ "\n",
885
+ "print(f'\\n✅ Exported {len(export_results)} results to:')\n",
886
+ "print(f' {export_path}')\n",
887
+ "print('\\nيمكنك الآن استيراد هذا الملف في الباكند المحلي')"
888
+ ]
889
+ },
890
+ {
891
+ "cell_type": "markdown",
892
+ "metadata": {},
893
+ "source": [
894
+ "---\n",
895
+ "## الخلية 12: إيقاف الخادم (عند الانتهاء)\n"
896
+ ]
897
+ },
898
+ {
899
+ "cell_type": "code",
900
+ "execution_count": null,
901
+ "metadata": {},
902
+ "outputs": [],
903
+ "source": [
904
+ "# إيقاف الخادم وإغلاق ngrok\n",
905
+ "try:\n",
906
+ " from pyngrok import ngrok\n",
907
+ " ngrok.kill()\n",
908
+ " print('✅ ngrok tunnel closed')\n",
909
+ "except Exception:\n",
910
+ " pass\n",
911
+ "\n",
912
+ "try:\n",
913
+ " server_proc.terminate()\n",
914
+ " print('✅ Server stopped')\n",
915
+ "except Exception:\n",
916
+ " pass"
917
+ ]
918
+ },
919
+ {
920
+ "cell_type": "markdown",
921
+ "metadata": {},
922
+ "source": [
923
+ "---\n",
924
+ "## ملاحظات مهمة\n",
925
+ "\n",
926
+ "### إذا انقطع الاتصال بـ Colab\n",
927
+ "- الخادم يتوقف تلقائياً\n",
928
+ "- أعيدي تشغيل الخلايا من الخلية 8\n",
929
+ "- رابط Cloudflare سيتغيّر — عدّلي الفرونت بالرابط الجديد\n",
930
+ "\n",
931
+ "### إذا ظهر خطأ في LLaMA\n",
932
+ "- تأكدي أن لديك صلاحية الوصول للنموذج: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct\n",
933
+ "- تأكدي من صحة HF_TOKEN\n",
934
+ "\n",
935
+ "### المقارنة مع الجهاز المحلي\n",
936
+ "| | Colab (FULL) | الجهاز المحلي (LOCAL) |\n",
937
+ "|--|-------------|----------------------|\n",
938
+ "| LLaMA | ✅ | ❌ |\n",
939
+ "| T1484 confidence | **94.76%** | 89.29% |\n",
940
+ "| القرار النهائي | T1484 ✅ | T1484 ✅ |\n",
941
+ "\n",
942
+ "### للعرض التقديمي\n",
943
+ "1. شغّلي الخلايا 1-8 مسبقاً (قبل العرض بـ 15 دقيقة)\n",
944
+ "2. انسخي رابط Cloudflare\n",
945
+ "3. عدّلي الفرونت\n",
946
+ "4. افتحي `https://xxxx.trycloudflare.com/index.html`\n"
947
+ ]
948
+ }
949
+ ],
950
+ "metadata": {
951
+ "accelerator": "GPU",
952
+ "colab": {
953
+ "gpuType": "T4",
954
+ "machine_shape": "hm",
955
+ "provenance": []
956
+ },
957
+ "kernelspec": {
958
+ "display_name": "Python 3",
959
+ "name": "python3"
960
+ },
961
+ "language_info": {
962
+ "name": "python"
963
+ }
964
+ },
965
+ "nbformat": 4,
966
+ "nbformat_minor": 0
967
+ }
MurshidBackend_Colab_Report.md ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # تقرير تقني: آلية عمل MurshidBackend_Colab.ipynb
2
+
3
+ ## مشروع مُرشِد | From Alerts to Guidance
4
+ ### MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts
5
+
6
+ ---
7
+
8
+ ## 1. نظرة عامة
9
+
10
+ `MurshidBackend_Colab.ipynb` هو دفتر Jupyter مُصمَّم لتشغيل الباكند الكامل لمشروع مُرشِد على بيئة **Google Colab** باستخدام **GPU (Tesla T4)**، مما يُتيح تشغيل نموذج **LLaMA 3 8B** بتكميم 4-bit لتوليد ملخصات دلالية غنية لقواعد Wazuh XML، وذلك على عكس البيئة المحلية التي تعمل بدون LLaMA (LOCAL mode).
11
+
12
+ ### الهدف الرئيسي
13
+ تشغيل **FULL mode** للـ pipeline:
14
+ ```
15
+ قاعدة Wazuh XML
16
+
17
+ LLaMA 3 8B ←── ملخص دلالي غني (GPU)
18
+
19
+ SecureBERT+ ←── 768-dim embedding
20
+
21
+ Logistic Regression ←── confidence scores لكل تقنية
22
+
23
+ FastAPI + SQLite ←── تخزين وخدمة النتائج
24
+
25
+ Cloudflare Tunnel ←── رابط عام للفرونت
26
+ ```
27
+
28
+ ---
29
+
30
+ ## 2. المتطلبات قبل التشغيل
31
+
32
+ ### 2.1 إعداد Google Colab
33
+ | المتطلب | التفاصيل |
34
+ |---------|----------|
35
+ | **GPU** | Tesla T4 — يُفعَّل من: `Runtime → Change runtime type → T4 GPU` |
36
+ | **الذاكرة** | High RAM (machine_shape: "hm") |
37
+ | **الإنترنت** | مفعَّل لتنزيل النماذج من Hugging Face |
38
+
39
+ ### 2.2 الملفات المطلوبة على Google Drive
40
+ ```
41
+ MyDrive/
42
+ ├── murshid_backend_for_drive.zip ← ملفات الباكند مضغوطة (44 KB)
43
+ │ أو
44
+ ├── murshid_backend/ ← المجلد مستخرج مسبقاً
45
+ │ ├── app/
46
+ │ │ ├── main.py
47
+ │ │ ├── config.py
48
+ │ │ ├── api/routes/
49
+ │ │ ├── ml/
50
+ │ │ ├── models/
51
+ │ │ ├── services/
52
+ │ │ └── repositories/
53
+ │ ├── alembic/
54
+ │ ├── scripts/
55
+ │ ├── alembic.ini
56
+ │ └── requirements.txt
57
+
58
+ └── Needed/
59
+ ├── murshid_logreg_pipeline_manual_oof_pcatuned.joblib ← نموذج LogReg
60
+ ├── murshid_logreg_thresholds_manual_oof_pcatuned.npy ← عتبات التنبؤ
61
+ ├── murshid_label_columns.json ← أسماء التقنيات الـ 20
62
+ └── murshid_query_template_structure_clean_shared.xlsx ← 60 قالب WQL
63
+ ```
64
+
65
+ ### 2.3 Hugging Face Token
66
+ مطلوب للوصول إلى نموذج `meta-llama/Meta-Llama-3-8B-Instruct`:
67
+ - يُضاف في `Colab Secrets` باسم `HF_TOKEN`
68
+ - أو مباشرةً في خلية 5 من الدفتر
69
+
70
+ ---
71
+
72
+ ## 3. شرح الخلايا بالتفصيل
73
+
74
+ ### الخلية 1: التحقق من GPU
75
+
76
+ **الهدف:** التأكد من وجود GPU قبل البدء.
77
+
78
+ ```python
79
+ import torch
80
+ print('CUDA available:', torch.cuda.is_available())
81
+ print('GPU:', torch.cuda.get_device_name(0))
82
+ print('Memory:', round(torch.cuda.get_device_properties(0).total_memory / 1e9, 1), 'GB')
83
+ ```
84
+
85
+ **المخرج المتوقع:**
86
+ ```
87
+ CUDA available: True
88
+ GPU: Tesla T4
89
+ Memory: 15.8 GB
90
+ ```
91
+
92
+ **ماذا يحدث إذا لم يكن هناك GPU؟**
93
+ - LLaMA لن يُحمَّل (يحتاج CUDA)
94
+ - الخادم سيعمل بـ LOCAL mode فقط (بدون تلخيص)
95
+
96
+ ---
97
+
98
+ ### الخلية 2: تحميل Google Drive والتحقق من الملفات
99
+
100
+ **الهدف:** ربط Colab بـ Google Drive والتحقق من وجود جميع الملفات المطلوبة.
101
+
102
+ ```python
103
+ from google.colab import drive
104
+ drive.mount('/content/drive')
105
+
106
+ NEEDED_PATH = '/content/drive/MyDrive/Needed'
107
+ BACKEND_PATH = '/content/drive/MyDrive/murshid_backend'
108
+ ZIP_PATH = '/content/drive/MyDrive/murshid_backend_for_drive.zip'
109
+ ```
110
+
111
+ **ما يتحقق منه:**
112
+ | الملف | النوع | الحالة |
113
+ |-------|-------|--------|
114
+ | `murshid_logreg_pipeline_manual_oof_pcatuned.joblib` | إلزامي | ✅ / ❌ |
115
+ | `murshid_logreg_thresholds_manual_oof_pcatuned.npy` | إلزامي | ✅ / ❌ |
116
+ | `murshid_label_columns.json` | إلزامي | ✅ / ❌ |
117
+ | `murshid_query_template_structure_clean_shared.xlsx` | اختياري | ✅ / ⚠️ |
118
+ | `murshid_backend/` أو `.zip` | إلزامي | ✅ / ❌ |
119
+
120
+ ---
121
+
122
+ ### الخلية 3: تجهيز الباكند في /content
123
+
124
+ **الهدف:** نقل ملفات الباكند من Drive إلى `/content` لتسريع القراءة (Drive أبطأ في I/O).
125
+
126
+ **المنطق الذكي:**
127
+ ```
128
+ هل murshid_backend/ موجود على Drive؟
129
+ ↓ نعم → انسخ مباشرةً إلى /content
130
+ ↓ لا
131
+ هل murshid_backend_for_drive.zip موجود؟
132
+ ↓ نعم → استخرجه إلى Drive أولاً ثم انسخ
133
+ ↓ لا
134
+ → ❌ خطأ: "ارفعي ZIP إلى Google Drive"
135
+ ```
136
+
137
+ **الخطوات المنفَّذة:**
138
+ 1. **استخراج ZIP** (إذا لزم) إلى `MyDrive/`
139
+ 2. **نسخ** `murshid_backend/` إلى `/content/murshid_backend/` (بدون pycache وملفات مؤقتة)
140
+ 3. **إضافة** `/content/murshid_backend` إلى `sys.path`
141
+ 4. **تغيير** working directory إلى `/content/murshid_backend`
142
+
143
+ **لماذا النسخ إلى /content؟**
144
+ - Drive يعتمد على FUSE mount = بطيء للقراءة المتكررة
145
+ - `/content` على SSD محلي للـ VM = أسرع بـ 5-10x
146
+
147
+ ---
148
+
149
+ ### الخلية 4: تثبيت المتطلبات
150
+
151
+ **الهدف:** تثبيت جميع المكتبات اللازمة لتشغيل الباكند.
152
+
153
+ **المكتبات المثبَّتة:**
154
+
155
+ | المكتبة | الإصدار | الغرض |
156
+ |---------|---------|--------|
157
+ | `fastapi` | 0.115.0 | إطار API |
158
+ | `uvicorn` | 0.32.0 | خادم ASGI |
159
+ | `pydantic` | 2.9.0 | تحقق من البيانات |
160
+ | `sqlalchemy` | 2.0.0 | ORM |
161
+ | `alembic` | 1.13.0 | هجرة DB |
162
+ | `scikit-learn` | **1.6.1** | نموذج LogReg (يطابق بيئة التدريب) |
163
+ | `bitsandbytes` | ≥0.46.1 | تكميم LLaMA 4-bit |
164
+ | `accelerate` | آخر نسخة | `device_map="auto"` للـ GPU |
165
+ | `openpyxl` | آخر نسخة | قراءة ملف Excel |
166
+ | `lxml` | آخر نسخة | معالجة XML |
167
+ | `pyngrok` | آخر نسخة | (احتياطي — غير مستخدم) |
168
+
169
+ > **ملاحظة مهمة:** `scikit-learn==1.6.1` محدَّد بدقة لأن ملفات joblib دُرِّبت بهذه النسخة — استخدام نسخة مختلفة يُنتج تحذيرات `InconsistentVersionWarning`.
170
+
171
+ ---
172
+
173
+ ### الخلية 5: إعداد ملف .env
174
+
175
+ **الهدف:** إنشاء ملف الإعدادات لتشغيل FULL mode.
176
+
177
+ **محتوى الملف المُولَّد:**
178
+ ```env
179
+ MURSHID_DB_URL=sqlite:////content/murshid.db
180
+ MURSHID_MODELS_DIR=/content/drive/MyDrive/Needed
181
+ HF_TOKEN=****
182
+ MURSHID_SKIP_LLM=false ← مفتاح FULL mode
183
+ SECRET_KEY=murshid_colab_2026
184
+ LLAMA_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
185
+ EMBED_MODEL_ID=ehsanaghaei/SecureBERT_Plus
186
+ LOGREG_JOBLIB=murshid_logreg_pipeline_manual_oof_pcatuned.joblib
187
+ LOGREG_THRESHOLDS_NPY=murshid_logreg_thresholds_manual_oof_pcatuned.npy
188
+ LABEL_COLUMNS_JSON=murshid_label_columns.json
189
+ ```
190
+
191
+ **الفرق بين FULL و LOCAL mode:**
192
+ | المتغير | FULL mode | LOCAL mode |
193
+ |---------|-----------|------------|
194
+ | `MURSHID_SKIP_LLM` | `false` | `true` |
195
+ | LLaMA يُحمَّل؟ | ✅ نعم | ❌ لا |
196
+ | جودة التلخيص | عالية | الوصف الخام فقط |
197
+ | T1484 confidence (مثال) | **94.76%** | 89.29% |
198
+
199
+ ---
200
+
201
+ ### الخلية 6: تهجير قاعدة البيانات (Alembic)
202
+
203
+ **الهدف:** إنشاء جداول قاعدة البيانات SQLite.
204
+
205
+ ```bash
206
+ python -m alembic upgrade head
207
+ ```
208
+
209
+ **الجداول المُنشأة (من migration 0001):**
210
+
211
+ | الجدول | الغرض | مصدره في التقرير |
212
+ |--------|--------|-----------------|
213
+ | `users` | مستخدمو النظام (admin/analyst) | ER Diagram §3.2.6 |
214
+ | `mapping_jobs` | وظائف معالجة ملفات القواعد | ER Diagram §3.2.6 |
215
+ | `rules` | قواعد Wazuh المُحلَّلة | ER Diagram §3.2.6 |
216
+ | `techniques` | تقنيات MITRE ATT&CK | ER Diagram §3.2.6 |
217
+ | `rule_technique_mappings` | ربط القواعد بالتقنيات + confidence | ER Diagram §3.2.6 |
218
+ | `query_templates` | قوالب WQL للتحقيق | ER Diagram §3.2.6 |
219
+
220
+ > **ملاحظة:** قاعدة البيانات في `/content/murshid.db` — تُنشأ من جديد في كل جلسة Colab.
221
+
222
+ ---
223
+
224
+ ### الخلية 7: استيراد قوالب WQL من Excel
225
+
226
+ **الهدف:** تحميل 60 قالب WQL من ملف Excel إلى قاعدة البيانات.
227
+
228
+ **البيانات المستوردة:**
229
+
230
+ | الإحصائية | القيمة |
231
+ |-----------|--------|
232
+ | إجمالي التقنيات | 20 تقنية |
233
+ | إجمالي القوالب | 60 قالب (3 لكل تقنية) |
234
+ | التقنيات المشمولة | T1047, T1055, T1059.001, T1070.004, T1078, T1083, T1095, T1098, T1105, T1110, T1112, T1114, T1176, T1190, T1484, T1498, T1499, T1529, T1531, T1562.001 |
235
+
236
+ **مثال على قالب WQL (T1484):**
237
+ ```
238
+ Template 1: Host pivot
239
+ agent.name:${HOST} AND win.system.eventID:(4728 OR 4729 ...) AND @timestamp:[now-24h TO now]
240
+
241
+ Template 2: Actor pivot
242
+ win.eventdata.SubjectUserName:${USER} AND win.system.eventID:(...) AND @timestamp:[now-24h TO now]
243
+
244
+ Template 3: High-impact target change
245
+ win.system.eventID:(...) AND win.eventdata.TargetUserName:("Domain Admins" OR ...) AND @timestamp:[now-24h TO now]
246
+ ```
247
+
248
+ **منع التكرار:**
249
+ - يتحقق من وجود (`technique_id` + `purpose`) قبل الإضافة
250
+ - `replace=False` بشكل افتراضي (لا يُعيد الكتابة)
251
+
252
+ ---
253
+
254
+ ### الخلية 8: تشغيل FastAPI + Cloudflare Tunnel
255
+
256
+ **الهدف:** الخلية الرئيسية — تُشغّل الباكند وتُنشئ رابطاً عاماً.
257
+
258
+ #### 8.1 التحقق من bitsandbytes
259
+ ```python
260
+ import bitsandbytes as bnb
261
+ print(f'✅ bitsandbytes {bnb.__version__}')
262
+ ```
263
+ > إذا فشل: يُوقف التشغيل فوراً مع رسالة واضحة.
264
+
265
+ #### 8.2 تشغيل uvicorn
266
+ ```bash
267
+ python -m uvicorn app.main:app --host 0.0.0.0 --port 8000 --log-level info
268
+ ```
269
+ - `--host 0.0.0.0`: يستمع على كل الواجهات (مطلوب للـ tunnel)
270
+ - اللوج يُحفظ في `/content/murshid_server.log`
271
+
272
+ #### 8.3 تحميل النماذج (lifespan)
273
+ عند بدء الخادم تُنفَّذ `load_models()` بهذا الترتيب:
274
+
275
+ ```
276
+ 1. hf_login(token) ← 1-2 ثانية
277
+ 2. LLaMA 3 8B-Instruct (4-bit NF4) ← 5-8 دقائق (4.5 GB)
278
+ - BitsAndBytesConfig: load_in_4bit=True
279
+ - bnb_4bit_quant_type="nf4"
280
+ - bnb_4bit_compute_dtype=float16
281
+ 3. SecureBERT+ (ehsanaghaei) ← 1-2 دقيقة
282
+ - AutoModel + AutoTokenizer
283
+ - mean pooling 768-dim
284
+ 4. LogisticRegressionModel ← < 1 ثانية
285
+ - joblib.load (Pipeline: PCA + OneVsRestClassifier)
286
+ - np.load thresholds
287
+ ```
288
+
289
+ #### 8.4 الانتظار الذكي
290
+ ```python
291
+ for i in range(180): # 15 دقيقة كحد أقصى
292
+ time.sleep(5)
293
+ # فحص /health كل 5 ثوانٍ
294
+ # عرض اللوج كل 30 ثانية
295
+ # كشف مبكر للأخطاء (ERROR, ImportError)
296
+ ```
297
+
298
+ #### 8.5 Cloudflare Tunnel
299
+ ```bash
300
+ wget cloudflared-linux-amd64 → /usr/local/bin/cloudflared
301
+ cloudflared tunnel --url http://localhost:8000
302
+ ```
303
+ - لا يحتاج حساباً أو توكناً
304
+ - يُنتج رابطاً مثل: `https://xxxx.trycloudflare.com`
305
+ - صالح طوال جلسة Colab
306
+
307
+ ---
308
+
309
+ ### الخلية 9: ربط الفرونت تلقائياً
310
+
311
+ **الهدف:** تحديث `index.html` بالرابط الجديد من Cloudflare تلقائياً.
312
+
313
+ ```python
314
+ # استخراج الرابط
315
+ match = re.search(r'https://[a-z0-9\-]+\.trycloudflare\.com', content)
316
+ public_url = match.group(0)
317
+
318
+ # تحديث index.html على Drive
319
+ html = re.sub(
320
+ r"const BASE = '[^']*';",
321
+ f"const BASE = '{public_url}';",
322
+ html
323
+ )
324
+ ```
325
+
326
+ **النتيجة:**
327
+ ```javascript
328
+ // قبل
329
+ const BASE = 'http://127.0.0.1:8000';
330
+
331
+ // بعد
332
+ const BASE = 'https://xxxx.trycloudflare.com';
333
+ ```
334
+
335
+ ---
336
+
337
+
338
+
339
+ ### الخلية 10: اختبار الـ API
340
+
341
+ **الهدف:** التحقق من عمل كل مكون.
342
+
343
+ #### 10.1 Health Check
344
+ ```python
345
+ urllib.request.urlopen('http://localhost:8000/health')
346
+ ```
347
+
348
+ **المخرج المتوقع (FULL mode):**
349
+ ```json
350
+ {
351
+ "pipeline_mode": "full",
352
+ "pipeline_description": "LLaMA + SecureBERT+ + LogReg",
353
+ "components": {
354
+ "llama_loaded": true,
355
+ "embedder_loaded": true,
356
+ "logreg_loaded": true,
357
+ "cuda_available": true
358
+ },
359
+ "all_model_files_present": true
360
+ }
361
+ ```
362
+
363
+ #### 10.2 تحليل قاعدة اختبار
364
+ ```python
365
+ rule_xml = '<rule id="18205" level="5">...'
366
+ POST http://localhost:8000/rules/analyze
367
+ ```
368
+
369
+ **الـ pipeline خطوة بخطوة:**
370
+
371
+ ```
372
+ XML Input (rule 18205)
373
+
374
+ sanitize_rule_from_string()
375
+ - حذف: mitre, if_sid, group, if_group
376
+
377
+ summarize_one_rule() [LLaMA]
378
+ - Input: sanitized XML
379
+ - Output: "Detects the deletion of a security-enabled global group on a Windows system."
380
+
381
+ build_text_for_embedding()
382
+ - text = summary + ". " + description
383
+ - "Detects the deletion of a security-enabled global group on a Windows system. Windows: Security Enabled Global Group Deleted."
384
+
385
+ SecureBERTEmbedder.embed_text()
386
+ - Chunks (256 tokens max)
387
+ - mean pooling per chunk
388
+ - average chunks → 768-dim vector
389
+ - L2 normalize
390
+
391
+ LogisticRegressionModel.predict()
392
+ - predict_proba(X_user)
393
+ - pred = (proba >= logreg_thr)
394
+ - conf = proba * 100
395
+ - gap = proba - logreg_thr
396
+
397
+ save_technique_mappings() [DB]
398
+ - حفظ 20 تقنية مع confidence
399
+
400
+ JSON Response
401
+ ```
402
+
403
+ **المخرج للقاعدة 18205:**
404
+ ```
405
+ Technique Pred Conf% Proba Thr Gap
406
+ T1484 ✅ 94.76 0.9476 0.74 +0.2076 ← Primary
407
+ T1531 ❌ 27.92 0.2792 ... ...
408
+ T1070.004 ❌ 21.03 0.2103 ... ...
409
+ T1098 ❌ 10.65 0.1065 ... ...
410
+ T1112 ❌ 9.27 0.0927 ... ...
411
+ ```
412
+
413
+ ---
414
+ الخطوات القادمة للوضع المحلي (LOCAL Mode) غير ضرورية
415
+
416
+ ### الخلية 11: تصدير النتائج (اختياري)
417
+
418
+ **الهدف:** تصدير نتائج القواعد المُحلَّلة إلى JSON لاستخدامها لاحقاً على الجهاز المحلي.
419
+
420
+ ```python
421
+ export_path = f'{NEEDED_PATH}/murshid_full_results.json'
422
+ json.dump(export_results, f, ensure_ascii=False, indent=2)
423
+ ```
424
+
425
+ **الاستخدام:** يُمكِّن استيراد نتائج FULL mode في الباكند المحلي بدون GPU.
426
+
427
+ ---
428
+
429
+ ### الخلية 12: إيقاف الخادم
430
+
431
+ ```python
432
+ ngrok.kill() # إغلاق نفق ngrok (كما في كود الخلية 12)
433
+ server_proc.terminate() # إيقاف uvicorn
434
+ ```
435
+
436
+ ---
437
+
438
+ ## 4. مقارنة أوضاع التشغيل
439
+
440
+ | | FULL mode (Colab) | LOCAL mode (الجهاز) | LITE mode |
441
+ |--|-------------------|---------------------|-----------|
442
+ | **LLaMA** | ✅ | ❌ | ❌ |
443
+ | **SecureBERT+** | ✅ | ✅ | ❌ |
444
+ | **LogReg** | ✅ | ✅ | ✅ |
445
+ | **GPU** | Tesla T4 | لا يلزم | لا يلزم |
446
+ | **Embedding** | نص مُثرى بـ LLaMA | وصف القاعدة فقط | عشوائي |
447
+ | **T1484 confidence** | **94.76%** | 89.29% | غير موثوق |
448
+ | **القرار النهائي** | T1484 ✅ | T1484 ✅ | غير موثوق |
449
+ | **وقت التحليل/قاعدة** | ~30-60 ثانية | ~2-5 ثوانٍ | < 1 ثانية |
450
+ | **الاستخدام** | إنتاج / عرض | تطوير محلي | اختبار فقط |
451
+
452
+ ---
453
+
454
+ ## 5. معمارية النظام الكاملة على Colab
455
+
456
+ ```
457
+ ┌─────────────────────────────────────────────────────┐
458
+ │ Google Colab VM │
459
+ │ │
460
+ │ ┌─────────────────────────────────┐ │
461
+ │ │ /content/murshid_backend/ │ │
462
+ │ │ │ │
463
+ │ │ FastAPI (uvicorn :8000) │ │
464
+ │ │ ├── /health │ │
465
+ │ │ ├── POST /rules/analyze │ │
466
+ │ │ ├── GET /results/{rule_id} │ │
467
+ │ │ ├── GET /queries/{tech_id} │ │
468
+ │ │ └── GET /api/db/... │ │
469
+ │ └───────────────┬─────────────────┘ │
470
+ │ │ │
471
+ │ ┌───────────────┴───────────┐ │
472
+ │ │ ML Models (GPU VRAM) │ │
473
+ │ │ ├── LLaMA 3 8B (4-bit) │ │
474
+ │ │ ├── SecureBERT+ │ │
475
+ │ │ └── LogReg Pipeline │ │
476
+ │ └───────────────────────────┘ │
477
+ │ │ │
478
+ │ ┌───────────────┴───────────┐ │
479
+ │ │ /content/murshid.db │ │
480
+ │ │ (SQLite — 6 جداول) │ │
481
+ │ └───────────────────────────┘ │
482
+ │ │
483
+ │ ┌───────────────────────────┐ │
484
+ │ │ cloudflared tunnel │ │
485
+ │ │ localhost:8000 → HTTPS │ │
486
+ │ └───────────────┬───────────┘ │
487
+ └──────────────────┼──────────────────────────────────┘
488
+
489
+
490
+ https://xxxx.trycloudflare.com
491
+
492
+
493
+ ┌─────────────────────────┐
494
+ │ المتصفح / الفرونت │
495
+ │ index.html (React) │
496
+ └─────────────────────────┘
497
+ ```
498
+
499
+ ---
500
+
501
+ ## 6. الأخطاء الشائعة وحلولها
502
+
503
+ | الخطأ | السبب | الحل |
504
+ |-------|-------|------|
505
+ | `ImportError: bitsandbytes>=0.46.1` | نسخة قديمة | شغّلي `!pip install -U bitsandbytes>=0.46.1` |
506
+ | `FileNotFoundError: murshid_backend` | ZIP غير مرفوع | ارفعي `murshid_backend_for_drive.zip` إلى Drive |
507
+ | `ERR_NGROK_4018` | ngrok يحتاج حساباً | استخدمي Cloudflare Tunnel (خلية 9) |
508
+ | `Cannot connect to backend` | CORS مغلق | `allow_origins=["*"]` في `main.py` |
509
+ | Server يستغرق > 15 دقيقة | تنزيل LLaMA بطيء | في الجلسة الثانية التنزيل من Cache |
510
+ | `InconsistentVersionWarning` | sklearn إصدار مختلف | تأكدي من `scikit-learn==1.6.1` |
511
+
512
+ ---
513
+
514
+ ## 7. الـ Endpoints المتاحة بعد التشغيل
515
+
516
+ | Method | Endpoint | الوصف |
517
+ |--------|----------|-------|
518
+ | `GET` | `/health` | حالة الخادم والنماذج |
519
+ | `GET` | `/api/stats` | إحصائيات Dashboard |
520
+ | `GET` | `/api/db/summary` | عدد الصفوف في الجداول |
521
+ | `GET` | `/api/db/rules` | جميع القواعد في DB |
522
+ | `GET` | `/api/db/mappings` | جميع المطابقات |
523
+ | `GET` | `/api/db/techniques` | تقنيات MITRE المخزّنة |
524
+ | `GET` | `/api/db/templates` | قوالب WQL |
525
+ | `POST` | `/api/db/import-excel` | استيراد Excel |
526
+ | `POST` | `/rules/analyze` | تحليل قاعدة XML (FULL pipeline) |
527
+ | `GET` | `/results/{rule_id}` | نتائج تقنية قاعدة محددة |
528
+ | `GET` | `/queries/{technique_id}` | استعلامات WQL لتقنية |
529
+ | `POST` | `/admin/templates` | إضافة قالب WQL |
530
+ | `PATCH` | `/admin/templates/{id}` | تعديل قالب |
531
+ | `GET` | `/docs` | Swagger UI التفاعلي |
532
+
533
+ ---
534
+
535
+ ## 8. ملاحظات للعرض التقديمي
536
+
537
+ 1. **شغّلي الخلايا قبل العرض بـ 15 دقيقة** (وقت تحميل LLaMA)
538
+ 2. **انسخي رابط Cloudflare** وتحققي منه في المتصفح
539
+ 3. **الفرونت يُحدَّث تلقائياً** بالرابط الجديد في خلية 9
540
+ 4. **كل جلسة Colab جديدة = رابط Cloudflare جديد** — كرّري الخطوات
541
+ 5. **DB فارغة في كل جلسة** — حلّلي القواعد عبر Admin Panel أو خلية اختبار
542
+
543
+ ---
544
+
545
+ *تاريخ الإنشاء: 8 أبريل 2026 | مشروع مُرشِد — CCIS, PNU*
Needed/murshid_label_columns.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "T1047",
3
+ "T1055",
4
+ "T1059.001",
5
+ "T1070.004",
6
+ "T1078",
7
+ "T1083",
8
+ "T1095",
9
+ "T1098",
10
+ "T1105",
11
+ "T1110",
12
+ "T1112",
13
+ "T1114",
14
+ "T1176",
15
+ "T1190",
16
+ "T1484",
17
+ "T1498",
18
+ "T1499",
19
+ "T1529",
20
+ "T1531",
21
+ "T1562.001"
22
+ ]
Needed/murshid_logreg_pipeline_manual_oof_pcatuned.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be629d9f6780456a9435f8be2655e3fa0a848fbe2a4f166813913331b4c43ba4
3
+ size 206584
Needed/murshid_logreg_thresholds_manual_oof_pcatuned.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:005a664d7faa22104e4a9e58ace6976628d1d00c1cabcaead1833ff792366c79
3
+ size 208
Needed/murshid_query_template_structure_clean_shared.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1491c4dee86bbf29691b3c4254a344e2cb87eabbb77f04f49da09856cb1d145
3
+ size 20938
README.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Murshid - مُرشِد
3
+ emoji: 🛡️
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ pinned: false
8
+ license: mit
9
+ ---
10
+
11
+ # 🛡️ Murshid | مُرشِد
12
+
13
+ **From Alerts to Guidance: MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts**
14
+
15
+ REST API + Dashboard for analyzing Wazuh IDS rules and mapping them to MITRE ATT&CK techniques.
16
+
17
+ ## Features
18
+
19
+ - **Rule Analysis**: Parse Wazuh XML rules and classify MITRE ATT&CK techniques
20
+ - **WQL Queries**: Get pre-built Wazuh Query Language templates per technique
21
+ - **Dashboard**: Interactive web UI with statistics and DB viewer
22
+ - **ML Pipeline**: Logistic Regression with SecureBERT+ embeddings
23
+
24
+ ## Tech Stack
25
+
26
+ - **FastAPI** — REST API
27
+ - **SQLite** — Database
28
+ - **Logistic Regression** — Primary classification model
29
+ - **SecureBERT+** — Text embeddings (optional, requires torch)
30
+
31
+ ## API Endpoints
32
+
33
+ | Method | URL | Description |
34
+ |--------|-----|-------------|
35
+ | `GET` | `/health` | System health check |
36
+ | `POST` | `/rules/analyze` | Analyze a Wazuh XML rule |
37
+ | `GET` | `/results/{rule_id}` | Get stored results for a rule |
38
+ | `GET` | `/queries/{technique_id}` | Get WQL templates for a technique |
39
+ | `GET` | `/docs` | Interactive Swagger documentation |
murshid_backend/README.md ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Murshid Backend
2
+
3
+ REST API لمشروع "مرشد — من التنبيهات إلى التوجيه: ربط تقنيات MITRE ATT&CK لمحللي SOC"
4
+
5
+ ## التقنيات
6
+
7
+ - **FastAPI** — REST API
8
+ - **MySQL** + **SQLAlchemy** — قاعدة البيانات
9
+ - **Alembic** — هجرة الجداول
10
+ - **Logistic Regression** — النموذج الأساسي في هذه المرحلة
11
+ - **SecureBERT+** — تضمينات نصية
12
+ - **Llama 3 8B** — تلخيص قواعد Wazuh
13
+
14
+ > المنطق مستخرج من `MurshidUIPipeline.ipynb` دون تعديله.
15
+
16
+ ---
17
+
18
+ ## هيكل المشروع
19
+
20
+ ```
21
+ murshid_backend/
22
+ app/
23
+ main.py ← نقطة تشغيل FastAPI
24
+ config.py
25
+ api/routes/
26
+ health.py ← GET /health
27
+ rules.py ← POST /rules/analyze + GET /results/{rule_id}
28
+ queries.py ← GET /queries/{technique_id} + Admin endpoints
29
+ services/
30
+ ml_service.py
31
+ rule_service.py
32
+ result_service.py
33
+ template_service.py
34
+ ml/
35
+ sanitizer.py ← تنظيف XML
36
+ summarizer.py ← تلخيص Llama
37
+ embedder.py ← SecureBERT+
38
+ logistic_model.py ← Logistic Regression inference
39
+ pipeline.py ← analyze_rule() الشامل
40
+ models/ ← SQLAlchemy ORM (6 جداول من ER Diagram)
41
+ schemas/ ← Pydantic schemas
42
+ repositories/ ← DB access layer
43
+ db/
44
+ base.py
45
+ session.py
46
+ alembic/
47
+ versions/0001_initial_schema.py
48
+ requirements.txt
49
+ .env.example
50
+ ```
51
+
52
+ ---
53
+
54
+ ## جداول قاعدة البيانات (مستخرجة من ER Diagram §3.2.6)
55
+
56
+ | جدول | المصدر في التقرير |
57
+ |------|-------------------|
58
+ | `users` | User entity — username, email, password_hash, role |
59
+ | `mapping_jobs` | MappingJob entity — job_id, file_name, status, progress, timestamp |
60
+ | `rules` | Rule entity — rule_id, embedding_vector, job_id |
61
+ | `techniques` | Technique entity — technique_id, technique_name, tactic |
62
+ | `rule_technique_mappings` | RuleTechniqueMapping — rule_id, technique_id, confidence_score |
63
+ | `query_templates` | QueryTemplate — purpose, wql_query, note, is_active |
64
+
65
+ ---
66
+
67
+ ## الإعداد والتشغيل
68
+
69
+ ### 1) متطلبات
70
+
71
+ - Python 3.10+
72
+ - MySQL 8+
73
+ - GPU موصى به لـ Llama 3 8B
74
+
75
+ ### 2) تثبيت
76
+
77
+ ```powershell
78
+ cd d:\GP\murshid_backend
79
+ python -m venv .venv
80
+ .\.venv\Scripts\activate
81
+ pip install -r requirements.txt
82
+ ```
83
+
84
+ ### 3) إعداد قاعدة البيانات
85
+
86
+ إنشاء قاعدة البيانات في MySQL:
87
+ ```sql
88
+ CREATE DATABASE murshid_db CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
89
+ ```
90
+
91
+ ### 4) إعداد `.env`
92
+
93
+ ```powershell
94
+ copy .env.example .env
95
+ ```
96
+
97
+ عدّلي القيم:
98
+ ```env
99
+ MURSHID_DB_URL=mysql+pymysql://root:YOUR_PASSWORD@localhost:3306/murshid_db
100
+ MURSHID_MODELS_DIR=d:/GP/Needed
101
+ HF_TOKEN=hf_xxxx
102
+ MURSHID_SKIP_LLM=false
103
+ ```
104
+
105
+ ### 5) تأكد من وجود ملفات النماذج في `d:\GP\Needed`
106
+
107
+ ```
108
+ murshid_logreg_pipeline_manual_oof_pcatuned.joblib
109
+ murshid_logreg_thresholds_manual_oof_pcatuned.npy
110
+ murshid_label_columns.json
111
+ ```
112
+
113
+ ### 6) تشغيل Alembic (هجرة الجداول)
114
+
115
+ ```powershell
116
+ alembic upgrade head
117
+ ```
118
+
119
+ ### 7) تشغيل الـ API
120
+
121
+ ```powershell
122
+ uvicorn app.main:app --reload --host 127.0.0.1 --port 8000
123
+ ```
124
+
125
+ ---
126
+
127
+ ## الـ Endpoints
128
+
129
+ | Method | URL | الوصف |
130
+ |--------|-----|--------|
131
+ | `GET` | `/health` | فحص حالة النظام والنماذج |
132
+ | `POST` | `/rules/analyze` | تحليل قاعدة Wazuh XML وحفظ النتائج |
133
+ | `GET` | `/results/{rule_id}` | استرجاع التقنيات المخزنة لمعرف القاعدة |
134
+ | `GET` | `/queries/{technique_id}` | جلب قوالب WQL لتقنية معينة |
135
+ | `POST` | `/admin/templates` | إضافة قالب WQL جديد (Admin) |
136
+ | `PATCH` | `/admin/templates/{id}` | تعديل أو تعطيل قالب (Admin) |
137
+
138
+ ### مثال — تحليل قاعدة
139
+
140
+ ```bash
141
+ curl -X POST http://127.0.0.1:8000/rules/analyze \
142
+ -H "Content-Type: application/json" \
143
+ -d '{"rule_xml": "<rule id=\"597\" level=\"5\"><description>Registry Key Entry Deleted.</description></rule>"}'
144
+ ```
145
+
146
+ ### التوثيق التفاعلي
147
+
148
+ افتحي: **http://127.0.0.1:8000/docs**
149
+
150
+ ---
151
+
152
+ ## ملاحظات
153
+
154
+ - الملف الأصلي `MurshidUIPipeline.ipynb` **لم يُعدَّل** — المنطق منسوخ إلى طبقة `app/ml/`.
155
+ - النموذج المعتمد في هذه المرحلة: **Logistic Regression** فقط.
156
+ - لتشغيل بدون GPU للاختبار فقط: ضعي `MURSHID_SKIP_LLM=true` في `.env` (لكن `/rules/analyze` ستعيد 503).
murshid_backend/TECHNICAL_REPORT.md ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # تقرير تقني مفصّل — مشروع مُرشِد (Murshid)
2
+ ## From Alerts to Guidance: MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts
3
+
4
+ ---
5
+
6
+ ## 1. نظرة عامة
7
+
8
+ مُرشِد نظام ذكي يحوّل تنبيهات قواعد Wazuh XML إلى تقنيات MITRE ATT&CK مُرتّبة بدرجات ثقة، ويُنتج استعلامات تحقيق WQL جاهزة لمحللي SOC.
9
+
10
+ ```
11
+ قاعدة Wazuh XML
12
+
13
+ Sanitization (حذف if_sid, group, mitre)
14
+
15
+ LLaMA 3 8B (تلخيص بجملة واحدة)
16
+
17
+ SecureBERT+ (768-dim embedding)
18
+
19
+ Logistic Regression + PCA (تصنيف)
20
+
21
+ تقنيات MITRE ATT&CK + Confidence Scores
22
+
23
+ قوالب WQL للتحقيق
24
+ ```
25
+
26
+ ---
27
+
28
+ ## 2. هيكل المشروع الكامل
29
+
30
+ ```
31
+ d:\GP\
32
+ ├── MurshidUIPipeline.ipynb ← الدفتر الأصلي (لا يُعدَّل)
33
+ ├── Needed\ ← ملفات النماذج المدرّبة
34
+ │ ├── murshid_logreg_pipeline_manual_oof_pcatuned.joblib
35
+ │ ├── murshid_logreg_thresholds_manual_oof_pcatuned.npy
36
+ │ ├── murshid_svmlinear_per_label_thresholds.joblib
37
+ │ └── murshid_label_columns.json (20 تقنية)
38
+ ├── murshid_backend\ ← خدمة FastAPI
39
+ │ ├── app\
40
+ │ │ ├── main.py
41
+ │ │ ├── config.py
42
+ │ │ ├── api\routes\
43
+ │ │ │ ├── health.py GET /health
44
+ │ │ │ ├── rules.py POST /rules/analyze | GET /results/{rule_id}
45
+ │ │ │ ├── queries.py GET /queries/{technique_id} | POST,PATCH /admin/templates
46
+ │ │ │ ├── stats.py GET /api/stats
47
+ │ │ │ └── db_viewer.py GET /api/db/{summary|rules|mappings|...}
48
+ │ │ ├── ml\
49
+ │ │ │ ├── sanitizer.py تنظيف XML
50
+ │ │ │ ├── summarizer.py LLaMA inference
51
+ │ │ │ ├── embedder.py SecureBERT+ embeddings
52
+ │ │ │ ├── logistic_model.py LogReg inference (PRIMARY)
53
+ │ │ │ └── pipeline.py تنسيق المراحل (FULL|LOCAL|LITE)
54
+ │ │ ├── models\ SQLAlchemy ORM
55
+ │ │ │ ├── user.py
56
+ │ │ │ ├── mapping_job.py
57
+ │ │ │ ├── rule.py
58
+ │ │ │ ├── technique.py
59
+ │ │ │ ├── rule_technique_mapping.py
60
+ │ │ │ └── query_template.py
61
+ │ │ ├── schemas\ Pydantic schemas
62
+ │ │ ├── services\ Business logic
63
+ │ │ ├── repositories\ DB access
64
+ │ │ └── db\ SQLAlchemy session
65
+ │ ├── alembic\ Migrations
66
+ │ ├── murshid.db SQLite database
67
+ │ ├── .env
68
+ │ └── requirements.txt
69
+ └── murshid_frontend\ واجهة React
70
+ └── index.html
71
+ ```
72
+
73
+ ---
74
+
75
+ ## 3. طبقة الباكند (FastAPI)
76
+
77
+ ### 3.1 الـ Endpoints
78
+
79
+ | Method | URL | الوصف | Actor |
80
+ |--------|-----|--------|-------|
81
+ | `GET` | `/health` | حالة النظام + pipeline mode + ملفات النماذج | All |
82
+ | `GET` | `/api/stats` | إحصائيات Dashboard (KPIs + Technique Frequency) | All |
83
+ | `GET` | `/api/db/summary` | عدد الصفوف في كل جدول | Testing |
84
+ | `GET` | `/api/db/rules` | جميع القواعد المخزّنة | Testing |
85
+ | `GET` | `/api/db/mappings` | جميع مطابقات القواعد-التقنيات | Testing |
86
+ | `GET` | `/api/db/techniques` | جميع تقنيات MITRE المخزّنة | Testing |
87
+ | `GET` | `/api/db/templates` | جميع قوالب WQL | Testing |
88
+ | `POST` | `/rules/analyze` | تحليل قاعدة XML → تخزين النتائج | Admin |
89
+ | `GET` | `/results/{rule_id}` | استرجاع تقنيات قاعدة محددة (Figure 4-11/12) | SOC Analyst |
90
+ | `GET` | `/queries/{technique_id}` | قوالب WQL لتقنية محددة | SOC Analyst |
91
+ | `POST` | `/admin/templates` | إضافة قالب WQL جديد | Admin |
92
+ | `PATCH` | `/admin/templates/{id}` | تعديل/تعطيل قالب | Admin |
93
+
94
+ ### 3.2 معمارية الطبقات
95
+
96
+ ```
97
+ HTTP Request
98
+
99
+
100
+ API Layer (FastAPI routes)
101
+ │ validates input (Pydantic)
102
+
103
+ Service Layer
104
+ │ orchestrates business logic
105
+
106
+ ML Layer Repository Layer
107
+ │ │
108
+ ▼ ▼
109
+ Pipeline SQLAlchemy ORM
110
+ (sanitize→embed→classify) │
111
+ │ ▼
112
+ └──────────→ SQLite DB
113
+ ```
114
+
115
+ ### 3.3 قاعدة البيانات (SQLite + SQLAlchemy)
116
+
117
+ مستخرجة حرفياً من ER Diagram (§3.2.6 من التقرير):
118
+
119
+ | الجدول | الأعمدة الرئيسية | المصدر في التقرير |
120
+ |--------|------------------|-------------------|
121
+ | `users` | user_id, username, email, password_hash, role | User entity |
122
+ | `mapping_jobs` | job_id, user_id, file_name, status, progress, timestamp | MappingJob entity |
123
+ | `rules` | rule_id (PK), job_id, embedding_vector | Rule entity |
124
+ | `techniques` | technique_id (PK), technique_name, tactic | Technique entity |
125
+ | `rule_technique_mappings` | mapping_id, rule_id, technique_id, confidence_score | RuleTechniqueMapping |
126
+ | `query_templates` | template_id, technique_id, purpose, wql_query, note, is_active | QueryTemplate |
127
+
128
+ > Index على `rule_id` في `rule_technique_mappings` (Use Case 6 §3.2.7)
129
+
130
+ ---
131
+
132
+ ## 4. طبقة ML
133
+
134
+ ### 4.1 مراحل الـ Pipeline (من الدفتر)
135
+
136
+ #### المرحلة 1: Sanitization
137
+ ```python
138
+ # ml/sanitizer.py — من cell 10 في الدفتر
139
+ REMOVE_TAGS_ANYWHERE = {"mitre", "if_sid", "group", "if_group"}
140
+ # يُحذف: group tags, if_sid, mitre IDs, compliance tags
141
+ # يبقى: description, id, category, decoded_as, info
142
+ ```
143
+
144
+ #### المرحلة 2: LLM Summarization (LLaMA 3 8B)
145
+ ```python
146
+ # ml/summarizer.py — من cell 11 في الدفتر
147
+ # Input: sanitized XML
148
+ # Prompt: "Write EXACTLY ONE sentence describing the observable event pattern"
149
+ # Output: JSON {"summary": "Detects ..."}
150
+ # Constraints: 7-18 words, يبدأ بـ Detects/Monitors/...
151
+ ```
152
+
153
+ #### المرحلة 3: Paragraph Construction
154
+ ```python
155
+ # ml/embedder.py — من cell 12 في الدفتر
156
+ text = f"{summary}. {description}."
157
+ # مثال: "Detects deletion of global group. Windows: Security Enabled Global Group Deleted."
158
+ ```
159
+
160
+ #### المرحلة 4: SecureBERT+ Embedding
161
+ ```python
162
+ # ml/embedder.py — من cell 15 في الدفتر
163
+ # Model: ehsanaghaei/SecureBERT_Plus
164
+ # MAX_LEN: 512 tokens, chunks
165
+ # Pooling: Mean pooling across tokens → 768-dim vector
166
+ # Normalization: L2
167
+ ```
168
+
169
+ #### المرحلة 5: Logistic Regression Inference
170
+ ```python
171
+ # ml/logistic_model.py — من cell 18-19 في الدفتر
172
+ proba = logreg_model.predict_proba(X_user)
173
+ proba = proba.reshape(-1)
174
+ pred = (proba >= logreg_thr).astype(int)
175
+ conf = proba * 100
176
+ gap = proba - logreg_thr
177
+ # تُرجع جميع الـ 20 تقنية مرتّبة تنازلياً
178
+ ```
179
+
180
+ ### 4.2 أوضاع التشغيل
181
+
182
+ | الوضع | الشرط | الدقة | الاستخدام |
183
+ |-------|--------|-------|-----------|
184
+ | **FULL** | LLaMA + SecureBERT + LogReg | 100% (مطابق للدفتر) | Colab/GPU |
185
+ | **LOCAL** | SecureBERT + LogReg (بدون LLaMA) | ~95% (وصف بدون ملخص) | الجهاز المحلي |
186
+ | **LITE** | LogReg فقط (بدون torch) | منخفضة (عشوائي) | اختبار البنية فقط |
187
+
188
+ ---
189
+
190
+ ## 5. طبقة الفرونت (React + Tailwind + Chart.js)
191
+
192
+ ### 5.1 الصفحات (CDN-based React, بدون Build Step)
193
+
194
+ | الصفحة | ID | المستخدم | الوصف |
195
+ |--------|-----|----------|--------|
196
+ | Login | — | All | تسجيل دخول + اختيار دور |
197
+ | Dashboard | `dashboard` | All | KPIs + MITRE Technique Frequency Chart |
198
+ | Rule Lookup | `rules` | SOC Analyst | بحث بـ Rule ID → Figure 4-11 + Figure 4-12 |
199
+ | نتائج DB | `dbviewer` | All | استعراض قاعدة البيانات للاختبار |
200
+ | Rule Mapping | `admin` | Admin | رفع XML + تحليل + جدول التقدم |
201
+ | WQL Templates | `templates` | Admin | إدارة قوالب الاستعلامات |
202
+ | Settings | `settings` | All | ملف شخصي + Dark Mode + ألوان |
203
+
204
+ ### 5.2 الـ Figures كما في التقرير
205
+
206
+ | Figure | الصفحة | المكوّن |
207
+ |--------|--------|---------|
208
+ | Figure 4-10 | Rule Lookup | Search bar + Rule ID input |
209
+ | Figure 4-11 | Rule Lookup | `TechniqueDistributionChart` — Horizontal bar chart (Top 5, مُلوَّن H/M/L) |
210
+ | Figure 4-12 | Rule Lookup | Investigation Queries table (Primary + Secondary ≥50%) |
211
+ | Figure 4-13 | Admin | Rule Mapping Panel (paste XML + Submit) |
212
+ | Figure 4-14 | Admin | Mapping Progress Table (Job ID, Status, Progress) |
213
+ | Figure 4-9 | Dashboard | KPIs + Technique Frequency Bar Chart |
214
+
215
+ ### 5.3 ربط الفرونت بالباكند
216
+
217
+ ```javascript
218
+ const BASE = 'http://127.0.0.1:8000';
219
+ // CORS مُفعَّل في الباكند لـ http://localhost:5173 و http://127.0.0.1:5173
220
+ // الفرونت يُخدَّم مباشرةً من FastAPI عبر StaticFiles
221
+ ```
222
+
223
+ ---
224
+
225
+ ## 6. مخطط تدفق البيانات الكامل
226
+
227
+ ```
228
+ ┌─────────────────────────────────────────┐
229
+ │ SOC Analyst / Admin │
230
+ │ (murshid_frontend/index.html) │
231
+ └────────────────┬────────────────────────┘
232
+ │ HTTP/JSON
233
+
234
+ ┌─────────────────────────────────────────┐
235
+ │ FastAPI (port 8000) │
236
+ │ │
237
+ │ /health → pipeline status │
238
+ │ POST /rules/analyze: │
239
+ │ 1. sanitizer.py → clean XML │
240
+ │ 2. summarizer.py → LLaMA summary │ ← FULL mode only
241
+ │ 3. embedder.py → 768-dim vector │
242
+ │ 4. logistic_model → proba + scores │
243
+ │ 5. rule_repo → save to DB │
244
+ │ │
245
+ │ GET /results/{id} → from DB │
246
+ │ GET /queries/{id} → WQL templates │
247
+ └────────────────┬────────────────────────┘
248
+ │ SQLAlchemy
249
+
250
+ ┌─────────────────────────────────────────┐
251
+ │ SQLite (murshid.db) │
252
+ │ rules | techniques | mappings │
253
+ │ query_templates | mapping_jobs │
254
+ └─────────────────────────────────────────┘
255
+ ```
256
+
257
+ ---
258
+
259
+ ## 7. التشغيل
260
+
261
+ ### المتطلبات
262
+ - Python 3.12 (عبر uv)
263
+ - ملفات النماذج في `d:\GP\Needed\`
264
+ - اتصال إنترنت (لـ SecureBERT+ من HuggingFace أول مرة)
265
+
266
+ ### تشغيل الخادم
267
+ ```powershell
268
+ cd d:\GP\murshid_backend
269
+ .venv\Scripts\python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000
270
+ ```
271
+
272
+ ### الروابط
273
+ | الرابط | الوصف |
274
+ |--------|--------|
275
+ | http://127.0.0.1:8000/index.html | الواجهة الرئيسية |
276
+ | http://127.0.0.1:8000/docs | Swagger API Documentation |
277
+ | http://127.0.0.1:8000/health | فحص حالة النظام |
278
+ | http://127.0.0.1:8000/api/db/summary | ملخص قاعدة البيانات |
279
+
280
+ ### اختبار سريع
281
+ ```powershell
282
+ # 1. تحليل قاعدة
283
+ $body = '{"rule_xml":"<rule id=\"597\"><description>Registry Key Entry Deleted.</description></rule>"}'
284
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/rules/analyze" -Method POST -ContentType "application/json" -Body $body
285
+
286
+ # 2. استرجاع النتائج
287
+ Invoke-RestMethod "http://127.0.0.1:8000/results/597"
288
+
289
+ # 3. إضافة قالب WQL
290
+ $t = '{"technique_id":"T1112","purpose":"Detect registry modification","wql_query":"agent.name:${HOST} AND rule.description:\"registry\"","note":"Replace ${HOST}"}'
291
+ Invoke-RestMethod -Uri "http://127.0.0.1:8000/admin/templates" -Method POST -ContentType "application/json" -Body $t
292
+
293
+ # 4. جلب الاستعلامات
294
+ Invoke-RestMethod "http://127.0.0.1:8000/queries/T1112"
295
+ ```
296
+
297
+ ---
298
+
299
+ ## 8. الفرق بين FULL mode (Colab) و LOCAL mode (الجهاز)
300
+
301
+ | | Colab (FULL) | الجهاز المحلي (LOCAL) |
302
+ |--|-------------|----------------------|
303
+ | Input text | `"Detects deletion of a security-enabled global group. Windows: Security Enabled Global Group Deleted."` | `"Windows: Security Enabled Global Group Deleted"` |
304
+ | T1484 proba | **0.9476 (94.76%)** | **0.8929 (89.29%)** |
305
+ | سبب الفرق | LLaMA يُثري النص بسياق دلالي | الوصف فقط بدون إثراء |
306
+ | القرار الصحيح | T1484 ✅ | T1484 ✅ |
307
+
308
+ **الاستنتاج:** القرار النهائي صحيح في كلا الوضعين — الاختلاف في درجة الثقة فقط.
309
+
310
+ ---
311
+
312
+ ## 9. حالات الاستخدام المُنفَّذة (من التقرير)
313
+
314
+ | Use Case | الوصف | مُنفَّذ |
315
+ |----------|--------|---------|
316
+ | UC1 | View techniques and scores for a rule | ✅ `GET /results/{rule_id}` |
317
+ | UC2 | View WQL investigation queries | ✅ `GET /queries/{technique_id}` |
318
+ | UC3 | Copy and fill investigation query | ✅ زر Copy في الفرونت |
319
+ | UC4 | Upload Wazuh rule(s) | ✅ Admin Panel |
320
+ | UC5 | Process rule via ML pipeline | ✅ `POST /rules/analyze` |
321
+ | UC6 | Store mapped techniques in DB | ✅ تلقائي بعد analyze |
322
+ | UC7 | Manage WQL templates repository | ✅ `POST/PATCH /admin/templates` |
murshid_backend/alembic.ini ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [alembic]
2
+ script_location = alembic
3
+ prepend_sys_path = .
4
+ sqlalchemy.url = sqlite:///murshid.db
5
+
6
+ [loggers]
7
+ keys = root,sqlalchemy,alembic
8
+
9
+ [handlers]
10
+ keys = console
11
+
12
+ [formatters]
13
+ keys = generic
14
+
15
+ [logger_root]
16
+ level = WARN
17
+ handlers = console
18
+ qualname =
19
+
20
+ [logger_sqlalchemy]
21
+ level = WARN
22
+ handlers =
23
+ qualname = sqlalchemy.engine
24
+
25
+ [logger_alembic]
26
+ level = INFO
27
+ handlers =
28
+ qualname = alembic
29
+
30
+ [handler_console]
31
+ class = StreamHandler
32
+ args = (sys.stderr,)
33
+ level = NOTSET
34
+ formatter = generic
35
+
36
+ [formatter_generic]
37
+ format = %(levelname)-5.5s [%(name)s] %(message)s
38
+ datefmt = %H:%M:%S
murshid_backend/alembic/env.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Alembic environment: wires app settings and ORM metadata into migrations."""

import sys
from logging.config import fileConfig
from pathlib import Path

from sqlalchemy import engine_from_config, pool

from alembic import context

# Ensure the project root is importable so `app.*` modules resolve.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from app.config import settings
from app.db.base import Base
import app.models  # noqa: F401 — registers all models with Base.metadata

config = context.config
# The DB URL comes from app settings, overriding whatever alembic.ini holds.
config.set_main_option("sqlalchemy.url", settings.murshid_db_url)

if config.config_file_name is not None:
    fileConfig(config.config_file_name)

target_metadata = Base.metadata


def run_migrations_offline() -> None:
    """Emit migration SQL without a live DB connection ("offline" mode)."""
    context.configure(
        url=config.get_main_option("sqlalchemy.url"),
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )
    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online() -> None:
    """Run migrations against a real engine built from the ini section."""
    engine = engine_from_config(
        config.get_section(config.config_ini_section, {}),
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )
    with engine.connect() as conn:
        context.configure(connection=conn, target_metadata=target_metadata)
        with context.begin_transaction():
            context.run_migrations()


if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
murshid_backend/alembic/script.py.mako ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """${message}
2
+
3
+ Revision ID: ${up_revision}
4
+ Revises: ${down_revision | comma,n}
5
+ Create Date: ${create_date}
6
+
7
+ """
8
+ from typing import Sequence, Union
9
+
10
+ from alembic import op
11
+ import sqlalchemy as sa
12
+ ${imports if imports else ""}
13
+
14
+ revision: str = ${repr(up_revision)}
15
+ down_revision: Union[str, None] = ${repr(down_revision)}
16
+ branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
17
+ depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
18
+
19
+
20
+ def upgrade() -> None:
21
+ ${upgrades if upgrades else "pass"}
22
+
23
+
24
+ def downgrade() -> None:
25
+ ${downgrades if downgrades else "pass"}
murshid_backend/alembic/versions/0001_initial_schema.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """initial schema — all 6 tables from ER Diagram §3.2.6
2
+
3
+ Revision ID: 0001
4
+ Revises:
5
+ Create Date: 2026-04-08
6
+ """
7
+
8
+ from typing import Sequence, Union
9
+
10
+ import sqlalchemy as sa
11
+ from alembic import op
12
+
13
+ revision: str = "0001"
14
+ down_revision: Union[str, None] = None
15
+ branch_labels: Union[str, Sequence[str], None] = None
16
+ depends_on: Union[str, Sequence[str], None] = None
17
+
18
+
19
def upgrade() -> None:
    """Create all 6 tables from the ER Diagram (§3.2.6).

    Tables are created parents-first so foreign keys always reference an
    existing table: users → mapping_jobs → rules → techniques → mappings →
    query_templates.
    """
    # users — User entity: credentials plus a coarse role string.
    op.create_table(
        "users",
        sa.Column("user_id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("username", sa.String(100), unique=True, nullable=False),
        sa.Column("email", sa.String(255), unique=True, nullable=False),
        sa.Column("password_hash", sa.String(255), nullable=False),
        # New accounts default to the least-privileged "analyst" role.
        sa.Column("role", sa.String(20), nullable=False, server_default="analyst"),
    )

    # mapping_jobs — one row per uploaded rule file / analysis batch.
    op.create_table(
        "mapping_jobs",
        sa.Column("job_id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("user_id", sa.Integer(), sa.ForeignKey("users.user_id"), nullable=False),
        sa.Column("file_name", sa.String(255), nullable=False),
        sa.Column("rules_count", sa.Integer(), server_default="0"),
        sa.Column("status", sa.String(20), nullable=False, server_default="pending"),
        sa.Column("progress", sa.Integer(), server_default="0"),
        sa.Column("timestamp", sa.DateTime(), server_default=sa.func.now()),
    )

    # rules — Wazuh rule ids are stored as strings; job_id is nullable so
    # rules can be analyzed ad hoc, outside a batch job.
    op.create_table(
        "rules",
        sa.Column("rule_id", sa.String(50), primary_key=True),
        sa.Column("job_id", sa.Integer(), sa.ForeignKey("mapping_jobs.job_id"), nullable=True),
        # Serialized embedding (stored as text, not a vector type).
        sa.Column("embedding_vector", sa.Text(), nullable=True),
    )

    # techniques — MITRE ATT&CK technique catalog (e.g. "T1059.001").
    op.create_table(
        "techniques",
        sa.Column("technique_id", sa.String(20), primary_key=True),
        sa.Column("technique_name", sa.String(255), nullable=False),
        sa.Column("tactic", sa.String(100), nullable=True),
    )

    # rule_technique_mappings — many-to-many rule↔technique with a score.
    op.create_table(
        "rule_technique_mappings",
        sa.Column("mapping_id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("rule_id", sa.String(50), sa.ForeignKey("rules.rule_id"), nullable=False),
        sa.Column(
            "technique_id", sa.String(20), sa.ForeignKey("techniques.technique_id"), nullable=False
        ),
        sa.Column("confidence_score", sa.Float(), nullable=False),
    )
    # Index on rule_id — Use Case 6 §3.2.7 (fast lookup of a rule's techniques).
    op.create_index("ix_rule_technique_rule_id", "rule_technique_mappings", ["rule_id"])

    # query_templates — WQL investigation queries per technique.
    op.create_table(
        "query_templates",
        sa.Column("template_id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column(
            "technique_id", sa.String(20), sa.ForeignKey("techniques.technique_id"), nullable=False
        ),
        sa.Column("purpose", sa.String(255), nullable=True),
        sa.Column("wql_query", sa.Text(), nullable=False),
        sa.Column("note", sa.Text(), nullable=True),
        # Soft-delete flag: templates are disabled, not removed.
        sa.Column("is_active", sa.Boolean(), nullable=False, server_default="1"),
    )
77
+
78
+
79
def downgrade() -> None:
    """Drop all 6 tables in reverse dependency order (children before parents).

    The index on rule_technique_mappings is dropped explicitly before its
    table. SQLite defines no custom types, so nothing else needs cleanup.
    """
    # Fix: removed a stray trailing `pass` — a no-op statement left over
    # from the migration template that served no purpose after real drops.
    op.drop_table("query_templates")
    op.drop_index("ix_rule_technique_rule_id", table_name="rule_technique_mappings")
    op.drop_table("rule_technique_mappings")
    op.drop_table("techniques")
    op.drop_table("rules")
    op.drop_table("mapping_jobs")
    op.drop_table("users")
murshid_backend/app/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Murshid backend package."""
murshid_backend/app/api/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """API layer — FastAPI routers."""
murshid_backend/app/api/routes/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Route modules."""
murshid_backend/app/api/routes/db_viewer.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GET /api/db/rules — all rules in DB
3
+ GET /api/db/mappings — all rule-technique mappings
4
+ GET /api/db/techniques — all techniques
5
+ GET /api/db/templates — all query templates
6
+ GET /api/db/summary — counts per table
7
+ POST /api/db/import-excel — import WQL templates from Excel file
8
+ """
9
+
10
+ from fastapi import APIRouter, Depends, HTTPException, Query
11
+ from sqlalchemy import func
12
+ from sqlalchemy.orm import Session
13
+
14
+ from app.db.session import get_db
15
+ from app.models.mapping_job import MappingJob
16
+ from app.models.query_template import QueryTemplate
17
+ from app.models.rule import Rule
18
+ from app.models.rule_technique_mapping import RuleTechniqueMapping
19
+ from app.models.technique import Technique
20
+
21
+ router = APIRouter(prefix="/api/db", tags=["db-viewer"])
22
+
23
+
24
@router.get("/summary")
def db_summary(db: Session = Depends(get_db)):
    """Row counts per table — a quick sanity check of DB contents."""

    def count(column):
        # COUNT(pk) per table via the shared request session.
        return db.query(func.count(column)).scalar()

    return {
        "rules": count(Rule.rule_id),
        "techniques": count(Technique.technique_id),
        "rule_mappings": count(RuleTechniqueMapping.mapping_id),
        "query_templates": count(QueryTemplate.template_id),
        "mapping_jobs": count(MappingJob.job_id),
    }
33
+
34
+
35
@router.get("/rules")
def all_rules(db: Session = Depends(get_db)):
    """Every stored rule with its job id and an embedding-presence flag."""
    listing = []
    for rule in db.query(Rule).order_by(Rule.rule_id).all():
        listing.append(
            {
                "rule_id": rule.rule_id,
                "job_id": rule.job_id,
                # Expose only whether an embedding exists, not the vector itself.
                "has_embedding": rule.embedding_vector is not None,
            }
        )
    return listing
46
+
47
+
48
@router.get("/mappings")
def all_mappings(db: Session = Depends(get_db)):
    """All rule→technique mappings, grouped by rule, strongest score first."""
    ordering = (
        RuleTechniqueMapping.rule_id,
        RuleTechniqueMapping.confidence_score.desc(),
    )
    mappings = db.query(RuleTechniqueMapping).order_by(*ordering).all()
    return [
        {
            "mapping_id": m.mapping_id,
            "rule_id": m.rule_id,
            "technique_id": m.technique_id,
            # Raw score to 4 dp plus a human-friendly percentage.
            "confidence_score": round(m.confidence_score, 4),
            "confidence_pct": round(m.confidence_score * 100, 2),
        }
        for m in mappings
    ]
68
+
69
+
70
@router.get("/techniques")
def all_techniques(db: Session = Depends(get_db)):
    """All MITRE techniques currently stored, ordered by technique id."""

    def serialize(t):
        return {
            "technique_id": t.technique_id,
            "technique_name": t.technique_name,
            "tactic": t.tactic,
        }

    rows = db.query(Technique).order_by(Technique.technique_id).all()
    return [serialize(t) for t in rows]
81
+
82
+
83
@router.get("/templates")
def all_templates(db: Session = Depends(get_db)):
    """All WQL query templates (active and inactive), ordered per technique."""
    query = db.query(QueryTemplate).order_by(
        QueryTemplate.technique_id, QueryTemplate.template_id
    )
    results = []
    for tpl in query.all():
        results.append(
            {
                "template_id": tpl.template_id,
                "technique_id": tpl.technique_id,
                "purpose": tpl.purpose,
                "wql_query": tpl.wql_query,
                "note": tpl.note,
                "is_active": tpl.is_active,
            }
        )
    return results
97
+
98
+
99
@router.post("/import-excel")
def import_excel_templates(
    replace: bool = Query(False, description="Update existing templates if True"),
    db: Session = Depends(get_db),
):
    """
    Import WQL query templates from the Excel file:
        murshid_query_template_structure_clean_shared.xlsx

    The file is read from MURSHID_MODELS_DIR or the GP root folder.
    Pass ?replace=true to overwrite existing templates.

    Raises HTTP 404 when the Excel file is missing (or the importer reports
    an "error" key) and HTTP 500 for any other failure.
    """
    try:
        # Imported lazily so the optional scripts/ package is only needed here.
        from scripts.import_excel_templates import run
        result = run(db, replace=replace)
    except FileNotFoundError as e:
        # Fix: chain with `from e` (PEP 3134) so the original traceback is
        # preserved in server logs instead of being discarded.
        raise HTTPException(status_code=404, detail=str(e)) from e
    except Exception as e:
        # Broad catch is deliberate at this API boundary: surface as HTTP 500.
        raise HTTPException(status_code=500, detail=str(e)) from e

    if "error" in result:
        raise HTTPException(status_code=404, detail=result["error"])

    return result
murshid_backend/app/api/routes/health.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """GET /health — system readiness check with clear pipeline mode info."""
2
+
3
+ from pathlib import Path
4
+
5
+ from fastapi import APIRouter
6
+
7
+ from app.config import settings
8
+ from app.ml.pipeline import _store, is_ready
9
+
10
+ router = APIRouter(tags=["health"])
11
+
12
# Probe torch once at import time so /health can report GPU capability
# without repeating the (slow, possibly failing) import per request.
try:
    import torch
    _CUDA = torch.cuda.is_available()
    _TORCH = True
    _TORCH_ERR = None
# OSError is caught too: torch may be installed yet fail to load its
# native shared libraries (e.g. missing CUDA DLLs).
except (ImportError, OSError) as _e:
    _CUDA = False
    _TORCH = False
    # Keep the failure message so /health can surface it to the caller.
    _TORCH_ERR = str(_e)
21
+
22
+
23
def _check_model_files() -> dict:
    """Map each required model-artifact name to whether its file exists."""
    root = Path(settings.murshid_models_dir).resolve()
    expected = {
        "logreg_joblib": settings.logreg_joblib,
        "logreg_thresholds": settings.logreg_thresholds_npy,
        "label_columns": settings.label_columns_json,
    }
    return {name: (root / rel).is_file() for name, rel in expected.items()}
31
+
32
+
33
@router.get("/health")
def health():
    """Report readiness: pipeline mode, loaded components, and model files."""
    files = _check_model_files()
    classifier_loaded = _store.logreg is not None

    # Effective pipeline mode follows which components made it into memory.
    if _store.llama_model is not None:
        mode, mode_desc = "full", "LLaMA + SecureBERT+ + LogReg"
    elif _store.embedder is not None and classifier_loaded:
        mode, mode_desc = "local", "SecureBERT+ + LogReg (no LLaMA — using description as text)"
    elif classifier_loaded:
        mode, mode_desc = "lite", "LogReg only (no embedder — random vectors, testing only)"
    else:
        mode, mode_desc = "not_ready", "No ML models loaded"

    return {
        "status": "ok",
        "pipeline_ready": is_ready(),
        "pipeline_mode": mode,
        "pipeline_description": mode_desc,
        "analyze_available": classifier_loaded,
        "components": {
            "llama_loaded": _store.llama_model is not None,
            "embedder_loaded": _store.embedder is not None,
            "logreg_loaded": classifier_loaded,
            "torch_installed": _TORCH,
            "cuda_available": _CUDA,
            "torch_error": _TORCH_ERR,
        },
        "model_files": files,
        "all_model_files_present": all(files.values()),
        "models_dir": str(settings.murshid_models_dir.resolve()),
        "skip_llm_env": settings.murshid_skip_llm,
        "next_step": (
            "POST /rules/analyze is ready!"
            if classifier_loaded
            else "Copy .joblib and .npy files to MURSHID_MODELS_DIR and restart."
        ),
    }
murshid_backend/app/api/routes/queries.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GET /queries/{technique_id} — SOC Analyst: fetch WQL templates.
3
+ POST /admin/templates — Admin: add new template.
4
+ PATCH /admin/templates/{template_id} — Admin: update / disable template.
5
+
6
+ Based on:
7
+ Use Case 2 (View Investigation WQL Queries) — §3.2.7
8
+ Use Case 7 (Manage static query templates) — §3.2.7
9
+ """
10
+
11
+ from fastapi import APIRouter, Depends, HTTPException
12
+ from sqlalchemy.orm import Session
13
+
14
+ from app.db.session import get_db
15
+ from app.schemas.query import QueryTemplateIn, QueryTemplateOut, QueryTemplateUpdate
16
+ from app.services.template_service import TemplateService
17
+
18
+ router = APIRouter(tags=["queries"])
19
+
20
+
21
def _get_template_service(db: Session = Depends(get_db)) -> TemplateService:
    # FastAPI dependency: builds a request-scoped TemplateService over the DB session.
    return TemplateService(db=db)
23
+
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # GET /queries/{technique_id}
27
+ # ---------------------------------------------------------------------------
28
+
29
+
30
@router.get("/queries/{technique_id}", response_model=list[QueryTemplateOut])
def get_queries(
    technique_id: str,
    svc: TemplateService = Depends(_get_template_service),
):
    """
    Returns all active WQL templates for the given MITRE technique.
    Use Case 2 — §3.2.7
    """
    templates = svc.get_queries_for_technique(technique_id)
    if templates:
        return [QueryTemplateOut(**row) for row in templates]
    # Empty result is surfaced as a 404 so clients can distinguish "no
    # templates" from an empty-but-valid list.
    raise HTTPException(
        status_code=404,
        detail=f"No active query templates found for technique '{technique_id}'.",
    )
46
+
47
+
48
+ # ---------------------------------------------------------------------------
49
+ # Admin endpoints
50
+ # ---------------------------------------------------------------------------
51
+
52
+
53
@router.post("/admin/templates", response_model=QueryTemplateOut, status_code=201)
def add_template(
    body: QueryTemplateIn,
    svc: TemplateService = Depends(_get_template_service),
):
    """Admin: add a new WQL template. Use Case 7 — §3.2.7"""
    created = svc.add_template(
        technique_id=body.technique_id,
        purpose=body.purpose,
        wql_query=body.wql_query,
        note=body.note,
    )
    return QueryTemplateOut(**created)
66
+
67
+
68
@router.patch("/admin/templates/{template_id}", response_model=QueryTemplateOut)
def update_template(
    template_id: int,
    body: QueryTemplateUpdate,
    svc: TemplateService = Depends(_get_template_service),
):
    """Admin: update or disable a WQL template. Use Case 7 — §3.2.7"""
    # Only fields explicitly provided by the client are applied.
    changes = body.model_dump(exclude_none=True)
    updated = svc.update_template(template_id, changes)
    if updated is None:
        raise HTTPException(status_code=404, detail=f"Template {template_id} not found.")
    return QueryTemplateOut(**updated)
murshid_backend/app/api/routes/rules.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ POST /rules/analyze — Admin: analyze a rule, persist results.
3
+ GET /results/{rule_id} — SOC Analyst: retrieve stored mappings.
4
+
5
+ Based on:
6
+ Use Case 4+5+6 (Upload, Process, Store) — §3.2.7
7
+ Use Case 1 (View techniques and scores) — §3.2.7
8
+ """
9
+
10
+ from fastapi import APIRouter, Depends, HTTPException
11
+ from sqlalchemy.orm import Session
12
+
13
+ from app.db.session import get_db
14
+ from app.ml.pipeline import is_ready
15
+ from app.schemas.result import MappingResult, ResultsResponse
16
+ from app.schemas.rule import AnalyzeRequest, AnalyzeResponse, TechniqueResult
17
+ from app.services.ml_service import MLService
18
+ from app.services.result_service import ResultService
19
+ from app.services.rule_service import RuleService
20
+
21
+ router = APIRouter(tags=["rules"])
22
+
23
+
24
def _get_rule_service(db: Session = Depends(get_db)) -> RuleService:
    # FastAPI dependency: request-scoped RuleService wired with a fresh MLService.
    return RuleService(db=db, ml=MLService())


def _get_result_service(db: Session = Depends(get_db)) -> ResultService:
    # FastAPI dependency: request-scoped ResultService over the DB session.
    return ResultService(db=db)
30
+
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # POST /rules/analyze
34
+ # ---------------------------------------------------------------------------
35
+
36
+
37
@router.post("/rules/analyze", response_model=AnalyzeResponse, status_code=201)
def analyze_rule(
    body: AnalyzeRequest,
    svc: RuleService = Depends(_get_rule_service),
):
    """
    Runs the full ML pipeline on the submitted Wazuh rule XML and stores
    the results in the database.
    """
    if not is_ready():
        raise HTTPException(status_code=503, detail="ML pipeline not ready.")

    # Boundary error mapping: bad input → 422, pipeline unavailable → 503,
    # anything unexpected → 500 (no raw traceback leaks to the client).
    try:
        outcome = svc.analyze_and_persist(body.rule_xml)
    except ValueError as exc:
        raise HTTPException(status_code=422, detail=str(exc)) from exc
    except RuntimeError as exc:
        raise HTTPException(status_code=503, detail=str(exc)) from exc
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    all_results = [TechniqueResult(**row) for row in outcome["results"]]

    return AnalyzeResponse(
        rule_id=outcome["rule_id"],
        sanitized_xml=outcome["sanitized_xml"],
        summary=outcome["summary"],
        text_for_embedding=outcome["text_for_embedding"],
        embedding_dim=outcome["embedding_dim"],
        pipeline_mode=outcome.get("pipeline_mode", "full"),
        detected=[r for r in all_results if r.predicted],
        all_results=all_results,
    )
71
+
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # GET /results/{rule_id}
75
+ # ---------------------------------------------------------------------------
76
+
77
+
78
@router.get("/results/{rule_id}", response_model=ResultsResponse)
def get_results(
    rule_id: str,
    svc: ResultService = Depends(_get_result_service),
):
    """
    Returns all stored MITRE ATT&CK techniques for a rule ID, sorted by confidence.
    Use Case 1 — §3.2.7
    - mappings: ALL techniques sorted by confidence desc (for Figure 4-11 Top 5 chart)
    - detected: primary + secondary (≥0.5) only (for Figure 4-12 WQL queries)
    """
    data = svc.get_results_for_rule(rule_id)
    if data is None:
        raise HTTPException(
            status_code=404,
            detail=f"No mapping results found for rule_id '{rule_id}'. "
            "Run POST /rules/analyze first.",
        )
    mappings = [MappingResult(**m) for m in data["mappings"]]
    detected = [MappingResult(**m) for m in data["detected"]]
    return ResultsResponse(rule_id=rule_id, mappings=mappings, detected=detected)
murshid_backend/app/api/routes/stats.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """GET /api/stats — dashboard KPIs."""
2
+
3
+ from fastapi import APIRouter, Depends
4
+ from sqlalchemy import func
5
+ from sqlalchemy.orm import Session
6
+
7
+ from app.db.session import get_db
8
+ from app.models.rule import Rule
9
+ from app.models.rule_technique_mapping import RuleTechniqueMapping
10
+ from app.models.query_template import QueryTemplate
11
+ from app.models.technique import Technique
12
+
13
+ router = APIRouter(prefix="/api", tags=["stats"])
14
+
15
+
16
@router.get("/stats")
def get_stats(db: Session = Depends(get_db)):
    """Dashboard KPIs: overall totals plus the 10 most frequently mapped techniques."""
    rules_total = db.query(func.count(Rule.rule_id)).scalar() or 0
    mappings_total = db.query(func.count(RuleTechniqueMapping.mapping_id)).scalar() or 0
    # Only active templates count toward the query KPI.
    queries_total = (
        db.query(func.count(QueryTemplate.template_id))
        .filter(QueryTemplate.is_active.is_(True))
        .scalar()
        or 0
    )
    techniques_total = db.query(func.count(Technique.technique_id)).scalar() or 0

    top_techniques = (
        db.query(
            RuleTechniqueMapping.technique_id,
            func.count(RuleTechniqueMapping.mapping_id).label("count"),
        )
        .group_by(RuleTechniqueMapping.technique_id)
        .order_by(func.count(RuleTechniqueMapping.mapping_id).desc())
        .limit(10)
        .all()
    )

    return {
        "total_rules_mapped": rules_total,
        "total_techniques": techniques_total,
        "total_mappings": mappings_total,
        "total_queries": queries_total,
        "technique_frequency": [
            {"technique_id": row.technique_id, "count": row.count}
            for row in top_techniques
        ],
    }
murshid_backend/app/config.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from pydantic_settings import BaseSettings, SettingsConfigDict
4
+
5
+ _GP_ROOT = Path(__file__).resolve().parent.parent.parent
6
+
7
+
8
class Settings(BaseSettings):
    """Application configuration, loaded from environment variables / .env
    (pydantic-settings; unknown env vars are ignored)."""

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Database URL and file-system locations.
    murshid_db_url: str = "mysql+pymysql://root:password@localhost:3306/murshid_db"
    murshid_models_dir: Path = _GP_ROOT / "Needed"
    hf_token: str | None = None          # Hugging Face Hub token for gated models
    murshid_skip_llm: bool = False       # true → skip LLaMA load ("local" mode)
    # NOTE(review): placeholder secret — must be overridden via env in production.
    secret_key: str = "change_me"

    # Hugging Face model identifiers.
    llama_model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct"
    embed_model_id: str = "ehsanaghaei/SecureBERT_Plus"

    # Artifact filenames, resolved relative to murshid_models_dir.
    logreg_joblib: str = "murshid_logreg_pipeline_manual_oof_pcatuned.joblib"
    logreg_thresholds_npy: str = "murshid_logreg_thresholds_manual_oof_pcatuned.npy"
    label_columns_json: str = "murshid_label_columns.json"


# Singleton settings instance imported across the app.
settings = Settings()
murshid_backend/app/db/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Database layer."""
murshid_backend/app/db/base.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from sqlalchemy.orm import DeclarativeBase
2
+
3
+
4
class Base(DeclarativeBase):
    """Declarative base shared by all ORM models (SQLAlchemy 2.x style)."""
    pass
murshid_backend/app/db/session.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections.abc import Generator
2
+
3
+ from sqlalchemy import create_engine
4
+ from sqlalchemy.orm import Session, sessionmaker
5
+
6
+ from app.config import settings
7
+
8
+ _is_sqlite = settings.murshid_db_url.startswith("sqlite")
9
+
10
+ engine = create_engine(
11
+ settings.murshid_db_url,
12
+ connect_args={"check_same_thread": False} if _is_sqlite else {},
13
+ pool_pre_ping=not _is_sqlite,
14
+ pool_recycle=3600 if not _is_sqlite else -1,
15
+ )
16
+
17
+ SessionLocal = sessionmaker(bind=engine, autocommit=False, autoflush=False)
18
+
19
+
20
def get_db() -> Generator[Session, None, None]:
    """FastAPI dependency: yield one session per request, always closing it."""
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
murshid_backend/app/main.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Murshid Backend — FastAPI entrypoint.
3
+
4
+ Architecture:
5
+ API Layer → app/api/routes/
6
+ Service Layer→ app/services/
7
+ ML Layer → app/ml/
8
+ Repository → app/repositories/
9
+ Database → app/db/ (SQLAlchemy + Alembic, MySQL)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from contextlib import asynccontextmanager
15
+
16
+ from pathlib import Path
17
+
18
+ from fastapi import FastAPI
19
+ from fastapi.middleware.cors import CORSMiddleware
20
+ from fastapi.staticfiles import StaticFiles
21
+
22
+ from app.api.routes import db_viewer, health, queries, rules, stats
23
+ from app.ml.pipeline import load_models, unload_models
24
+
25
+ _FRONTEND_DIR = Path(__file__).resolve().parent.parent.parent / "murshid_frontend"
26
+
27
+
28
@asynccontextmanager
async def lifespan(app: FastAPI):
    # Load all ML models once at startup; release them on shutdown.
    load_models()
    yield
    unload_models()
33
+
34
+
35
# Application instance; routers below define the public API surface.
app = FastAPI(
    title="Murshid API",
    description=(
        "MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts. "
        "Transforms Wazuh IDS rules into actionable threat intelligence."
    ),
    version="1.0.0",
    lifespan=lifespan,
)

# NOTE(review): wildcard origins combined with allow_credentials=True is
# rejected by browsers under the CORS spec — pin explicit origins before
# exposing this API to credentialed cross-origin clients.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.include_router(health.router)
app.include_router(stats.router)
app.include_router(db_viewer.router)
app.include_router(rules.router)
app.include_router(queries.router)

# Serve the static frontend (if present) at the root path; mounted after the
# routers so API routes take precedence over static files.
if _FRONTEND_DIR.is_dir():
    app.mount("/", StaticFiles(directory=str(_FRONTEND_DIR), html=True), name="frontend")
murshid_backend/app/ml/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """ML layer — logic extracted from MurshidUIPipeline.ipynb without modifying the original."""
murshid_backend/app/ml/embedder.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SecureBERT+ embedder — extracted from MurshidUIPipeline.ipynb (cell 15).
3
+ Produces a 768-dim float32 embedding for a text paragraph.
4
+ Also provides build_text_for_embedding (cell 12).
5
+ Original file is NOT modified.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import numpy as np
11
+ from lxml import etree
12
+
13
+ try:
14
+ import torch
15
+ from transformers import AutoModel, AutoTokenizer
16
+ _TORCH_OK = True
17
+ except (ImportError, OSError):
18
+ _TORCH_OK = False
19
+
20
+ from app.config import settings
21
+
22
+
23
+ def _norm_spaces(s: str) -> str:
24
+ return " ".join((s or "").split()).strip()
25
+
26
+
27
+ def _strip_end_punct(s: str) -> str:
28
+ return (s or "").rstrip(". ").strip()
29
+
30
+
31
def build_text_for_embedding(clean_rule: str, summary: str) -> str:
    """Combine the LLM summary with the rule's <description> — cell 12 of notebook.

    Args:
        clean_rule: sanitized rule XML string (must parse as a <rule> element).
        summary: one-sentence LLM summary (may be empty).

    Returns:
        ""                        if both parts are empty,
        the non-empty part        if only one is present,
        "summary."                if both normalise to the same sentence,
        "summary. description."   otherwise.

    Fix: the original normalised ``description`` twice in a row; the second
    ``_norm_spaces(description)`` call was a no-op and has been removed.
    """
    rule_elem = etree.fromstring(clean_rule.strip())
    raw_desc = rule_elem.findtext("description") or ""
    description = _norm_spaces(raw_desc)
    summary = _norm_spaces(summary)

    if not summary and not description:
        return ""
    if summary and not description:
        return summary
    if description and not summary:
        return description

    # Case-insensitive comparison without trailing punctuation, to detect a
    # summary that merely restates the description.
    s0 = _strip_end_punct(summary).lower()
    d0 = _strip_end_punct(description).lower()

    if s0 == d0:
        return _strip_end_punct(summary) + "."
    return f"{_strip_end_punct(summary)}. {_strip_end_punct(description)}."
52
+
53
+
54
class SecureBERTEmbedder:
    """Mean-pooling embedder using ehsanaghaei/SecureBERT_Plus — cell 15."""

    # Hard token window of the encoder, including the [CLS]/[SEP] specials.
    MAX_LEN = 512
    # Number of 512-token chunks pushed through the model per forward pass.
    BATCH_CHUNKS = 8

    def __init__(self, model_id: str | None = None, device: str | None = None):
        # Raises RuntimeError when torch/transformers failed to import.
        if not _TORCH_OK:
            raise RuntimeError("torch/transformers not available — SecureBERTEmbedder cannot be initialised.")
        mid = model_id or settings.embed_model_id
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        # Deterministic cuDNN so repeated embeddings of the same text match.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        self.tokenizer = AutoTokenizer.from_pretrained(mid, use_fast=True)
        self.model = AutoModel.from_pretrained(mid).to(self.device)
        self.model.eval()
        self.cls_id = self.tokenizer.cls_token_id
        self.sep_id = self.tokenizer.sep_token_id
        # Fall back to [SEP] as padding when the tokenizer defines no pad token.
        self.pad_id = (
            self.tokenizer.pad_token_id
            if self.tokenizer.pad_token_id is not None
            else self.sep_id
        )

    def _chunk_text(self, text: str) -> list[list[int]]:
        # Split the raw token stream into (MAX_LEN - 2)-sized windows,
        # re-adding [CLS]/[SEP] around each window.
        token_ids = self.tokenizer.encode(text, add_special_tokens=False)
        chunk_size = self.MAX_LEN - 2
        chunks = []
        for i in range(0, len(token_ids), chunk_size):
            piece = token_ids[i : i + chunk_size]
            chunks.append([self.cls_id] + piece + [self.sep_id])
        return chunks

    def embed_text(self, text: str) -> np.ndarray:
        """Return an L2-normalised float32 paragraph embedding for *text*.

        Mean-pools token embeddings per chunk (attention-mask aware), then
        averages the chunk vectors into a single paragraph vector.
        """
        chunks = self._chunk_text(text)
        all_embs: list[np.ndarray] = []

        for i in range(0, len(chunks), self.BATCH_CHUNKS):
            batch = chunks[i : i + self.BATCH_CHUNKS]
            # Right-pad every chunk in the batch to the longest one.
            max_len = max(len(x) for x in batch)
            input_ids, masks = [], []
            for x in batch:
                pad = max_len - len(x)
                input_ids.append(x + [self.pad_id] * pad)
                masks.append([1] * len(x) + [0] * pad)

            ids_t = torch.tensor(input_ids).to(self.device)
            mask_t = torch.tensor(masks).to(self.device)

            with torch.no_grad():
                out = self.model(input_ids=ids_t, attention_mask=mask_t)
                tok_emb = out.last_hidden_state
                # Mask-aware mean pooling: padding positions contribute zero.
                mask_exp = mask_t.unsqueeze(-1).expand(tok_emb.size()).float()
                summed = torch.sum(tok_emb * mask_exp, dim=1)
                denom = torch.clamp(mask_exp.sum(dim=1), min=1e-9)
                mean_pooled = summed / denom

            all_embs.append(mean_pooled.cpu().numpy())

        all_embs_np = np.vstack(all_embs)
        para_emb = all_embs_np.mean(axis=0)
        # L2-normalise; epsilon guards against an all-zero vector.
        para_emb /= np.linalg.norm(para_emb) + 1e-12
        return para_emb.astype(np.float32)
murshid_backend/app/ml/logistic_model.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Logistic Regression — PRIMARY model per user decision.
3
+
4
+ Inference logic extracted VERBATIM from MurshidUIPipeline.ipynb (cell 18-19):
5
+
6
+ logreg_model = joblib.load(f"{BASE_PATH}/murshid_logreg_pipeline_manual_oof_pcatuned.joblib")
7
+ logreg_thr = np.load(f"{BASE_PATH}/murshid_logreg_thresholds_manual_oof_pcatuned.npy")
8
+
9
+ proba = logreg_model.predict_proba(X_user)
10
+
11
+ if isinstance(proba, list):
12
+ proba = np.column_stack([p[:, 1] for p in proba])
13
+ elif proba.ndim == 3:
14
+ proba = proba[:, :, 1]
15
+
16
+ proba = proba.reshape(-1)
17
+
18
+ pred_logreg = (proba >= logreg_thr).astype(int)
19
+ conf_logreg = proba * 100
20
+ gap_logreg = proba - logreg_thr
21
+
22
+ Original notebook file is NOT modified.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ from pathlib import Path
29
+
30
+ import joblib
31
+ import numpy as np
32
+
33
+ from app.config import settings
34
+
35
+
36
class LogisticRegressionModel:
    """
    Wraps the trained Logistic Regression pipeline + per-label thresholds.
    File structure (from notebook cell 18):
        logreg_model → sklearn Pipeline (PCA-tuned + OneVsRestClassifier(LogReg))
        logreg_thr → np.ndarray shape (n_techniques,) per-label thresholds
    """

    def __init__(self, models_dir: Path | None = None) -> None:
        # Raises FileNotFoundError for a missing artifact and ValueError when
        # the threshold vector does not match the label list length.
        base = Path(models_dir or settings.murshid_models_dir).resolve()

        logreg_path = base / settings.logreg_joblib
        thr_path = base / settings.logreg_thresholds_npy
        labels_path = base / settings.label_columns_json

        # Fail fast with a precise message before joblib opens anything.
        for p in (logreg_path, thr_path, labels_path):
            if not p.is_file():
                raise FileNotFoundError(f"Missing model file: {p}")

        # --- notebook cell 18: load model + thresholds ---
        self._model = joblib.load(logreg_path)  # logreg_model
        self._thr = np.load(thr_path)  # logreg_thr

        with open(labels_path, encoding="utf-8") as f:
            self.technique_names: list[str] = json.load(f)

        # Sanity check: exactly one threshold per technique label.
        n = len(self.technique_names)
        if self._thr.shape[0] != n:
            raise ValueError(
                f"LogReg thresholds length {self._thr.shape[0]} != {n} labels"
            )

    # ------------------------------------------------------------------

    def predict(self, embedding_1d: np.ndarray) -> list[dict]:
        """
        Run LogReg inference exactly as in notebook cell 19.

        Returns list of dicts sorted by confidence_percent desc:
            technique_id, predicted, confidence_percent, proba, threshold, gap
        """
        X_user = embedding_1d.reshape(1, -1)

        # --- verbatim from notebook cell 19 ---
        proba = self._model.predict_proba(X_user)

        # Normalise the predict_proba output shape: list of (n, 2) arrays,
        # a 3-D (labels, n, 2) array, or already a flat probability matrix.
        if isinstance(proba, list):
            proba = np.column_stack([p[:, 1] for p in proba])
        elif proba.ndim == 3:
            proba = proba[:, :, 1]

        proba = proba.reshape(-1)

        pred_logreg = (proba >= self._thr).astype(int)
        conf_logreg = proba * 100
        gap_logreg = proba - self._thr
        # --- end verbatim ---

        results = [
            {
                "technique_id": self.technique_names[i],
                "predicted": bool(pred_logreg[i]),
                "confidence_percent": round(float(conf_logreg[i]), 2),
                "proba": round(float(proba[i]), 4),
                "threshold": round(float(self._thr[i]), 4),
                "gap": round(float(gap_logreg[i]), 4),
            }
            for i in range(len(self.technique_names))
        ]

        # sort: predicted first, then by confidence desc (notebook sort logic)
        return sorted(
            results,
            key=lambda r: (r["predicted"], r["confidence_percent"]),
            reverse=True,
        )
murshid_backend/app/ml/pipeline.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Full inference pipeline — combines sanitizer → summarizer → embedder → logistic_model.
3
+ Exposes analyze_rule(rule_xml) -> dict as the single callable for the service layer.
4
+
5
+ Modes:
6
+ FULL : LLaMA available + SecureBERT+ + LogReg (GPU/Colab required)
7
+ LOCAL : MURSHID_SKIP_LLM=true + SecureBERT+ + LogReg
8
+ → skips LLaMA; uses <description> field as the paragraph text.
9
+ This allows POST /rules/analyze to work locally without a GPU.
10
+ LITE : torch not installed → uses a trivial bag-of-words fake embedding (testing only)
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import xml.etree.ElementTree as ET
16
+ from dataclasses import dataclass
17
+ from typing import Any
18
+
19
+ import numpy as np
20
+
21
+ from app.config import settings
22
+ from app.ml.logistic_model import LogisticRegressionModel
23
+ from app.ml.sanitizer import sanitize_rule_from_string
24
+
25
+ try:
26
+ import torch
27
+ from huggingface_hub import login as hf_login
28
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
29
+ from app.ml.embedder import SecureBERTEmbedder, build_text_for_embedding
30
+ from app.ml.summarizer import summarize_one_rule
31
+ _TORCH_AVAILABLE = True
32
+ _TORCH_ERROR: str | None = None
33
+ except (ImportError, OSError) as _e:
34
+ _TORCH_AVAILABLE = False
35
+ _TORCH_ERROR = str(_e)
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Singleton container (loaded once at startup)
40
+ # ---------------------------------------------------------------------------
41
+
42
+
43
@dataclass
class _ModelStore:
    # Singleton holder for lazily-loaded ML components (populated by load_models()).
    llama_model: Any | None = None       # quantised LLaMA — FULL mode only
    llama_tokenizer: Any | None = None
    llama_device: str = "cpu"            # "cuda" when a GPU is available
    embedder: SecureBERTEmbedder | None = None       # SecureBERT+ embedder
    logreg: LogisticRegressionModel | None = None    # primary classifier
    ready: bool = False                  # flipped by load_models()/unload_models()
51
+
52
+
53
+ _store = _ModelStore()
54
+
55
+
56
def load_models() -> None:
    """
    Load all models into _store.
    Call once at FastAPI startup (lifespan).

    Degrades gracefully: each component that fails to load is left as None
    and only logged — health() then reports the resulting pipeline mode.
    """
    # Authenticate against the HF Hub first so gated models can be pulled.
    if _TORCH_AVAILABLE and settings.hf_token:
        hf_login(token=settings.hf_token, add_to_git_credential=False)

    if not settings.murshid_skip_llm:
        if not _TORCH_AVAILABLE:
            print("[Murshid] WARNING: torch not installed — skipping LLM load.")
        else:
            # 4-bit NF4 quantisation so the LLaMA model fits in limited VRAM.
            bnb_cfg = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
            )
            tok = AutoTokenizer.from_pretrained(settings.llama_model_id, use_fast=True)
            # LLaMA tokenizers ship without a pad token; reuse EOS.
            if tok.pad_token is None:
                tok.pad_token = tok.eos_token
            m = AutoModelForCausalLM.from_pretrained(
                settings.llama_model_id,
                quantization_config=bnb_cfg,
                device_map="auto",
                low_cpu_mem_usage=True,
                dtype=torch.float16,
            )
            m.config.pad_token_id = tok.pad_token_id
            m.eval()
            _store.llama_tokenizer = tok
            _store.llama_model = m
            _store.llama_device = "cuda" if torch.cuda.is_available() else "cpu"

    if _TORCH_AVAILABLE:
        try:
            _store.embedder = SecureBERTEmbedder()
        except Exception as exc:
            print(f"[Murshid] WARNING: SecureBERT+ not loaded — {exc}")
            _store.embedder = None
    else:
        print("[Murshid] WARNING: torch not installed — embedder skipped.")
        _store.embedder = None

    # LogReg is required by /rules/analyze; missing files degrade to None.
    try:
        _store.logreg = LogisticRegressionModel()
    except FileNotFoundError as exc:
        print(f"[Murshid] WARNING: LogReg model files missing — {exc}")
        _store.logreg = None
    except Exception as exc:
        print(f"[Murshid] WARNING: LogReg not loaded — {exc}")
        _store.logreg = None

    # ready=True even in degraded modes; analyze_rule() enforces logreg presence.
    _store.ready = True
110
+
111
+
112
def unload_models() -> None:
    """Drop every loaded component and mark the pipeline as not ready."""
    for attr in ("llama_model", "llama_tokenizer", "embedder", "logreg"):
        setattr(_store, attr, None)
    _store.ready = False
118
+
119
+
120
def is_ready() -> bool:
    """True once load_models() has completed (regardless of loaded mode)."""
    return bool(_store.ready)
122
+
123
+
124
+ # ---------------------------------------------------------------------------
125
+ # Public function
126
+ # ---------------------------------------------------------------------------
127
+
128
+
129
+ def _extract_description(clean_xml: str) -> str:
130
+ """Extract <description> text from sanitized rule XML."""
131
+ try:
132
+ elem = ET.fromstring(clean_xml.strip())
133
+ desc = elem.findtext("description") or ""
134
+ return " ".join(desc.split()).strip()
135
+ except ET.ParseError:
136
+ return ""
137
+
138
+
139
def analyze_rule(rule_xml: str) -> dict:
    """
    Full pipeline: XML → sanitize → summarize → embed → LogReg → ranked results.

    Operates in three modes depending on environment:

    FULL mode (MURSHID_SKIP_LLM=false, GPU available):
        LLaMA generates a natural-language summary → SecureBERT+ embeds it → LogReg predicts.

    LOCAL mode (MURSHID_SKIP_LLM=true, torch installed):
        Skips LLaMA. Uses the rule's <description> field directly as the text.
        SecureBERT+ still embeds it properly → LogReg predicts.

    LITE mode (torch not installed):
        Uses a seeded pseudo-random unit-vector as a placeholder embedding.
        Results are meaningless — for structural testing only.
        FIX: the seed is now derived with zlib.crc32 instead of the built-in
        hash(), which is salted per interpreter run (PYTHONHASHSEED) and made
        lite-mode output non-reproducible across restarts.

    Raises:
        RuntimeError: models not loaded, or LogReg artifacts missing.
        ValueError: input is not a complete <rule>...</rule> document.

    Returns:
        {
            "sanitized_xml": str,
            "summary": str,            # LLaMA output OR description OR "(lite mode)"
            "text_for_embedding": str,
            "embedding_dim": int,
            "pipeline_mode": str,      # "full" | "local" | "lite"
            "results": [...],          # all techniques sorted by confidence desc
            "detected": [...],         # predicted == True only
        }
    """
    if not _store.ready:
        raise RuntimeError("Models not loaded. Call load_models() first.")

    if "<rule" not in rule_xml or "</rule>" not in rule_xml:
        raise ValueError("Incomplete XML: must contain <rule> and </rule>.")

    if _store.logreg is None:
        raise RuntimeError(
            "LogReg model not loaded. "
            "Copy the .joblib and .npy files to MURSHID_MODELS_DIR and restart."
        )

    clean_xml = sanitize_rule_from_string(rule_xml)

    # ── Choose mode ────────────────────────────────────────────────────────────
    if _store.llama_model is not None and _store.llama_tokenizer is not None:
        # FULL mode: LLaMA summary
        mode = "full"
        summary = summarize_one_rule(
            clean_xml,
            _store.llama_model,
            _store.llama_tokenizer,
            _store.llama_device,
        )
        text = build_text_for_embedding(clean_xml, summary)
        embedding: np.ndarray = _store.embedder.embed_text(text)

    elif _store.embedder is not None:
        # LOCAL mode: no LLaMA, use <description> as text
        mode = "local"
        desc = _extract_description(clean_xml)
        summary = desc or "No description available."
        text = desc or clean_xml[:300]
        embedding = _store.embedder.embed_text(text)

    else:
        # LITE mode: torch not available, placeholder vector (structural test only)
        import zlib  # local import: only needed in this fallback branch

        mode = "lite"
        desc = _extract_description(clean_xml)
        summary = f"(lite mode — no embedder) {desc}"
        text = desc or clean_xml[:300]
        dim = 768
        # crc32 is stable across processes, so identical input text always
        # yields the identical placeholder embedding.
        seed = zlib.crc32(text.encode("utf-8"))
        raw = np.random.default_rng(seed).random(dim).astype(np.float32)
        embedding = raw / (np.linalg.norm(raw) + 1e-12)

    # ── Classify ───────────────────────────────────────────────────────────────
    all_results = _store.logreg.predict(embedding)
    detected = [r for r in all_results if r["predicted"]]

    return {
        "sanitized_xml": clean_xml,
        "summary": summary,
        "text_for_embedding": text,
        "embedding_dim": int(embedding.shape[0]),
        "pipeline_mode": mode,
        "results": all_results,
        "detected": detected,
    }
murshid_backend/app/ml/sanitizer.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Rule sanitizer — extracted from MurshidUIPipeline.ipynb (cell 10).
3
+ Removes: mitre, if_sid, group, if_group tags from Wazuh XML rule.
4
+ Original file is NOT modified.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import copy
10
+ import xml.etree.ElementTree as ET
11
+
12
+ REMOVE_TAGS_ANYWHERE: set[str] = {"mitre", "if_sid", "group", "if_group"}
13
+
14
+
15
+ def _remove_tag_anywhere(root_elem: ET.Element, tag: str) -> None:
16
+ for parent in list(root_elem.iter()):
17
+ for child in list(parent):
18
+ if child.tag == tag:
19
+ parent.remove(child)
20
+
21
+
22
def sanitize_rule(rule_elem: ET.Element) -> ET.Element:
    """Return a deep copy of *rule_elem* with every tag in REMOVE_TAGS_ANYWHERE stripped."""
    cleaned = copy.deepcopy(rule_elem)
    for tag in REMOVE_TAGS_ANYWHERE:
        _remove_tag_anywhere(cleaned, tag)
    return cleaned
27
+
28
+
29
def sanitize_rule_from_string(rule_xml: str) -> str:
    """Parse a rule XML string, sanitize it, and serialise back to unicode XML."""
    parsed = ET.fromstring(rule_xml.strip())
    return ET.tostring(sanitize_rule(parsed), encoding="unicode")
murshid_backend/app/ml/summarizer.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM summarizer — extracted from MurshidUIPipeline.ipynb (cells 11-12).
3
+ Converts sanitized Wazuh XML rule to a one-sentence behavior summary.
4
+ Original file is NOT modified.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import re
11
+ import unicodedata
12
+
13
+ import torch
14
+
15
+ # --------------------------------------------------------------------------
16
+ # Constants (identical to notebook)
17
+ # --------------------------------------------------------------------------
18
+ MAX_INPUT_TOKENS = 2048
19
+ MAX_NEW_TOKENS = 160
20
+ DO_SAMPLE = False
21
+ NUM_BEAMS = 4
22
+ MAX_RETRIES = 3
23
+
24
+ SYSTEM_INSTR = (
25
+ "You are a cybersecurity expert.\n"
26
+ "You will be provided with a Wazuh rule in XML format.\n"
27
+ "Write EXACTLY ONE sentence describing the observable event pattern the rule matches.\n\n"
28
+ "HARD CONSTRAINTS:\n"
29
+ '1) Output must be minified JSON only: {"summary":"..."}\n'
30
+ "2) ONE sentence only.\n"
31
+ "3) Start with one of: Detects, Monitors, Identifies, Flags, Reports, Tracks, Captures.\n"
32
+ "4) Use ONLY facts present in the XML. Describe the observable system event only.\n"
33
+ "5) Do NOT infer attacker intent, attack type, or technique.\n"
34
+ "6) Do NOT mention MITRE, ATT&CK, or attack technique names unless explicitly present in the XML.\n"
35
+ "7) Do NOT use speculative language: likely, potentially, possible, possibly, may indicate, or could indicate.\n"
36
+ "8) Length: 7 to 18 words.\n"
37
+ "9) SHOULD include a clear event type when possible.\n"
38
+ "10) Mention at least ONE concrete indicator if available (event_id, process name, file path,\n"
39
+ " registry key, service, protocol/port, URL pattern, command, username, IP).\n"
40
+ "If only a single indicator exists, still produce a complete behavior-focused sentence.\n"
41
+ )
42
+
43
+ REPAIR_HINT = (
44
+ "Your previous output was rejected.\n"
45
+ "Fix it to satisfy ALL constraints:\n"
46
+ '- Output MUST be minified JSON only: {"summary":"..."}\n'
47
+ "- One sentence only.\n"
48
+ "- Keep it behavior-focused.\n"
49
+ "- Include at least ONE concrete indicator if present in the XML.\n"
50
+ "- Do NOT add any extra text outside JSON.\n"
51
+ )
52
+
53
+ VERB_OK = ("Detects", "Monitors", "Identifies", "Flags", "Reports", "Tracks", "Captures")
54
+ JSON_OBJ_RE = re.compile(r"\{.*?\}", re.DOTALL)
55
+ BAD_INTRO_RE = re.compile(
56
+ r"^\s*(this\s+(wazuh\s+)?rule|the\s+rule|this\s+alert)\b", re.IGNORECASE
57
+ )
58
+ BAD_INTENT_RE = re.compile(r"\b(likely|potentially|possible|maybe)\b", re.IGNORECASE)
59
+ GENERIC_RE = re.compile(
60
+ r"\b(detects activity|detects suspicious activity|detects potentially suspicious activity|"
61
+ r"monitors activity|reports activity|detects an event pattern defined by the rule indicators)\b",
62
+ re.IGNORECASE,
63
+ )
64
+
65
+
66
+ # --------------------------------------------------------------------------
67
+ # Helpers (identical to notebook)
68
+ # --------------------------------------------------------------------------
69
+
70
+ def _build_prompt(rule_xml: str, tokenizer, extra_hint: str = "") -> str:
71
+ sys = SYSTEM_INSTR + (("\n" + extra_hint) if extra_hint else "")
72
+ user = f"Wazuh rule XML:\n{rule_xml}\n\nReturn JSON only:"
73
+ messages = [{"role": "system", "content": sys}, {"role": "user", "content": user}]
74
+ return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
75
+
76
+
77
+ def _looks_broken_encoding(s: str) -> bool:
78
+ return any(m in s for m in ("Ã", "Ð", "Ñ", "â", "â")) if s else False
79
+
80
+
81
+ def _try_extract_json_summary(text: str) -> str | None:
82
+ t = (text or "").strip()
83
+ if not t:
84
+ return None
85
+ if t.startswith("{") and '"summary"' in t:
86
+ try:
87
+ obj = json.loads(t)
88
+ if isinstance(obj, dict) and isinstance(obj.get("summary"), str):
89
+ return obj["summary"].strip()
90
+ except Exception:
91
+ pass
92
+ m = JSON_OBJ_RE.search(t)
93
+ if m and '"summary"' in m.group(0):
94
+ blob = m.group(0)
95
+ try:
96
+ obj = json.loads(blob)
97
+ if isinstance(obj, dict) and isinstance(obj.get("summary"), str):
98
+ return obj["summary"].strip()
99
+ except Exception:
100
+ m2 = re.search(r'"summary"\s*:\s*"([^"]+)"', blob)
101
+ if m2:
102
+ return m2.group(1).strip()
103
+ return None
104
+
105
+
106
+ def _normalize_one_sentence(s: str) -> str:
107
+ s = re.sub(r"\s+", " ", (s or "").strip()).strip()
108
+ s = unicodedata.normalize("NFKC", s)
109
+ if not s:
110
+ return ""
111
+ if BAD_INTRO_RE.match(s):
112
+ s = BAD_INTRO_RE.sub("", s).lstrip(":,- ").strip()
113
+ if not s:
114
+ return ""
115
+ if not any(s.startswith(v) for v in VERB_OK):
116
+ s = "Detects " + (s[0].lower() + s[1:]) if len(s) > 1 else ""
117
+ if not s:
118
+ return ""
119
+ m = re.search(r"[.!?](?:\s|$)", s)
120
+ s = s[: m.end()].strip() if m else s + "."
121
+ s = re.sub(r"^(Detects\s+)+", "Detects ", s).strip()
122
+ return re.sub(r"\s+", " ", s).strip()
123
+
124
+
125
+ def _looks_truncated(s: str) -> bool:
126
+ return not s or s.strip().endswith(("(", ":", " -", ","))
127
+
128
+
129
+ def _has_behavior_signal(s: str) -> bool:
130
+ kws = ["create","delete","execute","spawn","launch","login","logon","authentication",
131
+ "connect","request","query","modify","registry","process","command","file",
132
+ "service","ip","url","dns","http","vpn","account"]
133
+ return any(k in s.lower() for k in kws)
134
+
135
+
136
+ def _has_indicator_signal(s: str) -> bool:
137
+ kws = [".exe",".dll",".ps1",".bat",".cmd","powershell","cmd.exe","reg.exe","rundll32",
138
+ "svchost","registry","temp","system32","event_id","http","dns","ip","url","port","key"]
139
+ return any(k in s.lower() for k in kws)
140
+
141
+
142
+ def _is_bad(s: str) -> bool:
143
+ if not s or BAD_INTRO_RE.match(s) or BAD_INTENT_RE.search(s) or GENERIC_RE.search(s):
144
+ return True
145
+ if _looks_broken_encoding(s) or _looks_truncated(s):
146
+ return True
147
+ wc = len(s.split())
148
+ if wc < 7 or wc > 18 or not _has_behavior_signal(s):
149
+ return True
150
+ return bool((s.startswith("{") and "summary" in s) or ('"summary"' in s and "{" in s))
151
+
152
+
153
+ def _is_catastrophic(s: str) -> bool:
154
+ return not s or _looks_broken_encoding(s) or _looks_truncated(s) or len(s.split()) < 3
155
+
156
+
157
+ def _score(s: str) -> int:
158
+ wc = len(s.split())
159
+ return (
160
+ (3 if 7 <= wc <= 18 else 0)
161
+ + (3 if _has_behavior_signal(s) else 0)
162
+ + (2 if _has_indicator_signal(s) else 0)
163
+ + (1 if not GENERIC_RE.search(s) else 0)
164
+ + (1 if not BAD_INTENT_RE.search(s) else 0)
165
+ )
166
+
167
+
168
+ def _rescue_finalize(s: str) -> str:
169
+ s = _normalize_one_sentence(s)
170
+ if not s:
171
+ return "Detects rule-matched behavior."
172
+ s = re.sub(r",\s*(possibly|potentially|maybe|may)\b.*$", "", s, flags=re.IGNORECASE).strip()
173
+ s = re.sub(r"\b(possibly|potentially|maybe|may)\b", "", s, flags=re.IGNORECASE)
174
+ s = re.sub(r"\s+", " ", s).strip()
175
+ if len(s.split()) < 7:
176
+ low = s.lower()
177
+ for kw, rep in [
178
+ ("powershell", "Detects powershell.exe process execution."),
179
+ ("cmd", "Detects cmd.exe process execution."),
180
+ ("reg", "Detects reg.exe process execution."),
181
+ ("svchost", "Detects svchost.exe process execution."),
182
+ ]:
183
+ if kw in low:
184
+ s = rep
185
+ break
186
+ else:
187
+ s = s.rstrip(".") + " matching rule indicators."
188
+ if _looks_truncated(s):
189
+ s = s.rstrip(".") + " matching rule indicators."
190
+ if not any(s.startswith(v) for v in VERB_OK):
191
+ s = "Detects " + s[0].lower() + s[1:] if len(s) > 1 else "Detects rule-matched behavior."
192
+ words = s.split()
193
+ if len(words) > 18:
194
+ s = " ".join(words[:18]).rstrip(".") + "."
195
+ return re.sub(r"\s+", " ", s if s.endswith(".") else s + ".").strip()
196
+
197
+
198
+ # --------------------------------------------------------------------------
199
+ # Public API
200
+ # --------------------------------------------------------------------------
201
+
202
+ def summarize_one_rule(rule_xml: str, model, tokenizer, device: str | None = None) -> str:
203
+ """Generate a one-sentence summary for a sanitized Wazuh rule XML string."""
204
+ if device is None:
205
+ device = "cuda" if torch.cuda.is_available() else "cpu"
206
+
207
+ pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id
208
+ eos_id = tokenizer.eos_token_id or pad_id
209
+
210
+ best: str | None = None
211
+ best_any: str | None = None
212
+ last_raw = ""
213
+ last_cleaned = ""
214
+
215
+ for attempt in range(1, MAX_RETRIES + 1):
216
+ prompt = _build_prompt(
217
+ rule_xml, tokenizer, extra_hint=REPAIR_HINT if attempt >= 2 else ""
218
+ )
219
+ inputs = tokenizer(
220
+ prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_TOKENS
221
+ ).to(device)
222
+
223
+ with torch.no_grad():
224
+ outputs = model.generate(
225
+ **inputs,
226
+ max_new_tokens=MAX_NEW_TOKENS,
227
+ do_sample=DO_SAMPLE,
228
+ num_beams=NUM_BEAMS,
229
+ pad_token_id=pad_id,
230
+ eos_token_id=eos_id,
231
+ repetition_penalty=1.05,
232
+ no_repeat_ngram_size=3,
233
+ )
234
+
235
+ raw = tokenizer.decode(
236
+ outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True
237
+ ).strip()
238
+ last_raw = raw
239
+
240
+ parsed = _try_extract_json_summary(raw)
241
+ if parsed is None:
242
+ continue
243
+
244
+ cleaned = _normalize_one_sentence(parsed)
245
+ last_cleaned = cleaned
246
+
247
+ if cleaned and not _is_catastrophic(cleaned):
248
+ if best_any is None or _score(cleaned) > _score(best_any):
249
+ best_any = cleaned
250
+
251
+ if not _is_bad(cleaned):
252
+ best = cleaned
253
+ break
254
+
255
+ if best is None:
256
+ if best_any and not _is_catastrophic(best_any):
257
+ best = best_any
258
+ else:
259
+ src = last_cleaned or _try_extract_json_summary(last_raw) or last_raw
260
+ best = _rescue_finalize(src)
261
+
262
+ return best
murshid_backend/app/ml/svm_model.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SVM classifier — PRIMARY model per the report (§3.1.3 + §4.1).
3
+
4
+ Report quote:
5
+ "the Support Vector Machine (SVM) was adopted as the core classifier"
6
+ "classification using SVM to predict the associated MITRE ATT&CK techniques"
7
+
8
+ Inference logic (verbatim from MurshidUIPipeline.ipynb cell 16+19):
9
+ scores = svm_model.named_steps["clf"].decision_function(
10
+ svm_model.named_steps["pca"].transform(X_user)
11
+ ).reshape(-1)
12
+ pred = (scores >= thr_per_label).astype(int)
13
+ margins = scores - thr_per_label
14
+ conf = sigmoid(margins) * 100
15
+
16
+ Original notebook file is NOT modified.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ from pathlib import Path
23
+
24
+ import joblib
25
+ import numpy as np
26
+
27
+ from app.config import settings
28
+
29
+
30
+ def _sigmoid(x: np.ndarray) -> np.ndarray:
31
+ """Probability calibration: sigmoid(margin) — notebook cell 17."""
32
+ x = np.clip(x, -30, 30)
33
+ return 1.0 / (1.0 + np.exp(-x))
34
+
35
+
36
+ class SVMModel:
37
+ """
38
+ Wraps the trained LinearSVC pipeline with per-label thresholds.
39
+ Structure of the .joblib pack (from notebook):
40
+ svm_pack["model"] → sklearn Pipeline (PCA + LinearSVC)
41
+ svm_pack["thresholds_per_label"] → np.ndarray shape (n_techniques,)
42
+ """
43
+
44
+ def __init__(self, models_dir: Path | None = None) -> None:
45
+ base = Path(models_dir or settings.murshid_models_dir).resolve()
46
+
47
+ svm_path = base / settings.svm_joblib
48
+ labels_path = base / settings.label_columns_json
49
+
50
+ for p in (svm_path, labels_path):
51
+ if not p.is_file():
52
+ raise FileNotFoundError(f"Missing model file: {p}")
53
+
54
+ svm_pack = joblib.load(svm_path)
55
+ self._model = svm_pack["model"] # Pipeline(PCA → LinearSVC)
56
+ self._thresholds = np.asarray(
57
+ svm_pack["thresholds_per_label"], dtype=np.float64
58
+ )
59
+
60
+ with open(labels_path, encoding="utf-8") as f:
61
+ self.technique_names: list[str] = json.load(f)
62
+
63
+ n = len(self.technique_names)
64
+ if self._thresholds.shape[0] != n:
65
+ raise ValueError(
66
+ f"SVM thresholds length {self._thresholds.shape[0]} != {n} labels"
67
+ )
68
+
69
+ # ------------------------------------------------------------------
70
+
71
+ def predict(self, embedding_1d: np.ndarray) -> list[dict]:
72
+ """
73
+ Run SVM inference exactly as in the notebook.
74
+
75
+ Returns list of dicts sorted by confidence_percent desc:
76
+ technique_id, predicted, confidence_percent, score, threshold, margin
77
+ """
78
+ X = embedding_1d.reshape(1, -1)
79
+
80
+ # Apply PCA then LinearSVC decision function (notebook cell 19)
81
+ scores = self._model.named_steps["clf"].decision_function(
82
+ self._model.named_steps["pca"].transform(X)
83
+ ).reshape(-1)
84
+
85
+ pred = (scores >= self._thresholds).astype(int)
86
+ margins = scores - self._thresholds
87
+ conf = _sigmoid(margins) * 100 # calibrated confidence (%)
88
+
89
+ results = [
90
+ {
91
+ "technique_id": self.technique_names[i],
92
+ "predicted": bool(pred[i]),
93
+ "confidence_percent": round(float(conf[i]), 2),
94
+ "score": round(float(scores[i]), 4),
95
+ "threshold": round(float(self._thresholds[i]), 4),
96
+ "margin": round(float(margins[i]), 4),
97
+ }
98
+ for i in range(len(self.technique_names))
99
+ ]
100
+
101
+ return sorted(results, key=lambda r: r["confidence_percent"], reverse=True)
murshid_backend/app/models/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SQLAlchemy ORM models (tables defined exactly per ER Diagram §3.2.6 of the report)."""
2
+ from app.models.user import User
3
+ from app.models.mapping_job import MappingJob
4
+ from app.models.rule import Rule
5
+ from app.models.technique import Technique
6
+ from app.models.rule_technique_mapping import RuleTechniqueMapping
7
+ from app.models.query_template import QueryTemplate
8
+
9
+ __all__ = [
10
+ "User",
11
+ "MappingJob",
12
+ "Rule",
13
+ "Technique",
14
+ "RuleTechniqueMapping",
15
+ "QueryTemplate",
16
+ ]
murshid_backend/app/models/mapping_job.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MappingJob entity — ER Diagram §3.2.6
3
+ Attributes: job_ID, file_name, timestamp, rules_count, status, progress
4
+ Linked to User via "uploads" relationship.
5
+ Also visible in Figure 4-14 (Mapping Progress Table).
6
+ """
7
+
8
+ import enum
9
+ from datetime import datetime
10
+
11
+ from sqlalchemy import DateTime, Enum, ForeignKey, Integer, String, func
12
+ from sqlalchemy.orm import Mapped, mapped_column, relationship
13
+
14
+ from app.db.base import Base
15
+
16
+
17
+ class JobStatus(str, enum.Enum):
18
+ pending = "pending"
19
+ running = "running"
20
+ done = "done"
21
+ failed = "failed"
22
+
23
+
24
+ class MappingJob(Base):
25
+ __tablename__ = "mapping_jobs"
26
+
27
+ job_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
28
+ user_id: Mapped[int] = mapped_column(ForeignKey("users.user_id"), nullable=False)
29
+ file_name: Mapped[str] = mapped_column(String(255), nullable=False)
30
+ rules_count: Mapped[int] = mapped_column(Integer, default=0)
31
+ status: Mapped[JobStatus] = mapped_column(
32
+ Enum(JobStatus), nullable=False, default=JobStatus.pending
33
+ )
34
+ progress: Mapped[int] = mapped_column(Integer, default=0)
35
+ timestamp: Mapped[datetime] = mapped_column(
36
+ DateTime, nullable=False, server_default=func.now()
37
+ )
38
+
39
+ user: Mapped["User"] = relationship(back_populates="jobs")
40
+ rules: Mapped[list["Rule"]] = relationship(back_populates="job")
murshid_backend/app/models/query_template.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ QueryTemplate entity — ER Diagram §3.2.6
3
+ Attributes: Template_ID, Purpose, wql_query, Note
4
+ Linked to Technique. Admin can add/update/disable (Use Case 7, §3.2.7).
5
+ """
6
+
7
+ from sqlalchemy import Boolean, ForeignKey, String, Text
8
+ from sqlalchemy.orm import Mapped, mapped_column, relationship
9
+
10
+ from app.db.base import Base
11
+
12
+
13
+ class QueryTemplate(Base):
14
+ __tablename__ = "query_templates"
15
+
16
+ template_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
17
+ technique_id: Mapped[str] = mapped_column(
18
+ String(20), ForeignKey("techniques.technique_id"), nullable=False
19
+ )
20
+ purpose: Mapped[str | None] = mapped_column(String(255), nullable=True)
21
+ # WQL with placeholders: ${HOST}, ${USER}, ${IP}
22
+ wql_query: Mapped[str] = mapped_column(Text, nullable=False)
23
+ note: Mapped[str | None] = mapped_column(Text, nullable=True)
24
+ # Admin can disable without deleting — Use Case 7
25
+ is_active: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)
26
+
27
+ technique: Mapped["Technique"] = relationship(back_populates="query_templates")
murshid_backend/app/models/rule.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Rule entity — ER Diagram §3.2.6
3
+ Attributes: Rule_ID, embedding_vector, job_ID (FK)
4
+ Rule_ID is the Wazuh rule ID string (e.g. "597").
5
+ """
6
+
7
+ from sqlalchemy import ForeignKey, String, Text
8
+ from sqlalchemy.orm import Mapped, mapped_column, relationship
9
+
10
+ from app.db.base import Base
11
+
12
+
13
+ class Rule(Base):
14
+ __tablename__ = "rules"
15
+
16
+ rule_id: Mapped[str] = mapped_column(String(50), primary_key=True)
17
+ job_id: Mapped[int | None] = mapped_column(
18
+ ForeignKey("mapping_jobs.job_id"), nullable=True
19
+ )
20
+ # 768-dimensional float vector stored as JSON string; kept nullable for
21
+ # rules where only the mapping result is persisted without the vector.
22
+ embedding_vector: Mapped[str | None] = mapped_column(Text, nullable=True)
23
+
24
+ job: Mapped["MappingJob | None"] = relationship(back_populates="rules")
25
+ technique_mappings: Mapped[list["RuleTechniqueMapping"]] = relationship(
26
+ back_populates="rule", cascade="all, delete-orphan"
27
+ )
murshid_backend/app/models/rule_technique_mapping.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RuleTechniqueMapping associative entity — ER Diagram §3.2.6
3
+ Attributes: Mapping_ID, Rule_ID (FK), Technique_ID (FK), confidence_score
4
+ Index on rule_id for fast lookup — mentioned explicitly in Use Case 6 (§3.2.7).
5
+ """
6
+
7
+ from sqlalchemy import Float, ForeignKey, Index, Integer, String
8
+ from sqlalchemy.orm import Mapped, mapped_column, relationship
9
+
10
+ from app.db.base import Base
11
+
12
+
13
+ class RuleTechniqueMapping(Base):
14
+ __tablename__ = "rule_technique_mappings"
15
+
16
+ mapping_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
17
+ rule_id: Mapped[str] = mapped_column(
18
+ String(50), ForeignKey("rules.rule_id"), nullable=False
19
+ )
20
+ technique_id: Mapped[str] = mapped_column(
21
+ String(20), ForeignKey("techniques.technique_id"), nullable=False
22
+ )
23
+ confidence_score: Mapped[float] = mapped_column(Float, nullable=False)
24
+
25
+ rule: Mapped["Rule"] = relationship(back_populates="technique_mappings")
26
+ technique: Mapped["Technique"] = relationship(back_populates="rule_mappings")
27
+
28
+ __table_args__ = (
29
+ # "creates an index on rule_id for efficient lookup" — Use Case 6
30
+ Index("ix_rule_technique_rule_id", "rule_id"),
31
+ )
murshid_backend/app/models/technique.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Technique entity — ER Diagram §3.2.6
3
+ Attributes: Technique_ID, technique_name, tactic
4
+ """
5
+
6
+ from sqlalchemy import String
7
+ from sqlalchemy.orm import Mapped, mapped_column, relationship
8
+
9
+ from app.db.base import Base
10
+
11
+
12
+ class Technique(Base):
13
+ __tablename__ = "techniques"
14
+
15
+ technique_id: Mapped[str] = mapped_column(String(20), primary_key=True)
16
+ technique_name: Mapped[str] = mapped_column(String(255), nullable=False)
17
+ tactic: Mapped[str | None] = mapped_column(String(100), nullable=True)
18
+
19
+ rule_mappings: Mapped[list["RuleTechniqueMapping"]] = relationship(
20
+ back_populates="technique"
21
+ )
22
+ query_templates: Mapped[list["QueryTemplate"]] = relationship(
23
+ back_populates="technique"
24
+ )
murshid_backend/app/models/user.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ User entity — ER Diagram §3.2.6
3
+ Attributes: User_ID, username, email, password_hash, role
4
+ """
5
+
6
+ import enum
7
+
8
+ from sqlalchemy import Enum, String
9
+ from sqlalchemy.orm import Mapped, mapped_column, relationship
10
+
11
+ from app.db.base import Base
12
+
13
+
14
+ class UserRole(str, enum.Enum):
15
+ admin = "admin"
16
+ analyst = "analyst"
17
+
18
+
19
+ class User(Base):
20
+ __tablename__ = "users"
21
+
22
+ user_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
23
+ username: Mapped[str] = mapped_column(String(100), unique=True, nullable=False)
24
+ email: Mapped[str] = mapped_column(String(255), unique=True, nullable=False)
25
+ password_hash: Mapped[str] = mapped_column(String(255), nullable=False)
26
+ role: Mapped[UserRole] = mapped_column(
27
+ Enum(UserRole), nullable=False, default=UserRole.analyst
28
+ )
29
+
30
+ jobs: Mapped[list["MappingJob"]] = relationship(back_populates="user")
murshid_backend/app/repositories/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Repository layer — thin DB access wrappers."""
murshid_backend/app/repositories/job_repo.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CRUD for MappingJob table."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime, timezone
6
+
7
+ from sqlalchemy.orm import Session
8
+
9
+ from app.models.mapping_job import JobStatus, MappingJob
10
+
11
+
12
+ def create_job(db: Session, *, user_id: int, file_name: str, rules_count: int = 0) -> MappingJob:
13
+ job = MappingJob(
14
+ user_id=user_id,
15
+ file_name=file_name,
16
+ rules_count=rules_count,
17
+ status=JobStatus.pending,
18
+ progress=0,
19
+ timestamp=datetime.now(tz=timezone.utc),
20
+ )
21
+ db.add(job)
22
+ db.flush()
23
+ return job
24
+
25
+
26
+ def update_job_status(
27
+ db: Session,
28
+ job_id: int,
29
+ *,
30
+ status: JobStatus,
31
+ progress: int | None = None,
32
+ ) -> MappingJob | None:
33
+ job = db.get(MappingJob, job_id)
34
+ if job is None:
35
+ return None
36
+ job.status = status
37
+ if progress is not None:
38
+ job.progress = progress
39
+ db.flush()
40
+ return job
41
+
42
+
43
+ def get_job(db: Session, job_id: int) -> MappingJob | None:
44
+ return db.get(MappingJob, job_id)
murshid_backend/app/repositories/rule_repo.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CRUD for Rule and RuleTechniqueMapping tables."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+ import numpy as np
8
+ from sqlalchemy.orm import Session
9
+
10
+ from app.models.rule import Rule
11
+ from app.models.rule_technique_mapping import RuleTechniqueMapping
12
+
13
+
14
+ def upsert_rule(
15
+ db: Session,
16
+ *,
17
+ rule_id: str,
18
+ job_id: int | None = None,
19
+ embedding: np.ndarray | None = None,
20
+ ) -> Rule:
21
+ rule = db.get(Rule, rule_id)
22
+ if rule is None:
23
+ rule = Rule(rule_id=rule_id)
24
+ db.add(rule)
25
+ if job_id is not None:
26
+ rule.job_id = job_id
27
+ if embedding is not None:
28
+ rule.embedding_vector = json.dumps(embedding.tolist())
29
+ db.flush()
30
+ return rule
31
+
32
+
33
+ def save_technique_mappings(
34
+ db: Session,
35
+ *,
36
+ rule_id: str,
37
+ results: list[dict],
38
+ ) -> list[RuleTechniqueMapping]:
39
+ """
40
+ Persist ALL (rule_id, technique_id, confidence_score) rows sorted by confidence.
41
+ Deletes existing mappings first so re-runs are idempotent.
42
+ Saves ALL techniques (not just detected ones) so Figure 4-11 can show Top 5.
43
+ """
44
+ db.query(RuleTechniqueMapping).filter(
45
+ RuleTechniqueMapping.rule_id == rule_id
46
+ ).delete(synchronize_session=False)
47
+
48
+ sorted_results = sorted(results, key=lambda r: r["confidence_percent"], reverse=True)
49
+
50
+ rows = []
51
+ for r in sorted_results:
52
+ row = RuleTechniqueMapping(
53
+ rule_id=rule_id,
54
+ technique_id=r["technique_id"],
55
+ confidence_score=r["confidence_percent"] / 100.0,
56
+ )
57
+ db.add(row)
58
+ rows.append(row)
59
+ db.flush()
60
+ return rows
61
+
62
+
63
+ def get_mappings_for_rule(
64
+ db: Session, rule_id: str
65
+ ) -> list[RuleTechniqueMapping]:
66
+ return (
67
+ db.query(RuleTechniqueMapping)
68
+ .filter(RuleTechniqueMapping.rule_id == rule_id)
69
+ .order_by(RuleTechniqueMapping.confidence_score.desc())
70
+ .all()
71
+ )
murshid_backend/app/repositories/template_repo.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """CRUD for Technique and QueryTemplate tables."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from sqlalchemy.orm import Session
6
+
7
+ from app.models.query_template import QueryTemplate
8
+ from app.models.technique import Technique
9
+
10
+
11
+ # --------------------------------------------------------------------------
12
+ # Techniques
13
+ # --------------------------------------------------------------------------
14
+
15
+
16
+ def get_or_create_technique(
17
+ db: Session, *, technique_id: str, technique_name: str = "", tactic: str | None = None
18
+ ) -> Technique:
19
+ t = db.get(Technique, technique_id)
20
+ if t is None:
21
+ t = Technique(
22
+ technique_id=technique_id,
23
+ technique_name=technique_name or technique_id,
24
+ tactic=tactic,
25
+ )
26
+ db.add(t)
27
+ db.flush()
28
+ return t
29
+
30
+
31
+ def get_technique(db: Session, technique_id: str) -> Technique | None:
32
+ return db.get(Technique, technique_id)
33
+
34
+
35
+ # --------------------------------------------------------------------------
36
+ # Query templates
37
+ # --------------------------------------------------------------------------
38
+
39
+
40
+ def get_templates_for_technique(
41
+ db: Session, technique_id: str
42
+ ) -> list[QueryTemplate]:
43
+ return (
44
+ db.query(QueryTemplate)
45
+ .filter(
46
+ QueryTemplate.technique_id == technique_id,
47
+ QueryTemplate.is_active.is_(True),
48
+ )
49
+ .all()
50
+ )
51
+
52
+
53
+ def create_template(
54
+ db: Session,
55
+ *,
56
+ technique_id: str,
57
+ purpose: str | None,
58
+ wql_query: str,
59
+ note: str | None,
60
+ ) -> QueryTemplate:
61
+ tpl = QueryTemplate(
62
+ technique_id=technique_id,
63
+ purpose=purpose,
64
+ wql_query=wql_query,
65
+ note=note,
66
+ is_active=True,
67
+ )
68
+ db.add(tpl)
69
+ db.flush()
70
+ return tpl
71
+
72
+
73
+ def update_template(
74
+ db: Session,
75
+ template_id: int,
76
+ *,
77
+ purpose: str | None = None,
78
+ wql_query: str | None = None,
79
+ note: str | None = None,
80
+ is_active: bool | None = None,
81
+ ) -> QueryTemplate | None:
82
+ tpl = db.get(QueryTemplate, template_id)
83
+ if tpl is None:
84
+ return None
85
+ if purpose is not None:
86
+ tpl.purpose = purpose
87
+ if wql_query is not None:
88
+ tpl.wql_query = wql_query
89
+ if note is not None:
90
+ tpl.note = note
91
+ if is_active is not None:
92
+ tpl.is_active = is_active
93
+ db.flush()
94
+ return tpl
murshid_backend/app/schemas/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Pydantic schemas for API request/response validation."""