Commit ·
26e1c2e
0
Parent(s):
Initial deployment - secrets removed
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +18 -0
- .gitattributes +3 -0
- .gitignore +11 -0
- DEPLOY_GUIDE.md +103 -0
- Dockerfile +49 -0
- MurshidBackend_Colab.ipynb +967 -0
- MurshidBackend_Colab_Report.md +545 -0
- Needed/murshid_label_columns.json +22 -0
- Needed/murshid_logreg_pipeline_manual_oof_pcatuned.joblib +3 -0
- Needed/murshid_logreg_thresholds_manual_oof_pcatuned.npy +3 -0
- Needed/murshid_query_template_structure_clean_shared.xlsx +3 -0
- README.md +39 -0
- murshid_backend/README.md +156 -0
- murshid_backend/TECHNICAL_REPORT.md +322 -0
- murshid_backend/alembic.ini +38 -0
- murshid_backend/alembic/env.py +52 -0
- murshid_backend/alembic/script.py.mako +25 -0
- murshid_backend/alembic/versions/0001_initial_schema.py +87 -0
- murshid_backend/app/__init__.py +1 -0
- murshid_backend/app/api/__init__.py +1 -0
- murshid_backend/app/api/routes/__init__.py +1 -0
- murshid_backend/app/api/routes/db_viewer.py +122 -0
- murshid_backend/app/api/routes/health.py +73 -0
- murshid_backend/app/api/routes/queries.py +78 -0
- murshid_backend/app/api/routes/rules.py +100 -0
- murshid_backend/app/api/routes/stats.py +43 -0
- murshid_backend/app/config.py +29 -0
- murshid_backend/app/db/__init__.py +1 -0
- murshid_backend/app/db/base.py +5 -0
- murshid_backend/app/db/session.py +25 -0
- murshid_backend/app/main.py +60 -0
- murshid_backend/app/ml/__init__.py +1 -0
- murshid_backend/app/ml/embedder.py +116 -0
- murshid_backend/app/ml/logistic_model.py +111 -0
- murshid_backend/app/ml/pipeline.py +225 -0
- murshid_backend/app/ml/sanitizer.py +32 -0
- murshid_backend/app/ml/summarizer.py +262 -0
- murshid_backend/app/ml/svm_model.py +101 -0
- murshid_backend/app/models/__init__.py +16 -0
- murshid_backend/app/models/mapping_job.py +40 -0
- murshid_backend/app/models/query_template.py +27 -0
- murshid_backend/app/models/rule.py +27 -0
- murshid_backend/app/models/rule_technique_mapping.py +31 -0
- murshid_backend/app/models/technique.py +24 -0
- murshid_backend/app/models/user.py +30 -0
- murshid_backend/app/repositories/__init__.py +1 -0
- murshid_backend/app/repositories/job_repo.py +44 -0
- murshid_backend/app/repositories/rule_repo.py +71 -0
- murshid_backend/app/repositories/template_repo.py +94 -0
- murshid_backend/app/schemas/__init__.py +1 -0
.dockerignore
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
.venv
|
| 5 |
+
venv
|
| 6 |
+
**/.env
|
| 7 |
+
.env.local
|
| 8 |
+
*.db
|
| 9 |
+
*.log
|
| 10 |
+
.git
|
| 11 |
+
.gitignore
|
| 12 |
+
*.zip
|
| 13 |
+
MurshidBackend_Colab.ipynb
|
| 14 |
+
MurshidBackend_Colab_Report.md
|
| 15 |
+
interface_pictures/
|
| 16 |
+
murshid_backend/.venv
|
| 17 |
+
murshid_backend/__pycache__
|
| 18 |
+
murshid_backend/TECHNICAL_REPORT.md
|
.gitattributes
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.xlsx filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
.venv/
|
| 5 |
+
venv/
|
| 6 |
+
*.db
|
| 7 |
+
*.log
|
| 8 |
+
**/.env
|
| 9 |
+
.env.local
|
| 10 |
+
murshid_backend_for_drive.zip
|
| 11 |
+
interface_pictures/
|
DEPLOY_GUIDE.md
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚀 دليل النشر على Hugging Face Spaces
|
| 2 |
+
|
| 3 |
+
## المتطلبات
|
| 4 |
+
- حساب على [Hugging Face](https://huggingface.co/) (مجاني)
|
| 5 |
+
- [Git](https://git-scm.com/) مثبّت على جهازك
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## الخطوات
|
| 10 |
+
|
| 11 |
+
### 1. إنشاء Space جديد
|
| 12 |
+
|
| 13 |
+
1. اذهب إلى: https://huggingface.co/new-space
|
| 14 |
+
2. **Space name**: `murshid`
|
| 15 |
+
3. **SDK**: اختر **Docker**
|
| 16 |
+
4. **Visibility**: Public (مجاني) أو Private
|
| 17 |
+
5. اضغط **Create Space**
|
| 18 |
+
|
| 19 |
+
### 2. رفع المشروع
|
| 20 |
+
|
| 21 |
+
```powershell
|
| 22 |
+
cd d:\murishd
|
| 23 |
+
|
| 24 |
+
# تهيئة Git (إذا لم يكن موجوداً)
|
| 25 |
+
git init
|
| 26 |
+
|
| 27 |
+
# إضافة الـ remote (غيّر YOUR_USERNAME باسم حسابك)
|
| 28 |
+
git remote add space https://huggingface.co/spaces/YOUR_USERNAME/murshid
|
| 29 |
+
|
| 30 |
+
# إضافة الملفات والرفع
|
| 31 |
+
git add .
|
| 32 |
+
git commit -m "Initial deployment"
|
| 33 |
+
git push space main
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
> ⚠️ إذا طلب كلمة مرور، استخدم **Access Token** من:
|
| 37 |
+
> https://huggingface.co/settings/tokens
|
| 38 |
+
|
| 39 |
+
### 3. إعداد المتغيرات البيئية (Secrets)
|
| 40 |
+
|
| 41 |
+
اذهب إلى إعدادات الـ Space: `Settings → Variables and secrets`
|
| 42 |
+
|
| 43 |
+
أضف هذه المتغيرات:
|
| 44 |
+
|
| 45 |
+
| الاسم | القيمة | النوع |
|
| 46 |
+
|-------|--------|-------|
|
| 47 |
+
| `MURSHID_DB_URL` | `sqlite:////app/data/murshid.db` | Variable |
|
| 48 |
+
| `MURSHID_MODELS_DIR` | `/app/Needed` | Variable |
|
| 49 |
+
| `MURSHID_SKIP_LLM` | `true` | Variable |
|
| 50 |
+
| `SECRET_KEY` | (اختر كلمة سر عشوائية) | **Secret** |
|
| 51 |
+
| `HF_TOKEN` | (اختياري — لو تبغى Llama) | **Secret** |
|
| 52 |
+
|
| 53 |
+
### 4. انتظر البناء
|
| 54 |
+
|
| 55 |
+
- HF Spaces يبني الـ Docker image تلقائياً
|
| 56 |
+
- يأخذ **3-5 دقائق** للبناء الأول
|
| 57 |
+
- بعد النجاح، الرابط يكون:
|
| 58 |
+
```
|
| 59 |
+
https://YOUR_USERNAME-murshid.hf.space
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
---
|
| 63 |
+
|
| 64 |
+
## الروابط بعد النشر
|
| 65 |
+
|
| 66 |
+
| الرابط | الوصف |
|
| 67 |
+
|--------|-------|
|
| 68 |
+
| `https://YOUR_USERNAME-murshid.hf.space` | الواجهة الرئيسية |
|
| 69 |
+
| `https://YOUR_USERNAME-murshid.hf.space/docs` | توثيق Swagger |
|
| 70 |
+
| `https://YOUR_USERNAME-murshid.hf.space/health` | فحص الحالة |
|
| 71 |
+
|
| 72 |
+
---
|
| 73 |
+
|
| 74 |
+
## ملاحظات
|
| 75 |
+
|
| 76 |
+
### الوضع الحالي (LITE mode)
|
| 77 |
+
- المشروع ينشر بوضع **LITE** (بدون torch/SecureBERT+)
|
| 78 |
+
- تحليل القواعد يعمل لكن بدقة أقل (embeddings عشوائية)
|
| 79 |
+
- مناسب لاختبار الواجهة والـ API
|
| 80 |
+
|
| 81 |
+
### للترقية إلى LOCAL mode (SecureBERT+ بدون Llama)
|
| 82 |
+
عدّل `Dockerfile` وأزل التعليق من سطر torch:
|
| 83 |
+
```dockerfile
|
| 84 |
+
RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu transformers sentencepiece
|
| 85 |
+
```
|
| 86 |
+
> ⚠️ هذا يزيد حجم الصورة ~800MB ويحتاج ذاكرة أكثر
|
| 87 |
+
|
| 88 |
+
### للترقية إلى FULL mode (مع Llama 3)
|
| 89 |
+
- غيّر الـ Space إلى **GPU (T4)** من الإعدادات ($0.60/ساعة)
|
| 90 |
+
- عدّل `MURSHID_SKIP_LLM=false`
|
| 91 |
+
- أضف `HF_TOKEN` في الـ Secrets
|
| 92 |
+
- استخدم `requirements.txt` الكامل بدل `requirements_light.txt`
|
| 93 |
+
|
| 94 |
+
---
|
| 95 |
+
|
| 96 |
+
## استكشاف الأخطاء
|
| 97 |
+
|
| 98 |
+
| المشكلة | الحل |
|
| 99 |
+
|---------|------|
|
| 100 |
+
| Build فشل | تحقق من الـ Logs في تبويب الـ Space |
|
| 101 |
+
| 502 Bad Gateway | انتظر دقيقة — الخادم يبدأ |
|
| 102 |
+
| DB خطأ | تحقق من `MURSHID_DB_URL` في المتغيرات |
|
| 103 |
+
| Frontend لا يتصل | الـ BASE URL أصبح تلقائي (`window.location.origin`) |
|
Dockerfile
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
# System deps
|
| 4 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 5 |
+
build-essential libxml2-dev libxslt1-dev \
|
| 6 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 7 |
+
|
| 8 |
+
# Create non-root user (HF Spaces requirement)
|
| 9 |
+
RUN useradd -m -u 1000 appuser
|
| 10 |
+
|
| 11 |
+
WORKDIR /app
|
| 12 |
+
|
| 13 |
+
# Copy requirements first for layer caching
|
| 14 |
+
COPY murshid_backend/requirements_light.txt ./requirements.txt
|
| 15 |
+
RUN pip install --no-cache-dir -r requirements.txt \
|
| 16 |
+
&& pip install --no-cache-dir openpyxl aiofiles scikit-learn
|
| 17 |
+
|
| 18 |
+
# Optional: install torch CPU-only for LOCAL mode (SecureBERT+ embeddings)
|
| 19 |
+
# Uncomment the next line if you want LOCAL mode (adds ~800MB to image)
|
| 20 |
+
# RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu transformers sentencepiece
|
| 21 |
+
|
| 22 |
+
# Copy backend code
|
| 23 |
+
COPY murshid_backend/ ./murshid_backend/
|
| 24 |
+
|
| 25 |
+
# Copy model files
|
| 26 |
+
COPY Needed/ ./Needed/
|
| 27 |
+
|
| 28 |
+
# Copy frontend
|
| 29 |
+
COPY murshid_frontend/ ./murshid_frontend/
|
| 30 |
+
|
| 31 |
+
# Create writable directory for SQLite DB
|
| 32 |
+
RUN mkdir -p /app/data && chown -R appuser:appuser /app
|
| 33 |
+
|
| 34 |
+
# Setup environment
|
| 35 |
+
ENV MURSHID_DB_URL=sqlite:////app/data/murshid.db
|
| 36 |
+
ENV MURSHID_MODELS_DIR=/app/Needed
|
| 37 |
+
ENV MURSHID_SKIP_LLM=true
|
| 38 |
+
ENV SECRET_KEY=murshid_hf_space_2026
|
| 39 |
+
ENV PORT=7860
|
| 40 |
+
|
| 41 |
+
# Run DB migrations + import templates + start server
|
| 42 |
+
COPY start.sh ./start.sh
|
| 43 |
+
RUN chmod +x start.sh
|
| 44 |
+
|
| 45 |
+
USER appuser
|
| 46 |
+
|
| 47 |
+
EXPOSE 7860
|
| 48 |
+
|
| 49 |
+
CMD ["./start.sh"]
|
MurshidBackend_Colab.ipynb
ADDED
|
@@ -0,0 +1,967 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# 🛡️ Murshid Backend — Full Mode on Colab\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"**مُرشِد | From Alerts to Guidance: MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts**\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"---\n",
|
| 12 |
+
"\n",
|
| 13 |
+
"## 📁 الملفات المطلوبة على Google Drive\n",
|
| 14 |
+
"\n",
|
| 15 |
+
"```\n",
|
| 16 |
+
"MyDrive/\n",
|
| 17 |
+
"├── murshid_backend_for_drive.zip ← ارفعيه ثم شغّلي الخلية 2b لاستخراجه\n",
|
| 18 |
+
"│ أو\n",
|
| 19 |
+
"├── murshid_backend/ ← إذا استخرجته مسبقاً\n",
|
| 20 |
+
"│ ├── app/\n",
|
| 21 |
+
"│ ├── alembic/\n",
|
| 22 |
+
"│ ├── scripts/\n",
|
| 23 |
+
"│ ├── alembic.ini\n",
|
| 24 |
+
"│ └── requirements.txt\n",
|
| 25 |
+
"│\n",
|
| 26 |
+
"└── Needed/\n",
|
| 27 |
+
" ├── murshid_logreg_pipeline_manual_oof_pcatuned.joblib\n",
|
| 28 |
+
" ├── murshid_logreg_thresholds_manual_oof_pcatuned.npy\n",
|
| 29 |
+
" ├── murshid_label_columns.json\n",
|
| 30 |
+
" └── murshid_query_template_structure_clean_shared.xlsx\n",
|
| 31 |
+
"```\n",
|
| 32 |
+
"\n",
|
| 33 |
+
"## تعليمات التشغيل\n",
|
| 34 |
+
"\n",
|
| 35 |
+
"### المتطلبات قبل التشغيل\n",
|
| 36 |
+
"1. ✅ **GPU مُفعَّل:** `Runtime → Change runtime type → T4 GPU`\n",
|
| 37 |
+
"2. ✅ **Google Drive مُتَّصل** (يحتوي مجلد `Needed` بملفات النماذج)\n",
|
| 38 |
+
"3. ✅ **مجلد `murshid_backend`** على Drive أو رفعه يدوياً\n",
|
| 39 |
+
"\n",
|
| 40 |
+
"### الملفات المطلوبة في Google Drive\n",
|
| 41 |
+
"```\n",
|
| 42 |
+
"MyDrive/\n",
|
| 43 |
+
"├── Needed/\n",
|
| 44 |
+
"│ ├── murshid_logreg_pipeline_manual_oof_pcatuned.joblib\n",
|
| 45 |
+
"│ ├── murshid_logreg_thresholds_manual_oof_pcatuned.npy\n",
|
| 46 |
+
"│ ├── murshid_label_columns.json\n",
|
| 47 |
+
"│ └── murshid_query_template_structure_clean_shared.xlsx\n",
|
| 48 |
+
"└── murshid_backend/ ← مجلد الباكند كاملاً\n",
|
| 49 |
+
"```\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"### ترتيب التشغيل\n",
|
| 52 |
+
"**شغّلي الخلايا بالترتيب من الأعلى للأسفل — لا تتخطّي أي خلية**\n"
|
| 53 |
+
]
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"cell_type": "markdown",
|
| 57 |
+
"metadata": {},
|
| 58 |
+
"source": [
|
| 59 |
+
"---\n",
|
| 60 |
+
"## الخلية 1: التحقق من GPU\n"
|
| 61 |
+
]
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"cell_type": "code",
|
| 65 |
+
"execution_count": null,
|
| 66 |
+
"metadata": {},
|
| 67 |
+
"outputs": [],
|
| 68 |
+
"source": [
|
| 69 |
+
"import torch\n",
|
| 70 |
+
"\n",
|
| 71 |
+
"print('CUDA available:', torch.cuda.is_available())\n",
|
| 72 |
+
"if torch.cuda.is_available():\n",
|
| 73 |
+
" print('GPU:', torch.cuda.get_device_name(0))\n",
|
| 74 |
+
" print('Memory:', round(torch.cuda.get_device_properties(0).total_memory / 1e9, 1), 'GB')\n",
|
| 75 |
+
"else:\n",
|
| 76 |
+
" print('⚠️ لا يوجد GPU — غيّري Runtime إلى T4 من القائمة أعلاه')"
|
| 77 |
+
]
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"cell_type": "markdown",
|
| 81 |
+
"metadata": {},
|
| 82 |
+
"source": [
|
| 83 |
+
"---\n",
|
| 84 |
+
"## الخلية 2: تحميل Google Drive\n"
|
| 85 |
+
]
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"cell_type": "markdown",
|
| 89 |
+
"metadata": {},
|
| 90 |
+
"source": [
|
| 91 |
+
"---\n",
|
| 92 |
+
"## الخلية 3: تجهيز الباكند في /content\n",
|
| 93 |
+
"\n",
|
| 94 |
+
"> تقوم هذه الخلية تلقائياً بـ:\n",
|
| 95 |
+
"> 1. استخراج ZIP من Drive (إذا كان ZIP موجوداً ولم يُستخرج بعد)\n",
|
| 96 |
+
"> 2. نسخ مجلد `murshid_backend` إلى `/content` (أسرع للقراءة)\n",
|
| 97 |
+
"> 3. ضبط Python path\n"
|
| 98 |
+
]
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"cell_type": "code",
|
| 102 |
+
"execution_count": null,
|
| 103 |
+
"metadata": {},
|
| 104 |
+
"outputs": [],
|
| 105 |
+
"source": [
|
| 106 |
+
"print('(هذه الخلية فارغة — الكود انتقل إلى الخلية 3 أدناه)')\n",
|
| 107 |
+
"\n"
|
| 108 |
+
]
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"cell_type": "code",
|
| 112 |
+
"execution_count": null,
|
| 113 |
+
"metadata": {},
|
| 114 |
+
"outputs": [],
|
| 115 |
+
"source": [
|
| 116 |
+
"from google.colab import drive\n",
|
| 117 |
+
"import os\n",
|
| 118 |
+
"\n",
|
| 119 |
+
"drive.mount('/content/drive')\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"# ✏️ عدّلي هذا المسار إذا كان مجلدك مختلفاً\n",
|
| 122 |
+
"NEEDED_PATH = '/content/drive/MyDrive/Needed'\n",
|
| 123 |
+
"BACKEND_PATH = '/content/drive/MyDrive/murshid_backend'\n",
|
| 124 |
+
"ZIP_PATH = '/content/drive/MyDrive/murshid_backend_for_drive.zip'\n",
|
| 125 |
+
"\n",
|
| 126 |
+
"print('=' * 55)\n",
|
| 127 |
+
"print('📂 Checking Google Drive files...')\n",
|
| 128 |
+
"print('=' * 55)\n",
|
| 129 |
+
"\n",
|
| 130 |
+
"# ── التحقق من ملفات Needed ────────────────────────���───────────\n",
|
| 131 |
+
"print('\\n📁 Needed/ (model files):')\n",
|
| 132 |
+
"required_files = {\n",
|
| 133 |
+
" 'murshid_logreg_pipeline_manual_oof_pcatuned.joblib': 'LogReg model',\n",
|
| 134 |
+
" 'murshid_logreg_thresholds_manual_oof_pcatuned.npy': 'LogReg thresholds',\n",
|
| 135 |
+
" 'murshid_label_columns.json': 'Technique names',\n",
|
| 136 |
+
"}\n",
|
| 137 |
+
"\n",
|
| 138 |
+
"models_ok = True\n",
|
| 139 |
+
"for fname, desc in required_files.items():\n",
|
| 140 |
+
" path = f'{NEEDED_PATH}/{fname}'\n",
|
| 141 |
+
" exists = os.path.isfile(path)\n",
|
| 142 |
+
" size = f'{os.path.getsize(path)/1024:.0f} KB' if exists else ''\n",
|
| 143 |
+
" status = '✅' if exists else '❌'\n",
|
| 144 |
+
" print(f' {status} {fname} {size}')\n",
|
| 145 |
+
" if not exists:\n",
|
| 146 |
+
" models_ok = False\n",
|
| 147 |
+
"\n",
|
| 148 |
+
"excel_path = f'{NEEDED_PATH}/murshid_query_template_structure_clean_shared.xlsx'\n",
|
| 149 |
+
"excel_ok = os.path.isfile(excel_path)\n",
|
| 150 |
+
"print(f' {\"✅\" if excel_ok else \"⚠️ \"} murshid_query_template_structure_clean_shared.xlsx (optional)')\n",
|
| 151 |
+
"\n",
|
| 152 |
+
"# ── التحقق من الباكند ─────────────────────────────────────────\n",
|
| 153 |
+
"print('\\n📁 murshid_backend/ (backend code):')\n",
|
| 154 |
+
"backend_ok = os.path.isdir(BACKEND_PATH)\n",
|
| 155 |
+
"zip_ok = os.path.isfile(ZIP_PATH)\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"if backend_ok:\n",
|
| 158 |
+
" fcount = sum(len(f) for _, _, f in os.walk(BACKEND_PATH))\n",
|
| 159 |
+
" print(f' ✅ murshid_backend/ ({fcount} files)')\n",
|
| 160 |
+
"elif zip_ok:\n",
|
| 161 |
+
" zsize = f'{os.path.getsize(ZIP_PATH)/1024:.0f} KB'\n",
|
| 162 |
+
" print(f' 📦 murshid_backend_for_drive.zip ({zsize}) — سيُستخرج تلقائياً في الخلية 3')\n",
|
| 163 |
+
"else:\n",
|
| 164 |
+
" print(f' ❌ murshid_backend/ غير موجود')\n",
|
| 165 |
+
" print(f' ❌ murshid_backend_for_drive.zip غير موجود')\n",
|
| 166 |
+
" print(f'\\n ⚠️ ارفعي murshid_backend_for_drive.zip إلى:')\n",
|
| 167 |
+
" print(f' Google Drive → My Drive')\n",
|
| 168 |
+
"\n",
|
| 169 |
+
"# ── ملخص ──────────────────────────────────────────────────────\n",
|
| 170 |
+
"print('\\n' + '=' * 55)\n",
|
| 171 |
+
"if models_ok and (backend_ok or zip_ok):\n",
|
| 172 |
+
" print('✅ كل شيء جاهز — تابعي تشغيل الخلايا')\n",
|
| 173 |
+
"elif not models_ok:\n",
|
| 174 |
+
" print('❌ ملفات النماذج مفقودة من Needed/ — يجب رفعها أولاً')\n",
|
| 175 |
+
"else:\n",
|
| 176 |
+
" print('❌ ملفات الباكند مفقودة — ارفعي ZIP أولاً')"
|
| 177 |
+
]
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"cell_type": "markdown",
|
| 181 |
+
"metadata": {},
|
| 182 |
+
"source": [
|
| 183 |
+
"---\n",
|
| 184 |
+
"## الخلية 3: نسخ الباكند إلى /content\n",
|
| 185 |
+
"\n",
|
| 186 |
+
"> نسخ الملفات من Drive إلى `/content` لتسريع القراءة\n"
|
| 187 |
+
]
|
| 188 |
+
},
|
| 189 |
+
{
|
| 190 |
+
"cell_type": "code",
|
| 191 |
+
"execution_count": null,
|
| 192 |
+
"metadata": {},
|
| 193 |
+
"outputs": [],
|
| 194 |
+
"source": [
|
| 195 |
+
"import shutil, os, zipfile, sys\n",
|
| 196 |
+
"\n",
|
| 197 |
+
"DRIVE_BASE = '/content/drive/MyDrive'\n",
|
| 198 |
+
"ZIP_PATH = f'{DRIVE_BASE}/murshid_backend_for_drive.zip'\n",
|
| 199 |
+
"BACKEND_DRIVE= f'{DRIVE_BASE}/murshid_backend'\n",
|
| 200 |
+
"BACKEND_LOCAL= '/content/murshid_backend'\n",
|
| 201 |
+
"\n",
|
| 202 |
+
"# ── الخطوة 1: استخراج ZIP من Drive إذا لزم ────────────────────\n",
|
| 203 |
+
"if not os.path.isdir(BACKEND_DRIVE):\n",
|
| 204 |
+
" if os.path.isfile(ZIP_PATH):\n",
|
| 205 |
+
" print(f'📦 ZIP found — extracting to Drive...')\n",
|
| 206 |
+
" with zipfile.ZipFile(ZIP_PATH, 'r') as z:\n",
|
| 207 |
+
" z.extractall(DRIVE_BASE)\n",
|
| 208 |
+
" print(f'✅ Extracted to {BACKEND_DRIVE}')\n",
|
| 209 |
+
" else:\n",
|
| 210 |
+
" print('❌ ERROR: مجلد murshid_backend غير موجود على Drive')\n",
|
| 211 |
+
" print(f' المطلوب: {BACKEND_DRIVE}')\n",
|
| 212 |
+
" print(f' أو رفع: {ZIP_PATH}')\n",
|
| 213 |
+
" raise FileNotFoundError(f'Backend not found. Upload murshid_backend_for_drive.zip to Google Drive MyDrive.')\n",
|
| 214 |
+
"else:\n",
|
| 215 |
+
" print(f'✅ murshid_backend found on Drive: {BACKEND_DRIVE}')\n",
|
| 216 |
+
"\n",
|
| 217 |
+
"# ── الخطوة 2: نسخ إلى /content (أسرع بكثير من Drive أثناء التشغيل) ─\n",
|
| 218 |
+
"if os.path.exists(BACKEND_LOCAL):\n",
|
| 219 |
+
" shutil.rmtree(BACKEND_LOCAL)\n",
|
| 220 |
+
"\n",
|
| 221 |
+
"shutil.copytree(\n",
|
| 222 |
+
" BACKEND_DRIVE,\n",
|
| 223 |
+
" BACKEND_LOCAL,\n",
|
| 224 |
+
" ignore=shutil.ignore_patterns('__pycache__', '*.pyc', '.venv', '*.db', '*.log')\n",
|
| 225 |
+
")\n",
|
| 226 |
+
"\n",
|
| 227 |
+
"# ── الخطوة 3: إضافة للـ Python path ──────────────────────────\n",
|
| 228 |
+
"if BACKEND_LOCAL not in sys.path:\n",
|
| 229 |
+
" sys.path.insert(0, BACKEND_LOCAL)\n",
|
| 230 |
+
"\n",
|
| 231 |
+
"os.chdir(BACKEND_LOCAL)\n",
|
| 232 |
+
"\n",
|
| 233 |
+
"# ── تحقق ─────────────────────────────────────────────────────\n",
|
| 234 |
+
"file_count = sum(len(files) for _, _, files in os.walk(BACKEND_LOCAL))\n",
|
| 235 |
+
"print(f'✅ Backend ready at {BACKEND_LOCAL} ({file_count} files)')\n",
|
| 236 |
+
"print(f'✅ Working dir: {os.getcwd()}')\n",
|
| 237 |
+
"\n",
|
| 238 |
+
"# عرض الهيكل\n",
|
| 239 |
+
"print('\\nStructure:')\n",
|
| 240 |
+
"for item in sorted(os.listdir(BACKEND_LOCAL)):\n",
|
| 241 |
+
" full = os.path.join(BACKEND_LOCAL, item)\n",
|
| 242 |
+
" if os.path.isdir(full):\n",
|
| 243 |
+
" sub_count = len(os.listdir(full))\n",
|
| 244 |
+
" print(f' 📁 {item}/ ({sub_count} items)')\n",
|
| 245 |
+
" else:\n",
|
| 246 |
+
" size = os.path.getsize(full)\n",
|
| 247 |
+
" print(f' 📄 {item} ({size:,} bytes)')"
|
| 248 |
+
]
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"cell_type": "markdown",
|
| 252 |
+
"metadata": {},
|
| 253 |
+
"source": [
|
| 254 |
+
"---\n",
|
| 255 |
+
"## الخلية 4: تثبيت المتطلبات\n"
|
| 256 |
+
]
|
| 257 |
+
},
|
| 258 |
+
{
|
| 259 |
+
"cell_type": "code",
|
| 260 |
+
"execution_count": null,
|
| 261 |
+
"metadata": {},
|
| 262 |
+
"outputs": [],
|
| 263 |
+
"source": [
|
| 264 |
+
"print('📦 Installing requirements...')\n",
|
| 265 |
+
"\n",
|
| 266 |
+
"# ── الحزم الأساسية للباكند ──────────────────────────────────────\n",
|
| 267 |
+
"!pip install -q \\\n",
|
| 268 |
+
" fastapi==0.115.0 \\\n",
|
| 269 |
+
" \"uvicorn[standard]==0.32.0\" \\\n",
|
| 270 |
+
" pydantic==2.9.0 \\\n",
|
| 271 |
+
" pydantic-settings==2.6.0 \\\n",
|
| 272 |
+
" python-dotenv==1.0.0 \\\n",
|
| 273 |
+
" sqlalchemy==2.0.0 \\\n",
|
| 274 |
+
" alembic==1.13.0 \\\n",
|
| 275 |
+
" aiofiles \\\n",
|
| 276 |
+
" scikit-learn==1.6.1 \\\n",
|
| 277 |
+
" joblib \\\n",
|
| 278 |
+
" lxml \\\n",
|
| 279 |
+
" openpyxl \\\n",
|
| 280 |
+
" nest-asyncio \\\n",
|
| 281 |
+
" pyngrok\n",
|
| 282 |
+
"\n",
|
| 283 |
+
"# ── bitsandbytes: مطلوب لتحميل LLaMA بـ 4-bit على GPU ─────────\n",
|
| 284 |
+
"print('📦 Installing bitsandbytes (required for LLaMA 4-bit)...')\n",
|
| 285 |
+
"!pip install -q -U \"bitsandbytes>=0.46.1\"\n",
|
| 286 |
+
"\n",
|
| 287 |
+
"# ── accelerate: مطلوب لـ device_map=\"auto\" ────────────────────\n",
|
| 288 |
+
"!pip install -q -U accelerate\n",
|
| 289 |
+
"\n",
|
| 290 |
+
"# ── تحقق من التثبيت ──────────────────────────────────────────\n",
|
| 291 |
+
"import importlib\n",
|
| 292 |
+
"for pkg in ['bitsandbytes', 'accelerate', 'fastapi', 'sklearn']:\n",
|
| 293 |
+
" try:\n",
|
| 294 |
+
" mod = importlib.import_module(pkg if pkg != 'sklearn' else 'sklearn')\n",
|
| 295 |
+
" ver = getattr(mod, '__version__', '?')\n",
|
| 296 |
+
" print(f' ✅ {pkg}=={ver}')\n",
|
| 297 |
+
" except ImportError:\n",
|
| 298 |
+
" print(f' ❌ {pkg} — فشل التثبيت')\n",
|
| 299 |
+
"\n",
|
| 300 |
+
"print('\\n✅ All requirements installed')"
|
| 301 |
+
]
|
| 302 |
+
},
|
| 303 |
+
{
|
| 304 |
+
"cell_type": "markdown",
|
| 305 |
+
"metadata": {},
|
| 306 |
+
"source": [
|
| 307 |
+
"---\n",
|
| 308 |
+
"## الخلية 5: إعداد ملف .env\n"
|
| 309 |
+
]
|
| 310 |
+
},
|
| 311 |
+
{
|
| 312 |
+
"cell_type": "code",
|
| 313 |
+
"execution_count": null,
|
| 314 |
+
"metadata": {},
|
| 315 |
+
"outputs": [],
|
| 316 |
+
"source": [
|
| 317 |
+
"import os\n",
|
| 318 |
+
"\n",
|
| 319 |
+
"# ✏️ ضعي HF Token هنا إذا لم تُضيفيه عبر Colab Secrets\n",
|
| 320 |
+
"HF_TOKEN = os.environ.get('HF_TOKEN', 'ادخل التوكن')\n",
|
| 321 |
+
"\n",
|
| 322 |
+
"env_content = f\"\"\"# Auto-generated .env for Colab FULL mode\n",
|
| 323 |
+
"MURSHID_DB_URL=sqlite:////content/murshid.db\n",
|
| 324 |
+
"MURSHID_MODELS_DIR={NEEDED_PATH}\n",
|
| 325 |
+
"HF_TOKEN={HF_TOKEN}\n",
|
| 326 |
+
"MURSHID_SKIP_LLM=false\n",
|
| 327 |
+
"SECRET_KEY=murshid_colab_2026\n",
|
| 328 |
+
"LLAMA_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct\n",
|
| 329 |
+
"EMBED_MODEL_ID=ehsanaghaei/SecureBERT_Plus\n",
|
| 330 |
+
"LOGREG_JOBLIB=murshid_logreg_pipeline_manual_oof_pcatuned.joblib\n",
|
| 331 |
+
"LOGREG_THRESHOLDS_NPY=murshid_logreg_thresholds_manual_oof_pcatuned.npy\n",
|
| 332 |
+
"LABEL_COLUMNS_JSON=murshid_label_columns.json\n",
|
| 333 |
+
"\"\"\"\n",
|
| 334 |
+
"\n",
|
| 335 |
+
"env_path = '/content/murshid_backend/.env'\n",
|
| 336 |
+
"with open(env_path, 'w') as f:\n",
|
| 337 |
+
" f.write(env_content)\n",
|
| 338 |
+
"\n",
|
| 339 |
+
"print('✅ .env created at', env_path)\n",
|
| 340 |
+
"print('\\nContents:')\n",
|
| 341 |
+
"with open(env_path) as f:\n",
|
| 342 |
+
" for line in f:\n",
|
| 343 |
+
" if 'TOKEN' in line or 'SECRET' in line:\n",
|
| 344 |
+
" key = line.split('=')[0]\n",
|
| 345 |
+
" print(f' {key}=****')\n",
|
| 346 |
+
" else:\n",
|
| 347 |
+
" print(' ', line.rstrip())"
|
| 348 |
+
]
|
| 349 |
+
},
|
| 350 |
+
{
|
| 351 |
+
"cell_type": "markdown",
|
| 352 |
+
"metadata": {},
|
| 353 |
+
"source": [
|
| 354 |
+
"---\n",
|
| 355 |
+
"## الخلية 6: تهجير قاعدة البيانات (Alembic)\n"
|
| 356 |
+
]
|
| 357 |
+
},
|
| 358 |
+
{
|
| 359 |
+
"cell_type": "code",
|
| 360 |
+
"execution_count": null,
|
| 361 |
+
"metadata": {},
|
| 362 |
+
"outputs": [],
|
| 363 |
+
"source": [
|
| 364 |
+
"import subprocess, os\n",
|
| 365 |
+
"\n",
|
| 366 |
+
"os.chdir('/content/murshid_backend')\n",
|
| 367 |
+
"\n",
|
| 368 |
+
"result = subprocess.run(\n",
|
| 369 |
+
" ['python', '-m', 'alembic', 'upgrade', 'head'],\n",
|
| 370 |
+
" capture_output=True, text=True\n",
|
| 371 |
+
")\n",
|
| 372 |
+
"\n",
|
| 373 |
+
"print(result.stdout)\n",
|
| 374 |
+
"if result.stderr:\n",
|
| 375 |
+
" print(result.stderr)\n",
|
| 376 |
+
"\n",
|
| 377 |
+
"import os\n",
|
| 378 |
+
"db_exists = os.path.isfile('/content/murshid.db')\n",
|
| 379 |
+
"print('✅ Database ready:', '/content/murshid.db' if db_exists else '❌ لم يُنشأ')"
|
| 380 |
+
]
|
| 381 |
+
},
|
| 382 |
+
{
|
| 383 |
+
"cell_type": "markdown",
|
| 384 |
+
"metadata": {},
|
| 385 |
+
"source": [
|
| 386 |
+
"---\n",
|
| 387 |
+
"## الخلية 7: استيراد قوالب WQL من Excel\n"
|
| 388 |
+
]
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"cell_type": "code",
|
| 392 |
+
"execution_count": null,
|
| 393 |
+
"metadata": {},
|
| 394 |
+
"outputs": [],
|
| 395 |
+
"source": [
|
| 396 |
+
"import sys\n",
|
| 397 |
+
"sys.path.insert(0, '/content/murshid_backend')\n",
|
| 398 |
+
"os.chdir('/content/murshid_backend')\n",
|
| 399 |
+
"\n",
|
| 400 |
+
"excel_path = f'{NEEDED_PATH}/murshid_query_template_structure_clean_shared.xlsx'\n",
|
| 401 |
+
"\n",
|
| 402 |
+
"if os.path.isfile(excel_path):\n",
|
| 403 |
+
" from app.db.session import SessionLocal\n",
|
| 404 |
+
" from scripts.import_excel_templates import run as import_excel\n",
|
| 405 |
+
"\n",
|
| 406 |
+
" db = SessionLocal()\n",
|
| 407 |
+
" try:\n",
|
| 408 |
+
" result = import_excel(db, replace=False)\n",
|
| 409 |
+
" print('✅ Excel import result:')\n",
|
| 410 |
+
" for k, v in result.items():\n",
|
| 411 |
+
" print(f' {k}: {v}')\n",
|
| 412 |
+
" finally:\n",
|
| 413 |
+
" db.close()\n",
|
| 414 |
+
"else:\n",
|
| 415 |
+
" print(f'⚠️ Excel file not found at: {excel_path}')\n",
|
| 416 |
+
" print(' يمكنك المتابعة — القوالب ستُضاف لاحقاً يدوياً')"
|
| 417 |
+
]
|
| 418 |
+
},
|
| 419 |
+
{
|
| 420 |
+
"cell_type": "markdown",
|
| 421 |
+
"metadata": {},
|
| 422 |
+
"source": [
|
| 423 |
+
"---\n",
|
| 424 |
+
"## الخلية 8: تشغيل FastAPI + ngrok\n",
|
| 425 |
+
"\n",
|
| 426 |
+
"> ⏳ هذه الخلية تأخذ **5-10 دقائق** لتحميل LLaMA (4.5GB) و SecureBERT+\n",
|
| 427 |
+
"\n",
|
| 428 |
+
"> 🔑 **الرابط العام سيظهر في النهاية** — انسخيه للفرونت\n"
|
| 429 |
+
]
|
| 430 |
+
},
|
| 431 |
+
{
|
| 432 |
+
"cell_type": "code",
|
| 433 |
+
"execution_count": null,
|
| 434 |
+
"metadata": {},
|
| 435 |
+
"outputs": [],
|
| 436 |
+
"source": [
|
| 437 |
+
"import subprocess, time, os, sys, urllib.request\n",
|
| 438 |
+
"import nest_asyncio\n",
|
| 439 |
+
"nest_asyncio.apply()\n",
|
| 440 |
+
"\n",
|
| 441 |
+
"os.chdir('/content/murshid_backend')\n",
|
| 442 |
+
"\n",
|
| 443 |
+
"# ─── التحقق من bitsandbytes قبل تشغيل الخادم ─────────────────\n",
|
| 444 |
+
"try:\n",
|
| 445 |
+
" import bitsandbytes as bnb\n",
|
| 446 |
+
" print(f'✅ bitsandbytes {bnb.__version__}')\n",
|
| 447 |
+
"except ImportError:\n",
|
| 448 |
+
" print('❌ bitsandbytes غير مثبّت — شغّلي الخلية 4 أولاً')\n",
|
| 449 |
+
" raise\n",
|
| 450 |
+
"\n",
|
| 451 |
+
"# ─── تشغيل uvicorn ───────────────────────────────────────────\n",
|
| 452 |
+
"log_path = '/content/murshid_server.log'\n",
|
| 453 |
+
"log_file = open(log_path, 'w')\n",
|
| 454 |
+
"\n",
|
| 455 |
+
"server_proc = subprocess.Popen(\n",
|
| 456 |
+
" [\n",
|
| 457 |
+
" 'python', '-m', 'uvicorn', 'app.main:app',\n",
|
| 458 |
+
" '--host', '0.0.0.0',\n",
|
| 459 |
+
" '--port', '8000',\n",
|
| 460 |
+
" '--log-level', 'info'\n",
|
| 461 |
+
" ],\n",
|
| 462 |
+
" cwd='/content/murshid_backend',\n",
|
| 463 |
+
" stdout=log_file,\n",
|
| 464 |
+
" stderr=subprocess.STDOUT\n",
|
| 465 |
+
")\n",
|
| 466 |
+
"\n",
|
| 467 |
+
"print('⏳ Loading LLaMA 3 8B + SecureBERT+...')\n",
|
| 468 |
+
"print(' جاري التحميل — انتظري حتى تظهر الرسالة النهائية')\n",
|
| 469 |
+
"\n",
|
| 470 |
+
"# ─── انتظار ذكي مع عرض اللوج ────────────────────────────────\n",
|
| 471 |
+
"started = False\n",
|
| 472 |
+
"last_log_size = 0\n",
|
| 473 |
+
"\n",
|
| 474 |
+
"for i in range(180): # 15 دقيقة كحد أقصى\n",
|
| 475 |
+
" time.sleep(5)\n",
|
| 476 |
+
"\n",
|
| 477 |
+
" # تحقق إذا بدأ الخادم\n",
|
| 478 |
+
" try:\n",
|
| 479 |
+
" resp = urllib.request.urlopen('http://localhost:8000/health', timeout=3)\n",
|
| 480 |
+
" if resp.status == 200:\n",
|
| 481 |
+
" started = True\n",
|
| 482 |
+
" break\n",
|
| 483 |
+
" except Exception:\n",
|
| 484 |
+
" pass\n",
|
| 485 |
+
"\n",
|
| 486 |
+
" # عرض اللوج الجديد كل 30 ثانية\n",
|
| 487 |
+
" if i % 6 == 0:\n",
|
| 488 |
+
" elapsed = (i + 1) * 5\n",
|
| 489 |
+
" log_file.flush()\n",
|
| 490 |
+
" try:\n",
|
| 491 |
+
" with open(log_path) as f:\n",
|
| 492 |
+
" log_content = f.read()\n",
|
| 493 |
+
" new_content = log_content[last_log_size:]\n",
|
| 494 |
+
" last_log_size = len(log_content)\n",
|
| 495 |
+
"\n",
|
| 496 |
+
" # تحقق من خطأ مبكر\n",
|
| 497 |
+
" if 'ERROR' in new_content or 'ImportError' in new_content:\n",
|
| 498 |
+
" print(f'\\n❌ خطأ في الخادم عند {elapsed}s:')\n",
|
| 499 |
+
" # عرض آخر 1000 حرف من اللوج\n",
|
| 500 |
+
" print(log_content[-1500:])\n",
|
| 501 |
+
" server_proc.terminate()\n",
|
| 502 |
+
" log_file.close()\n",
|
| 503 |
+
" raise RuntimeError('Server failed to start. See log above.')\n",
|
| 504 |
+
"\n",
|
| 505 |
+
" # عرض ما تم تحميله\n",
|
| 506 |
+
" if 'Loaded' in new_content or 'loaded' in new_content or 'Application' in new_content:\n",
|
| 507 |
+
" for line in new_content.strip().split('\\n'):\n",
|
| 508 |
+
" if any(k in line for k in ['INFO', 'Loaded', 'loaded', 'Application', 'WARNING']):\n",
|
| 509 |
+
" print(f' {line.strip()}')\n",
|
| 510 |
+
" else:\n",
|
| 511 |
+
" mins = elapsed // 60\n",
|
| 512 |
+
" secs = elapsed % 60\n",
|
| 513 |
+
" print(f' ⏳ {mins}m {secs}s — يجري تحميل النماذج...')\n",
|
| 514 |
+
" except RuntimeError:\n",
|
| 515 |
+
" raise\n",
|
| 516 |
+
" except Exception:\n",
|
| 517 |
+
" print(f' ⏳ {elapsed}s elapsed...')\n",
|
| 518 |
+
"\n",
|
| 519 |
+
"log_file.flush()\n",
|
| 520 |
+
"log_file.close()\n",
|
| 521 |
+
"\n",
|
| 522 |
+
"if not started:\n",
|
| 523 |
+
" print('\\n❌ Server did not start after 15 minutes.')\n",
|
| 524 |
+
" print('─── آخر سطور اللوج ───')\n",
|
| 525 |
+
" with open(log_path) as f:\n",
|
| 526 |
+
" print(f.read()[-3000:])\n",
|
| 527 |
+
"else:\n",
|
| 528 |
+
" print('\\n✅ Server started successfully!')\n",
|
| 529 |
+
"\n",
|
| 530 |
+
" # ─── Cloudflare Tunnel (مجاني — بدون حساب) ──────────────────\n",
|
| 531 |
+
" import subprocess, re, threading, time\n",
|
| 532 |
+
"\n",
|
| 533 |
+
" # تثبيت cloudflared\n",
|
| 534 |
+
" subprocess.run(\n",
|
| 535 |
+
" ['wget', '-q', 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64',\n",
|
| 536 |
+
" '-O', '/usr/local/bin/cloudflared'],\n",
|
| 537 |
+
" check=True\n",
|
| 538 |
+
" )\n",
|
| 539 |
+
" subprocess.run(['chmod', '+x', '/usr/local/bin/cloudflared'], check=True)\n",
|
| 540 |
+
" print('✅ cloudflared installed')\n",
|
| 541 |
+
"\n",
|
| 542 |
+
" # تشغيل النفق\n",
|
| 543 |
+
" cf_log = open('/content/cloudflared.log', 'w')\n",
|
| 544 |
+
" cf_proc = subprocess.Popen(\n",
|
| 545 |
+
" ['cloudflared', 'tunnel', '--url', 'http://localhost:8000'],\n",
|
| 546 |
+
" stdout=cf_log, stderr=subprocess.STDOUT\n",
|
| 547 |
+
" )\n",
|
| 548 |
+
"\n",
|
| 549 |
+
" # انتظار ظهور الرابط في اللوج\n",
|
| 550 |
+
" public_url = None\n",
|
| 551 |
+
" for _ in range(30):\n",
|
| 552 |
+
" time.sleep(2)\n",
|
| 553 |
+
" cf_log.flush()\n",
|
| 554 |
+
" try:\n",
|
| 555 |
+
" with open('/content/cloudflared.log') as f:\n",
|
| 556 |
+
" content = f.read()\n",
|
| 557 |
+
" match = re.search(r'https://[a-z0-9\\-]+\\.trycloudflare\\.com', content)\n",
|
| 558 |
+
" if match:\n",
|
| 559 |
+
" public_url = match.group(0)\n",
|
| 560 |
+
" break\n",
|
| 561 |
+
" except Exception:\n",
|
| 562 |
+
" pass\n",
|
| 563 |
+
"\n",
|
| 564 |
+
" if public_url:\n",
|
| 565 |
+
" print('\\n' + '='*60)\n",
|
| 566 |
+
" print('🌐 PUBLIC URL (الرابط العام — Cloudflare):')\n",
|
| 567 |
+
" print(f' {public_url}')\n",
|
| 568 |
+
" print('='*60)\n",
|
| 569 |
+
" print(f'📖 Swagger: {public_url}/docs')\n",
|
| 570 |
+
" print(f'💚 Health: {public_url}/health')\n",
|
| 571 |
+
" print(f'🗄️ DB Summary: {public_url}/api/db/summary')\n",
|
| 572 |
+
" print('='*60)\n",
|
| 573 |
+
" print('\\n📋 انسخي هذا السطر والصقيه في الفرونت (index.html):')\n",
|
| 574 |
+
" print(f\" const BASE = '{public_url}';\")\n",
|
| 575 |
+
" else:\n",
|
| 576 |
+
" print('⚠️ Cloudflare tunnel URL not found, check /content/cloudflared.log')\n",
|
| 577 |
+
" with open('/content/cloudflared.log') as f:\n",
|
| 578 |
+
" print(f.read()[-1000:])"
|
| 579 |
+
]
|
| 580 |
+
},
|
| 581 |
+
{
|
| 582 |
+
"cell_type": "code",
|
| 583 |
+
"execution_count": null,
|
| 584 |
+
"metadata": {},
|
| 585 |
+
"outputs": [],
|
| 586 |
+
"source": [
|
| 587 |
+
"# ─── تشغيل Cloudflare Tunnel بشكل منفصل (إذا فشل مع الخلية 8) ─\n",
|
| 588 |
+
"# شغّلي هذه الخلية فقط إذا كان الخادم يعمل لكن الـ tunnel فشل\n",
|
| 589 |
+
"\n",
|
| 590 |
+
"import subprocess, re, time, os\n",
|
| 591 |
+
"\n",
|
| 592 |
+
"# تثبيت cloudflared إذا لم يُثبَّت\n",
|
| 593 |
+
"if not os.path.isfile('/usr/local/bin/cloudflared'):\n",
|
| 594 |
+
" subprocess.run(\n",
|
| 595 |
+
" ['wget', '-q',\n",
|
| 596 |
+
" 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64',\n",
|
| 597 |
+
" '-O', '/usr/local/bin/cloudflared'],\n",
|
| 598 |
+
" check=True\n",
|
| 599 |
+
" )\n",
|
| 600 |
+
" subprocess.run(['chmod', '+x', '/usr/local/bin/cloudflared'], check=True)\n",
|
| 601 |
+
" print('✅ cloudflared installed')\n",
|
| 602 |
+
"else:\n",
|
| 603 |
+
" print('✅ cloudflared already installed')\n",
|
| 604 |
+
"\n",
|
| 605 |
+
"# تشغيل النفق\n",
|
| 606 |
+
"cf_log_path = '/content/cloudflared.log'\n",
|
| 607 |
+
"cf_log = open(cf_log_path, 'w')\n",
|
| 608 |
+
"cf_proc = subprocess.Popen(\n",
|
| 609 |
+
" ['cloudflared', 'tunnel', '--url', 'http://localhost:8000'],\n",
|
| 610 |
+
" stdout=cf_log, stderr=subprocess.STDOUT\n",
|
| 611 |
+
")\n",
|
| 612 |
+
"\n",
|
| 613 |
+
"print('⏳ Opening Cloudflare tunnel...')\n",
|
| 614 |
+
"\n",
|
| 615 |
+
"public_url = None\n",
|
| 616 |
+
"for _ in range(30):\n",
|
| 617 |
+
" time.sleep(2)\n",
|
| 618 |
+
" cf_log.flush()\n",
|
| 619 |
+
" try:\n",
|
| 620 |
+
" with open(cf_log_path) as f:\n",
|
| 621 |
+
" content = f.read()\n",
|
| 622 |
+
" match = re.search(r'https://[a-z0-9\\-]+\\.trycloudflare\\.com', content)\n",
|
| 623 |
+
" if match:\n",
|
| 624 |
+
" public_url = match.group(0)\n",
|
| 625 |
+
" break\n",
|
| 626 |
+
" except Exception:\n",
|
| 627 |
+
" pass\n",
|
| 628 |
+
"\n",
|
| 629 |
+
"if public_url:\n",
|
| 630 |
+
" print('\\n' + '='*60)\n",
|
| 631 |
+
" print(f'🌐 PUBLIC URL: {public_url}')\n",
|
| 632 |
+
" print(f'📖 Swagger: {public_url}/docs')\n",
|
| 633 |
+
" print(f'💚 Health: {public_url}/health')\n",
|
| 634 |
+
" print('='*60)\n",
|
| 635 |
+
" print('\\n📋 الصقي هذا السطر في index.html:')\n",
|
| 636 |
+
" print(f\" const BASE = '{public_url}';\")\n",
|
| 637 |
+
"else:\n",
|
| 638 |
+
" print('❌ لم يُعثر على URL. اللوج:')\n",
|
| 639 |
+
" with open(cf_log_path) as f:\n",
|
| 640 |
+
" print(f.read())\n"
|
| 641 |
+
]
|
| 642 |
+
},
|
| 643 |
+
{
|
| 644 |
+
"cell_type": "markdown",
|
| 645 |
+
"metadata": {},
|
| 646 |
+
"source": [
|
| 647 |
+
"---\n",
|
| 648 |
+
"## الخلية 9: ربط الفرونت بـ Cloudflare URL\n",
|
| 649 |
+
"\n",
|
| 650 |
+
"بعد تشغيل الخلية السابقة، ستظهر رسالة مثل:\n",
|
| 651 |
+
"```\n",
|
| 652 |
+
"🌐 PUBLIC URL: https://xxxx-xxxx.trycloudflare.com\n",
|
| 653 |
+
"```\n",
|
| 654 |
+
"\n",
|
| 655 |
+
"**الخلية أدناه تُحدّث الفرونت تلقائياً** — أو يمكنك التعديل يدوياً في `index.html`:\n",
|
| 656 |
+
"```javascript\n",
|
| 657 |
+
"const BASE = 'https://xxxx-xxxx.trycloudflare.com';\n",
|
| 658 |
+
"```\n"
|
| 659 |
+
]
|
| 660 |
+
},
|
| 661 |
+
{
|
| 662 |
+
"cell_type": "code",
|
| 663 |
+
"execution_count": null,
|
| 664 |
+
"metadata": {},
|
| 665 |
+
"outputs": [],
|
| 666 |
+
"source": [
|
| 667 |
+
"import subprocess, re, time, os\n",
|
| 668 |
+
"\n",
|
| 669 |
+
"# ── الخطوة 1: تثبيت cloudflared ──────────────────────────────\n",
|
| 670 |
+
"if not os.path.isfile('/usr/local/bin/cloudflared'):\n",
|
| 671 |
+
" subprocess.run([\n",
|
| 672 |
+
" 'wget', '-q',\n",
|
| 673 |
+
" 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64',\n",
|
| 674 |
+
" '-O', '/usr/local/bin/cloudflared'\n",
|
| 675 |
+
" ], check=True)\n",
|
| 676 |
+
" subprocess.run(['chmod', '+x', '/usr/local/bin/cloudflared'], check=True)\n",
|
| 677 |
+
" print('✅ cloudflared installed')\n",
|
| 678 |
+
"else:\n",
|
| 679 |
+
" print('✅ cloudflared ready')\n",
|
| 680 |
+
"\n",
|
| 681 |
+
"# ── الخطوة 2: تشغيل النفق ────────────────────────────────────\n",
|
| 682 |
+
"cf_log_path = '/content/cf.log'\n",
|
| 683 |
+
"cf_log = open(cf_log_path, 'w')\n",
|
| 684 |
+
"subprocess.Popen(\n",
|
| 685 |
+
" ['cloudflared', 'tunnel', '--url', 'http://localhost:8000'],\n",
|
| 686 |
+
" stdout=cf_log, stderr=subprocess.STDOUT\n",
|
| 687 |
+
")\n",
|
| 688 |
+
"\n",
|
| 689 |
+
"print('⏳ Opening Cloudflare tunnel...')\n",
|
| 690 |
+
"\n",
|
| 691 |
+
"# ── الخطوة 3: انتظار الرابط ───────────────────────────────────\n",
|
| 692 |
+
"public_url = None\n",
|
| 693 |
+
"for _ in range(30):\n",
|
| 694 |
+
" time.sleep(2)\n",
|
| 695 |
+
" cf_log.flush()\n",
|
| 696 |
+
" with open(cf_log_path) as f:\n",
|
| 697 |
+
" content = f.read()\n",
|
| 698 |
+
" match = re.search(r'https://[a-z0-9\\-]+\\.trycloudflare\\.com', content)\n",
|
| 699 |
+
" if match:\n",
|
| 700 |
+
" public_url = match.group(0)\n",
|
| 701 |
+
" break\n",
|
| 702 |
+
"\n",
|
| 703 |
+
"if not public_url:\n",
|
| 704 |
+
" print('❌ Tunnel failed. Log:')\n",
|
| 705 |
+
" with open(cf_log_path) as f: print(f.read())\n",
|
| 706 |
+
"else:\n",
|
| 707 |
+
" # ── الخطوة 4: تحديث index.html تلقائياً ─────────────────\n",
|
| 708 |
+
" frontend_path = '/content/drive/MyDrive/murshid_frontend/index.html'\n",
|
| 709 |
+
"\n",
|
| 710 |
+
" if os.path.isfile(frontend_path):\n",
|
| 711 |
+
" with open(frontend_path, 'r', encoding='utf-8') as f:\n",
|
| 712 |
+
" html = f.read()\n",
|
| 713 |
+
" html_updated = re.sub(r\"const BASE = '[^']*';\",\n",
|
| 714 |
+
" f\"const BASE = '{public_url}';\", html)\n",
|
| 715 |
+
" with open(frontend_path, 'w', encoding='utf-8') as f:\n",
|
| 716 |
+
" f.write(html_updated)\n",
|
| 717 |
+
" print(f'✅ index.html updated automatically')\n",
|
| 718 |
+
" else:\n",
|
| 719 |
+
" print(f'⚠️ index.html not found — عدّليه يدوياً')\n",
|
| 720 |
+
"\n",
|
| 721 |
+
" print('\\n' + '='*60)\n",
|
| 722 |
+
" print(f'🌐 PUBLIC URL: {public_url}')\n",
|
| 723 |
+
" print(f'📖 Swagger: {public_url}/docs')\n",
|
| 724 |
+
" print(f'💚 Health: {public_url}/health')\n",
|
| 725 |
+
" print(f'🖥️ Frontend: {public_url}/index.html')\n",
|
| 726 |
+
" print('='*60)\n",
|
| 727 |
+
" print(f\"\\n📋 const BASE = '{public_url}';\")"
|
| 728 |
+
]
|
| 729 |
+
},
|
| 730 |
+
{
|
| 731 |
+
"cell_type": "markdown",
|
| 732 |
+
"metadata": {},
|
| 733 |
+
"source": [
|
| 734 |
+
"---\n",
|
| 735 |
+
"## الخلية 10: اختبار الـ API\n"
|
| 736 |
+
]
|
| 737 |
+
},
|
| 738 |
+
{
|
| 739 |
+
"cell_type": "code",
|
| 740 |
+
"execution_count": null,
|
| 741 |
+
"metadata": {},
|
| 742 |
+
"outputs": [],
|
| 743 |
+
"source": [
|
| 744 |
+
"import urllib.request, json\n",
|
| 745 |
+
"\n",
|
| 746 |
+
"# ─── Health Check ────────────────────────────────────────────\n",
|
| 747 |
+
"with urllib.request.urlopen('http://localhost:8000/health') as r:\n",
|
| 748 |
+
" health = json.load(r)\n",
|
| 749 |
+
"\n",
|
| 750 |
+
"print('=== Health Check ===')\n",
|
| 751 |
+
"print(f\" status: {health['status']}\")\n",
|
| 752 |
+
"print(f\" pipeline_mode: {health['pipeline_mode']}\")\n",
|
| 753 |
+
"print(f\" llama_loaded: {health['components']['llama_loaded']}\")\n",
|
| 754 |
+
"print(f\" embedder_loaded: {health['components']['embedder_loaded']}\")\n",
|
| 755 |
+
"print(f\" logreg_loaded: {health['components']['logreg_loaded']}\")\n",
|
| 756 |
+
"print(f\" cuda_available: {health['components']['cuda_available']}\")\n",
|
| 757 |
+
"\n",
|
| 758 |
+
"mode = health.get('pipeline_mode', 'unknown')\n",
|
| 759 |
+
"if mode == 'full':\n",
|
| 760 |
+
" print('\\n✅ FULL mode — نتائج مطابقة 100% للدفتر')\n",
|
| 761 |
+
"elif mode == 'local':\n",
|
| 762 |
+
" print('\\n⚠️ LOCAL mode — LLaMA لم يُحمَّل، تحققي من MURSHID_SKIP_LLM=false')\n",
|
| 763 |
+
"else:\n",
|
| 764 |
+
" print('\\n❌ LITE mode — تحققي من تثبيت torch والنماذج')"
|
| 765 |
+
]
|
| 766 |
+
},
|
| 767 |
+
{
|
| 768 |
+
"cell_type": "code",
|
| 769 |
+
"execution_count": null,
|
| 770 |
+
"metadata": {},
|
| 771 |
+
"outputs": [],
|
| 772 |
+
"source": [
|
| 773 |
+
"# ─── تحليل قاعدة اختبار ──────────────────────────────────────\n",
|
| 774 |
+
"import urllib.request, json\n",
|
| 775 |
+
"\n",
|
| 776 |
+
"test_rule = '''<rule id=\"18205\" level=\"5\">\n",
|
| 777 |
+
" <if_sid>18201</if_sid>\n",
|
| 778 |
+
" <id>^634$|^4730$</id>\n",
|
| 779 |
+
" <description>Windows: Security Enabled Global Group Deleted</description>\n",
|
| 780 |
+
" <mitre><id>T1484</id></mitre>\n",
|
| 781 |
+
" <group>group_deleted,win_group_deleted</group>\n",
|
| 782 |
+
"</rule>'''\n",
|
| 783 |
+
"\n",
|
| 784 |
+
"payload = json.dumps({'rule_xml': test_rule}).encode()\n",
|
| 785 |
+
"req = urllib.request.Request(\n",
|
| 786 |
+
" 'http://localhost:8000/rules/analyze',\n",
|
| 787 |
+
" data=payload,\n",
|
| 788 |
+
" headers={'Content-Type': 'application/json'},\n",
|
| 789 |
+
" method='POST'\n",
|
| 790 |
+
")\n",
|
| 791 |
+
"\n",
|
| 792 |
+
"with urllib.request.urlopen(req) as r:\n",
|
| 793 |
+
" result = json.load(r)\n",
|
| 794 |
+
"\n",
|
| 795 |
+
"print('=== Analyze Result ===')\n",
|
| 796 |
+
"print(f\" rule_id: {result['rule_id']}\")\n",
|
| 797 |
+
"print(f\" pipeline_mode: {result['pipeline_mode']}\")\n",
|
| 798 |
+
"print(f\" summary: {result['summary']}\")\n",
|
| 799 |
+
"print(f\"\\n TOP 5 Techniques:\")\n",
|
| 800 |
+
"print(f\" {'Technique':<15} {'Conf%':>8} {'Proba':>8} {'Thr':>6} {'Gap':>8} {'Pred':>6}\")\n",
|
| 801 |
+
"print(f\" {'-'*55}\")\n",
|
| 802 |
+
"for r in result['all_results'][:5]:\n",
|
| 803 |
+
" pred = '✅' if r['predicted'] else ' '\n",
|
| 804 |
+
" print(f\" {pred} {r['technique_id']:<13} {r['confidence_percent']:>7.2f}%\"\n",
|
| 805 |
+
" f\" {r['proba']:>8.4f} {r['threshold']:>6.2f} {r['gap']:>+8.4f}\")\n",
|
| 806 |
+
"\n",
|
| 807 |
+
"print(f\"\\n Detected: {len(result['detected'])} technique(s)\")\n",
|
| 808 |
+
"for d in result['detected']:\n",
|
| 809 |
+
" print(f\" ✅ {d['technique_id']} — {d['confidence_percent']}%\")"
|
| 810 |
+
]
|
| 811 |
+
},
|
| 812 |
+
{
|
| 813 |
+
"cell_type": "code",
|
| 814 |
+
"execution_count": null,
|
| 815 |
+
"metadata": {},
|
| 816 |
+
"outputs": [],
|
| 817 |
+
"source": [
|
| 818 |
+
"# ─── قوالب WQL للتقنية المكتشفة ──────────────────────────────\n",
|
| 819 |
+
"if result['detected']:\n",
|
| 820 |
+
" top_technique = result['detected'][0]['technique_id']\n",
|
| 821 |
+
"\n",
|
| 822 |
+
" with urllib.request.urlopen(f'http://localhost:8000/queries/{top_technique}') as r:\n",
|
| 823 |
+
" queries = json.load(r)\n",
|
| 824 |
+
"\n",
|
| 825 |
+
" print(f'=== WQL Templates for {top_technique} ===')\n",
|
| 826 |
+
" for i, q in enumerate(queries, 1):\n",
|
| 827 |
+
" print(f\"\\n [{i}] {q.get('purpose', 'N/A')}\")\n",
|
| 828 |
+
" print(f\" Query: {q['wql_query'][:120]}...\")\n",
|
| 829 |
+
" print(f\" Note: {q.get('note', 'N/A')}\")"
|
| 830 |
+
]
|
| 831 |
+
},
|
| 832 |
+
{
|
| 833 |
+
"cell_type": "markdown",
|
| 834 |
+
"metadata": {},
|
| 835 |
+
"source": [
|
| 836 |
+
"---\n",
|
| 837 |
+
"## الخلية 11: تصدير النتائج (اختياري)\n",
|
| 838 |
+
"\n",
|
| 839 |
+
"لحفظ النتائج بصيغة JSON لاستخدامها لاحقاً على الجهاز المحلي\n"
|
| 840 |
+
]
|
| 841 |
+
},
|
| 842 |
+
{
|
| 843 |
+
"cell_type": "code",
|
| 844 |
+
"execution_count": null,
|
| 845 |
+
"metadata": {},
|
| 846 |
+
"outputs": [],
|
| 847 |
+
"source": [
|
| 848 |
+
"# ─── تحليل قائمة من القواعد وتصديرها ───────────────────────\n",
|
| 849 |
+
"import urllib.request, json, os\n",
|
| 850 |
+
"\n",
|
| 851 |
+
"# ✏️ أضيفي Rule IDs التي تريدين تحليلها\n",
|
| 852 |
+
"# يمكنك قراءتها من ملف\n",
|
| 853 |
+
"test_ids_path = f'{NEEDED_PATH}/test_rule_ids.json'\n",
|
| 854 |
+
"\n",
|
| 855 |
+
"if os.path.isfile(test_ids_path):\n",
|
| 856 |
+
" with open(test_ids_path) as f:\n",
|
| 857 |
+
" rule_ids = json.load(f)\n",
|
| 858 |
+
" print(f'Loaded {len(rule_ids)} rule IDs from test_rule_ids.json')\n",
|
| 859 |
+
"else:\n",
|
| 860 |
+
" # قواعد تجريبية\n",
|
| 861 |
+
" rule_ids = ['18205']\n",
|
| 862 |
+
" print('Using default test rule')\n",
|
| 863 |
+
"\n",
|
| 864 |
+
"print(f'Processing {len(rule_ids)} rules...')\n",
|
| 865 |
+
"\n",
|
| 866 |
+
"export_results = []\n",
|
| 867 |
+
"\n",
|
| 868 |
+
"for rule_id in rule_ids:\n",
|
| 869 |
+
" try:\n",
|
| 870 |
+
" with urllib.request.urlopen(f'http://localhost:8000/results/{rule_id}') as r:\n",
|
| 871 |
+
" data = json.load(r)\n",
|
| 872 |
+
" data['source'] = 'colab_full_mode'\n",
|
| 873 |
+
" export_results.append(data)\n",
|
| 874 |
+
" detected = len(data.get('detected', []))\n",
|
| 875 |
+
" top = data['mappings'][0] if data['mappings'] else {}\n",
|
| 876 |
+
" print(f\" ✅ {rule_id}: {top.get('technique_id','?')} ({top.get('confidence_percent','?')}%) — {detected} detected\")\n",
|
| 877 |
+
" except Exception as e:\n",
|
| 878 |
+
" print(f\" ⚠️ {rule_id}: {e}\")\n",
|
| 879 |
+
"\n",
|
| 880 |
+
"# حفظ النتائج\n",
|
| 881 |
+
"export_path = f'{NEEDED_PATH}/murshid_full_results.json'\n",
|
| 882 |
+
"with open(export_path, 'w', encoding='utf-8') as f:\n",
|
| 883 |
+
" json.dump(export_results, f, ensure_ascii=False, indent=2)\n",
|
| 884 |
+
"\n",
|
| 885 |
+
"print(f'\\n✅ Exported {len(export_results)} results to:')\n",
|
| 886 |
+
"print(f' {export_path}')\n",
|
| 887 |
+
"print('\\nيمكنك الآن استيراد هذا الملف في الباكند المحلي')"
|
| 888 |
+
]
|
| 889 |
+
},
|
| 890 |
+
{
|
| 891 |
+
"cell_type": "markdown",
|
| 892 |
+
"metadata": {},
|
| 893 |
+
"source": [
|
| 894 |
+
"---\n",
|
| 895 |
+
"## الخلية 12: إيقاف الخادم (عند الانتهاء)\n"
|
| 896 |
+
]
|
| 897 |
+
},
|
| 898 |
+
{
|
| 899 |
+
"cell_type": "code",
|
| 900 |
+
"execution_count": null,
|
| 901 |
+
"metadata": {},
|
| 902 |
+
"outputs": [],
|
| 903 |
+
"source": [
|
| 904 |
+
"# إيقاف الخادم وإغلاق ngrok\n",
|
| 905 |
+
"try:\n",
|
| 906 |
+
" from pyngrok import ngrok\n",
|
| 907 |
+
" ngrok.kill()\n",
|
| 908 |
+
" print('✅ ngrok tunnel closed')\n",
|
| 909 |
+
"except Exception:\n",
|
| 910 |
+
" pass\n",
|
| 911 |
+
"\n",
|
| 912 |
+
"try:\n",
|
| 913 |
+
" server_proc.terminate()\n",
|
| 914 |
+
" print('✅ Server stopped')\n",
|
| 915 |
+
"except Exception:\n",
|
| 916 |
+
" pass"
|
| 917 |
+
]
|
| 918 |
+
},
|
| 919 |
+
{
|
| 920 |
+
"cell_type": "markdown",
|
| 921 |
+
"metadata": {},
|
| 922 |
+
"source": [
|
| 923 |
+
"---\n",
|
| 924 |
+
"## ملاحظات مهمة\n",
|
| 925 |
+
"\n",
|
| 926 |
+
"### إذا انقطع الاتصال بـ Colab\n",
|
| 927 |
+
"- الخادم يتوقف تلقائياً\n",
|
| 928 |
+
"- أعيدي تشغيل الخلايا من الخلية 8\n",
|
| 929 |
+
"- رابط ngrok سيتغيّر — عدّلي الفرونت بالرابط الجديد\n",
|
| 930 |
+
"\n",
|
| 931 |
+
"### إذا ظهر خطأ في LLaMA\n",
|
| 932 |
+
"- تأكدي أن لديك صلاحية الوصول للنموذج: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct\n",
|
| 933 |
+
"- تأكدي من صحة HF_TOKEN\n",
|
| 934 |
+
"\n",
|
| 935 |
+
"### المقارنة مع الجهاز المحلي\n",
|
| 936 |
+
"| | Colab (FULL) | الجهاز المحلي (LOCAL) |\n",
|
| 937 |
+
"|--|-------------|----------------------|\n",
|
| 938 |
+
"| LLaMA | ✅ | ❌ |\n",
|
| 939 |
+
"| T1484 confidence | **94.76%** | 89.29% |\n",
|
| 940 |
+
"| القرار النهائي | T1484 ✅ | T1484 ✅ |\n",
|
| 941 |
+
"\n",
|
| 942 |
+
"### للعرض التقديمي\n",
|
| 943 |
+
"1. شغّلي الخلايا 1-8 مسبقاً (قبل العرض بـ 15 دقيقة)\n",
|
| 944 |
+
"2. انسخي رابط ngrok\n",
|
| 945 |
+
"3. عدّلي الفرونت\n",
|
| 946 |
+
"4. افتحي `https://xxxx.ngrok-free.app/index.html`\n"
|
| 947 |
+
]
|
| 948 |
+
}
|
| 949 |
+
],
|
| 950 |
+
"metadata": {
|
| 951 |
+
"accelerator": "GPU",
|
| 952 |
+
"colab": {
|
| 953 |
+
"gpuType": "T4",
|
| 954 |
+
"machine_shape": "hm",
|
| 955 |
+
"provenance": []
|
| 956 |
+
},
|
| 957 |
+
"kernelspec": {
|
| 958 |
+
"display_name": "Python 3",
|
| 959 |
+
"name": "python3"
|
| 960 |
+
},
|
| 961 |
+
"language_info": {
|
| 962 |
+
"name": "python"
|
| 963 |
+
}
|
| 964 |
+
},
|
| 965 |
+
"nbformat": 4,
|
| 966 |
+
"nbformat_minor": 0
|
| 967 |
+
}
|
MurshidBackend_Colab_Report.md
ADDED
|
@@ -0,0 +1,545 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# تقرير تقني: آلية عمل MurshidBackend_Colab.ipynb
|
| 2 |
+
|
| 3 |
+
## مشروع مُرشِد | From Alerts to Guidance
|
| 4 |
+
### MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## 1. نظرة عامة
|
| 9 |
+
|
| 10 |
+
`MurshidBackend_Colab.ipynb` هو دفتر Jupyter مُصمَّم لتشغيل الباكند الكامل لمشروع مُرشِد على بيئة **Google Colab** باستخدام **GPU (Tesla T4)**، مما يُتيح تشغيل نموذج **LLaMA 3 8B** بتكميم 4-bit لتوليد ملخصات دلالية غنية لقواعد Wazuh XML، وذلك على عكس البيئة المحلية التي تعمل بدون LLaMA (LOCAL mode).
|
| 11 |
+
|
| 12 |
+
### الهدف الرئيسي
|
| 13 |
+
تشغيل **FULL mode** للـ pipeline:
|
| 14 |
+
```
|
| 15 |
+
قاعدة Wazuh XML
|
| 16 |
+
↓
|
| 17 |
+
LLaMA 3 8B ←── ملخص دلالي غني (GPU)
|
| 18 |
+
↓
|
| 19 |
+
SecureBERT+ ←── 768-dim embedding
|
| 20 |
+
↓
|
| 21 |
+
Logistic Regression ←── confidence scores لكل تقنية
|
| 22 |
+
↓
|
| 23 |
+
FastAPI + SQLite ←── تخزين وخدمة النتائج
|
| 24 |
+
↓
|
| 25 |
+
Cloudflare Tunnel ←── رابط عام للفرونت
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
---
|
| 29 |
+
|
| 30 |
+
## 2. المتطلبات قبل التشغيل
|
| 31 |
+
|
| 32 |
+
### 2.1 إعداد Google Colab
|
| 33 |
+
| المتطلب | التفاصيل |
|
| 34 |
+
|---------|----------|
|
| 35 |
+
| **GPU** | Tesla T4 — يُفعَّل من: `Runtime → Change runtime type → T4 GPU` |
|
| 36 |
+
| **الذاكرة** | High RAM (machine_shape: "hm") |
|
| 37 |
+
| **الإنترنت** | مفعَّل لتنزيل النماذج من Hugging Face |
|
| 38 |
+
|
| 39 |
+
### 2.2 الملفات المطلوبة على Google Drive
|
| 40 |
+
```
|
| 41 |
+
MyDrive/
|
| 42 |
+
├── murshid_backend_for_drive.zip ← ملفات الباكند مضغوطة (44 KB)
|
| 43 |
+
│ أو
|
| 44 |
+
├── murshid_backend/ ← المجلد مستخرج مسبقاً
|
| 45 |
+
│ ├── app/
|
| 46 |
+
│ │ ├── main.py
|
| 47 |
+
│ │ ├── config.py
|
| 48 |
+
│ │ ├── api/routes/
|
| 49 |
+
│ │ ├── ml/
|
| 50 |
+
│ │ ├── models/
|
| 51 |
+
│ │ ├── services/
|
| 52 |
+
│ │ └── repositories/
|
| 53 |
+
│ ├── alembic/
|
| 54 |
+
│ ├── scripts/
|
| 55 |
+
│ ├── alembic.ini
|
| 56 |
+
│ └── requirements.txt
|
| 57 |
+
│
|
| 58 |
+
└── Needed/
|
| 59 |
+
├── murshid_logreg_pipeline_manual_oof_pcatuned.joblib ← نموذج LogReg
|
| 60 |
+
├── murshid_logreg_thresholds_manual_oof_pcatuned.npy ← عتبات التنبؤ
|
| 61 |
+
├── murshid_label_columns.json ← أسماء التقنيات الـ 20
|
| 62 |
+
└── murshid_query_template_structure_clean_shared.xlsx ← 60 قالب WQL
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
### 2.3 Hugging Face Token
|
| 66 |
+
مطلوب للوصول إلى نموذج `meta-llama/Meta-Llama-3-8B-Instruct`:
|
| 67 |
+
- يُضاف في `Colab Secrets` باسم `HF_TOKEN`
|
| 68 |
+
- أو مباشرةً في خلية 5 من الدفتر
|
| 69 |
+
|
| 70 |
+
---
|
| 71 |
+
|
| 72 |
+
## 3. شرح الخلايا بالتفصيل
|
| 73 |
+
|
| 74 |
+
### الخلية 1: التحقق من GPU
|
| 75 |
+
|
| 76 |
+
**الهدف:** التأكد من وجود GPU قبل البدء.
|
| 77 |
+
|
| 78 |
+
```python
|
| 79 |
+
import torch
|
| 80 |
+
print('CUDA available:', torch.cuda.is_available())
|
| 81 |
+
print('GPU:', torch.cuda.get_device_name(0))
|
| 82 |
+
print('Memory:', round(torch.cuda.get_device_properties(0).total_memory / 1e9, 1), 'GB')
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
**المخرج المتوقع:**
|
| 86 |
+
```
|
| 87 |
+
CUDA available: True
|
| 88 |
+
GPU: Tesla T4
|
| 89 |
+
Memory: 15.8 GB
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
**ماذا يحدث إذا لم يكن هناك GPU؟**
|
| 93 |
+
- LLaMA لن يُحمَّل (يحتاج CUDA)
|
| 94 |
+
- الخادم سيعمل بـ LOCAL mode فقط (بدون تلخيص)
|
| 95 |
+
|
| 96 |
+
---
|
| 97 |
+
|
| 98 |
+
### الخلية 2: تحميل Google Drive والتحقق من الملفات
|
| 99 |
+
|
| 100 |
+
**الهدف:** ربط Colab بـ Google Drive والتحقق من وجود جميع الملفات المطلوبة.
|
| 101 |
+
|
| 102 |
+
```python
|
| 103 |
+
from google.colab import drive
|
| 104 |
+
drive.mount('/content/drive')
|
| 105 |
+
|
| 106 |
+
NEEDED_PATH = '/content/drive/MyDrive/Needed'
|
| 107 |
+
BACKEND_PATH = '/content/drive/MyDrive/murshid_backend'
|
| 108 |
+
ZIP_PATH = '/content/drive/MyDrive/murshid_backend_for_drive.zip'
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
**ما يتحقق منه:**
|
| 112 |
+
| الملف | النوع | الحالة |
|
| 113 |
+
|-------|-------|--------|
|
| 114 |
+
| `murshid_logreg_pipeline_manual_oof_pcatuned.joblib` | إلزامي | ✅ / ❌ |
|
| 115 |
+
| `murshid_logreg_thresholds_manual_oof_pcatuned.npy` | إلزامي | ✅ / ❌ |
|
| 116 |
+
| `murshid_label_columns.json` | إلزامي | ✅ / ❌ |
|
| 117 |
+
| `murshid_query_template_structure_clean_shared.xlsx` | اختياري | ✅ / ⚠️ |
|
| 118 |
+
| `murshid_backend/` أو `.zip` | إلزامي | ✅ / ❌ |
|
| 119 |
+
|
| 120 |
+
---
|
| 121 |
+
|
| 122 |
+
### الخلية 3: تجهيز الباكند في /content
|
| 123 |
+
|
| 124 |
+
**الهدف:** نقل ملفات الباكند من Drive إلى `/content` لتسريع القراءة (Drive أبطأ في I/O).
|
| 125 |
+
|
| 126 |
+
**المنطق الذكي:**
|
| 127 |
+
```
|
| 128 |
+
هل murshid_backend/ موجود على Drive؟
|
| 129 |
+
↓ نعم → انسخ مباشرةً إلى /content
|
| 130 |
+
↓ لا
|
| 131 |
+
هل murshid_backend_for_drive.zip موجود؟
|
| 132 |
+
↓ نعم → استخرجه إلى Drive أولاً ثم انسخ
|
| 133 |
+
↓ لا
|
| 134 |
+
→ ❌ خطأ: "ارفعي ZIP إلى Google Drive"
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
**الخطوات المنفَّذة:**
|
| 138 |
+
1. **استخراج ZIP** (إذا لزم) إلى `MyDrive/`
|
| 139 |
+
2. **نسخ** `murshid_backend/` إلى `/content/murshid_backend/` (بدون pycache وملفات مؤقتة)
|
| 140 |
+
3. **إضافة** `/content/murshid_backend` إلى `sys.path`
|
| 141 |
+
4. **تغيير** working directory إلى `/content/murshid_backend`
|
| 142 |
+
|
| 143 |
+
**لماذا النسخ إلى /content؟**
|
| 144 |
+
- Drive يعتمد على FUSE mount = بطيء للقراءة المتكررة
|
| 145 |
+
- `/content` على SSD محلي للـ VM = أسرع بـ 5-10x
|
| 146 |
+
|
| 147 |
+
---
|
| 148 |
+
|
| 149 |
+
### الخلية 4: تثبيت المتطلبات
|
| 150 |
+
|
| 151 |
+
**الهدف:** تثبيت جميع المكتبات اللازمة لتشغيل الباكند.
|
| 152 |
+
|
| 153 |
+
**المكتبات المثبَّتة:**
|
| 154 |
+
|
| 155 |
+
| المكتبة | الإصدار | الغرض |
|
| 156 |
+
|---------|---------|--------|
|
| 157 |
+
| `fastapi` | 0.115.0 | إطار API |
|
| 158 |
+
| `uvicorn` | 0.32.0 | خادم ASGI |
|
| 159 |
+
| `pydantic` | 2.9.0 | تحقق من البيانات |
|
| 160 |
+
| `sqlalchemy` | 2.0.0 | ORM |
|
| 161 |
+
| `alembic` | 1.13.0 | هجرة DB |
|
| 162 |
+
| `scikit-learn` | **1.6.1** | نموذج LogReg (يطابق بيئة التدريب) |
|
| 163 |
+
| `bitsandbytes` | ≥0.46.1 | تكميم LLaMA 4-bit |
|
| 164 |
+
| `accelerate` | آخر نسخة | `device_map="auto"` للـ GPU |
|
| 165 |
+
| `openpyxl` | آخر نسخة | قراءة ملف Excel |
|
| 166 |
+
| `lxml` | آخر نسخة | معالجة XML |
|
| 167 |
+
| `pyngrok` | آخر نسخة | (احتياطي — غير مستخدم) |
|
| 168 |
+
|
| 169 |
+
> **ملاحظة مهمة:** `scikit-learn==1.6.1` محدَّد بدقة لأن ملفات joblib دُرِّبت بهذه النسخة — استخدام نسخة مختلفة يُنتج تحذيرات `InconsistentVersionWarning`.
|
| 170 |
+
|
| 171 |
+
---
|
| 172 |
+
|
| 173 |
+
### الخلية 5: إعداد ملف .env
|
| 174 |
+
|
| 175 |
+
**الهدف:** إنشاء ملف الإعدادات لتشغيل FULL mode.
|
| 176 |
+
|
| 177 |
+
**محتوى الملف المُولَّد:**
|
| 178 |
+
```env
|
| 179 |
+
MURSHID_DB_URL=sqlite:////content/murshid.db
|
| 180 |
+
MURSHID_MODELS_DIR=/content/drive/MyDrive/Needed
|
| 181 |
+
HF_TOKEN=****
|
| 182 |
+
MURSHID_SKIP_LLM=false ← مفتاح FULL mode
|
| 183 |
+
SECRET_KEY=murshid_colab_2026
|
| 184 |
+
LLAMA_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
|
| 185 |
+
EMBED_MODEL_ID=ehsanaghaei/SecureBERT_Plus
|
| 186 |
+
LOGREG_JOBLIB=murshid_logreg_pipeline_manual_oof_pcatuned.joblib
|
| 187 |
+
LOGREG_THRESHOLDS_NPY=murshid_logreg_thresholds_manual_oof_pcatuned.npy
|
| 188 |
+
LABEL_COLUMNS_JSON=murshid_label_columns.json
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
**الفرق بين FULL و LOCAL mode:**
|
| 192 |
+
| المتغير | FULL mode | LOCAL mode |
|
| 193 |
+
|---------|-----------|------------|
|
| 194 |
+
| `MURSHID_SKIP_LLM` | `false` | `true` |
|
| 195 |
+
| LLaMA يُحمَّل؟ | ✅ نعم | ❌ لا |
|
| 196 |
+
| جودة التلخيص | عالية | الوصف الخام فقط |
|
| 197 |
+
| T1484 confidence (مثال) | **94.76%** | 89.29% |
|
| 198 |
+
|
| 199 |
+
---
|
| 200 |
+
|
| 201 |
+
### الخلية 6: تهجير قاعدة البيانات (Alembic)
|
| 202 |
+
|
| 203 |
+
**الهدف:** إنشاء جداول قاعدة البيانات SQLite.
|
| 204 |
+
|
| 205 |
+
```bash
|
| 206 |
+
python -m alembic upgrade head
|
| 207 |
+
```
|
| 208 |
+
|
| 209 |
+
**الجداول المُنشأة (من migration 0001):**
|
| 210 |
+
|
| 211 |
+
| الجدول | الغرض | مصدره في التقرير |
|
| 212 |
+
|--------|--------|-----------------|
|
| 213 |
+
| `users` | مستخدمو النظام (admin/analyst) | ER Diagram §3.2.6 |
|
| 214 |
+
| `mapping_jobs` | وظائف معالجة ملفات القواعد | ER Diagram §3.2.6 |
|
| 215 |
+
| `rules` | قواعد Wazuh المُحلَّلة | ER Diagram §3.2.6 |
|
| 216 |
+
| `techniques` | تقنيات MITRE ATT&CK | ER Diagram §3.2.6 |
|
| 217 |
+
| `rule_technique_mappings` | ربط القواعد بالتقنيات + confidence | ER Diagram §3.2.6 |
|
| 218 |
+
| `query_templates` | قوالب WQL للتحقيق | ER Diagram §3.2.6 |
|
| 219 |
+
|
| 220 |
+
> **ملاحظة:** قاعدة البيانات في `/content/murshid.db` — تُنشأ من جديد في كل جلسة Colab.
|
| 221 |
+
|
| 222 |
+
---
|
| 223 |
+
|
| 224 |
+
### الخلية 7: استيراد قوالب WQL من Excel
|
| 225 |
+
|
| 226 |
+
**الهدف:** تحميل 60 قالب WQL من ملف Excel إلى قاعدة البيانات.
|
| 227 |
+
|
| 228 |
+
**البيانات المستوردة:**
|
| 229 |
+
|
| 230 |
+
| الإحصائية | القيمة |
|
| 231 |
+
|-----------|--------|
|
| 232 |
+
| إجمالي التقنيات | 20 تقنية |
|
| 233 |
+
| إجمالي القوالب | 60 قالب (3 لكل تقنية) |
|
| 234 |
+
| التقنيات المشمولة | T1047, T1055, T1059.001, T1070.004, T1078, T1083, T1095, T1098, T1105, T1110, T1112, T1114, T1176, T1190, T1484, T1498, T1499, T1529, T1531, T1562.001 |
|
| 235 |
+
|
| 236 |
+
**مثال على قالب WQL (T1484):**
|
| 237 |
+
```
|
| 238 |
+
Template 1: Host pivot
|
| 239 |
+
agent.name:${HOST} AND win.system.eventID:(4728 OR 4729 ...) AND @timestamp:[now-24h TO now]
|
| 240 |
+
|
| 241 |
+
Template 2: Actor pivot
|
| 242 |
+
win.eventdata.SubjectUserName:${USER} AND win.system.eventID:(...) AND @timestamp:[now-24h TO now]
|
| 243 |
+
|
| 244 |
+
Template 3: High-impact target change
|
| 245 |
+
win.system.eventID:(...) AND win.eventdata.TargetUserName:("Domain Admins" OR ...) AND @timestamp:[now-24h TO now]
|
| 246 |
+
```
|
| 247 |
+
|
| 248 |
+
**منع التكرار:**
|
| 249 |
+
- يتحقق من وجود (`technique_id` + `purpose`) قبل الإضافة
|
| 250 |
+
- `replace=False` بشكل افتراضي (لا يُعيد الكتابة)
|
| 251 |
+
|
| 252 |
+
---
|
| 253 |
+
|
| 254 |
+
### الخلية 8: تشغيل FastAPI + Cloudflare Tunnel
|
| 255 |
+
|
| 256 |
+
**الهدف:** الخلية الرئيسية — تُشغّل الباكند وتُنشئ رابطاً عاماً.
|
| 257 |
+
|
| 258 |
+
#### 8.1 التحقق من bitsandbytes
|
| 259 |
+
```python
|
| 260 |
+
import bitsandbytes as bnb
|
| 261 |
+
print(f'✅ bitsandbytes {bnb.__version__}')
|
| 262 |
+
```
|
| 263 |
+
> إذا فشل: يُوقف التشغيل فوراً مع رسالة واضحة.
|
| 264 |
+
|
| 265 |
+
#### 8.2 تشغيل uvicorn
|
| 266 |
+
```bash
|
| 267 |
+
python -m uvicorn app.main:app --host 0.0.0.0 --port 8000 --log-level info
|
| 268 |
+
```
|
| 269 |
+
- `--host 0.0.0.0`: يستمع على كل الواجهات (مطلوب للـ tunnel)
|
| 270 |
+
- اللوج يُحفظ في `/content/murshid_server.log`
|
| 271 |
+
|
| 272 |
+
#### 8.3 تحميل النماذج (lifespan)
|
| 273 |
+
عند بدء الخادم تُنفَّذ `load_models()` بهذا الترتيب:
|
| 274 |
+
|
| 275 |
+
```
|
| 276 |
+
1. hf_login(token) ← 1-2 ثانية
|
| 277 |
+
2. LLaMA 3 8B-Instruct (4-bit NF4) ← 5-8 دقائق (4.5 GB)
|
| 278 |
+
- BitsAndBytesConfig: load_in_4bit=True
|
| 279 |
+
- bnb_4bit_quant_type="nf4"
|
| 280 |
+
- bnb_4bit_compute_dtype=float16
|
| 281 |
+
3. SecureBERT+ (ehsanaghaei) ← 1-2 دقيقة
|
| 282 |
+
- AutoModel + AutoTokenizer
|
| 283 |
+
- mean pooling 768-dim
|
| 284 |
+
4. LogisticRegressionModel ← < 1 ثانية
|
| 285 |
+
- joblib.load (Pipeline: PCA + OneVsRestClassifier)
|
| 286 |
+
- np.load thresholds
|
| 287 |
+
```
|
| 288 |
+
|
| 289 |
+
#### 8.4 الانتظار الذكي
|
| 290 |
+
```python
|
| 291 |
+
for i in range(180): # 15 دقيقة كحد أقصى
|
| 292 |
+
time.sleep(5)
|
| 293 |
+
# فحص /health كل 5 ثوانٍ
|
| 294 |
+
# عرض اللوج كل 30 ثانية
|
| 295 |
+
# كشف مبكر للأخطاء (ERROR, ImportError)
|
| 296 |
+
```
|
| 297 |
+
|
| 298 |
+
#### 8.5 Cloudflare Tunnel
|
| 299 |
+
```bash
|
| 300 |
+
wget cloudflared-linux-amd64 → /usr/local/bin/cloudflared
|
| 301 |
+
cloudflared tunnel --url http://localhost:8000
|
| 302 |
+
```
|
| 303 |
+
- لا يحتاج حساباً أو توكناً
|
| 304 |
+
- يُنتج رابطاً مثل: `https://xxxx.trycloudflare.com`
|
| 305 |
+
- صالح طوال جلسة Colab
|
| 306 |
+
|
| 307 |
+
---
|
| 308 |
+
|
| 309 |
+
### الخلية 9: ربط الفرونت تلقائياً
|
| 310 |
+
|
| 311 |
+
**الهدف:** تحديث `index.html` بالرابط الجديد من Cloudflare تلقائياً.
|
| 312 |
+
|
| 313 |
+
```python
|
| 314 |
+
# استخراج الرابط
|
| 315 |
+
match = re.search(r'https://[a-z0-9\-]+\.trycloudflare\.com', content)
|
| 316 |
+
public_url = match.group(0)
|
| 317 |
+
|
| 318 |
+
# تحديث index.html على Drive
|
| 319 |
+
html = re.sub(
|
| 320 |
+
r"const BASE = '[^']*';",
|
| 321 |
+
f"const BASE = '{public_url}';",
|
| 322 |
+
html
|
| 323 |
+
)
|
| 324 |
+
```
|
| 325 |
+
|
| 326 |
+
**النتيجة:**
|
| 327 |
+
```javascript
|
| 328 |
+
// قبل
|
| 329 |
+
const BASE = 'http://127.0.0.1:8000';
|
| 330 |
+
|
| 331 |
+
// بعد
|
| 332 |
+
const BASE = 'https://xxxx.trycloudflare.com';
|
| 333 |
+
```
|
| 334 |
+
|
| 335 |
+
---
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
### الخلية 10: اختبار الـ API
|
| 340 |
+
|
| 341 |
+
**الهدف:** التحقق من عمل كل مكون.
|
| 342 |
+
|
| 343 |
+
#### 10.1 Health Check
|
| 344 |
+
```python
|
| 345 |
+
urllib.request.urlopen('http://localhost:8000/health')
|
| 346 |
+
```
|
| 347 |
+
|
| 348 |
+
**المخرج المتوقع (FULL mode):**
|
| 349 |
+
```json
|
| 350 |
+
{
|
| 351 |
+
"pipeline_mode": "full",
|
| 352 |
+
"pipeline_description": "LLaMA + SecureBERT+ + LogReg",
|
| 353 |
+
"components": {
|
| 354 |
+
"llama_loaded": true,
|
| 355 |
+
"embedder_loaded": true,
|
| 356 |
+
"logreg_loaded": true,
|
| 357 |
+
"cuda_available": true
|
| 358 |
+
},
|
| 359 |
+
"all_model_files_present": true
|
| 360 |
+
}
|
| 361 |
+
```
|
| 362 |
+
|
| 363 |
+
#### 10.2 تحليل قاعدة اختبار
|
| 364 |
+
```python
|
| 365 |
+
rule_xml = '<rule id="18205" level="5">...'
|
| 366 |
+
POST http://localhost:8000/rules/analyze
|
| 367 |
+
```
|
| 368 |
+
|
| 369 |
+
**الـ pipeline خطوة بخطوة:**
|
| 370 |
+
|
| 371 |
+
```
|
| 372 |
+
XML Input (rule 18205)
|
| 373 |
+
↓
|
| 374 |
+
sanitize_rule_from_string()
|
| 375 |
+
- حذف: mitre, if_sid, group, if_group
|
| 376 |
+
↓
|
| 377 |
+
summarize_one_rule() [LLaMA]
|
| 378 |
+
- Input: sanitized XML
|
| 379 |
+
- Output: "Detects the deletion of a security-enabled global group on a Windows system."
|
| 380 |
+
↓
|
| 381 |
+
build_text_for_embedding()
|
| 382 |
+
- text = summary + ". " + description
|
| 383 |
+
- "Detects the deletion of a security-enabled global group on a Windows system. Windows: Security Enabled Global Group Deleted."
|
| 384 |
+
↓
|
| 385 |
+
SecureBERTEmbedder.embed_text()
|
| 386 |
+
- Chunks (256 tokens max)
|
| 387 |
+
- mean pooling per chunk
|
| 388 |
+
- average chunks → 768-dim vector
|
| 389 |
+
- L2 normalize
|
| 390 |
+
↓
|
| 391 |
+
LogisticRegressionModel.predict()
|
| 392 |
+
- predict_proba(X_user)
|
| 393 |
+
- pred = (proba >= logreg_thr)
|
| 394 |
+
- conf = proba * 100
|
| 395 |
+
- gap = proba - logreg_thr
|
| 396 |
+
↓
|
| 397 |
+
save_technique_mappings() [DB]
|
| 398 |
+
- حفظ 20 تقنية مع confidence
|
| 399 |
+
↓
|
| 400 |
+
JSON Response
|
| 401 |
+
```
|
| 402 |
+
|
| 403 |
+
**المخرج للقاعدة 18205:**
|
| 404 |
+
```
|
| 405 |
+
Technique Pred Conf% Proba Thr Gap
|
| 406 |
+
T1484 ✅ 94.76 0.9476 0.74 +0.2076 ← Primary
|
| 407 |
+
T1531 ❌ 27.92 0.2792 ... ...
|
| 408 |
+
T1070.004 ❌ 21.03 0.2103 ... ...
|
| 409 |
+
T1098 ❌ 10.65 0.1065 ... ...
|
| 410 |
+
T1112 ❌ 9.27 0.0927 ... ...
|
| 411 |
+
```
|
| 412 |
+
|
| 413 |
+
---
|
| 414 |
+
الخطوات القادمة للوضع المحلي (LOCAL Mode) غير ضرورية
|
| 415 |
+
|
| 416 |
+
### الخلية 11: تصدير النتائج (اختياري)
|
| 417 |
+
|
| 418 |
+
**الهدف:** تصدير نتائج القواعد المُحلَّلة إلى JSON لاستخدامها لاحقاً على الجهاز المحلي.
|
| 419 |
+
|
| 420 |
+
```python
|
| 421 |
+
export_path = f'{NEEDED_PATH}/murshid_full_results.json'
|
| 422 |
+
json.dump(export_results, f, ensure_ascii=False, indent=2)
|
| 423 |
+
```
|
| 424 |
+
|
| 425 |
+
**الاستخدام:** يُمكِّن استيراد نتائج FULL mode في الباكند المحلي بدون GPU.
|
| 426 |
+
|
| 427 |
+
---
|
| 428 |
+
|
| 429 |
+
### الخلية 12: إيقاف الخادم
|
| 430 |
+
|
| 431 |
+
```python
|
| 432 |
+
cf_proc.terminate() # إغلاق Cloudflare tunnel
|
| 433 |
+
server_proc.terminate() # إيقاف uvicorn
|
| 434 |
+
```
|
| 435 |
+
|
| 436 |
+
---
|
| 437 |
+
|
| 438 |
+
## 4. مقارنة أوضاع التشغيل
|
| 439 |
+
|
| 440 |
+
| | FULL mode (Colab) | LOCAL mode (الجهاز) | LITE mode |
|
| 441 |
+
|--|-------------------|---------------------|-----------|
|
| 442 |
+
| **LLaMA** | ✅ | ❌ | ❌ |
|
| 443 |
+
| **SecureBERT+** | ✅ | ✅ | ❌ |
|
| 444 |
+
| **LogReg** | ✅ | ✅ | ✅ |
|
| 445 |
+
| **GPU** | Tesla T4 | لا يلزم | لا يلزم |
|
| 446 |
+
| **Embedding** | نص مُثرى بـ LLaMA | وصف القاعدة فقط | عشوائي |
|
| 447 |
+
| **T1484 confidence** | **94.76%** | 89.29% | غير موثوق |
|
| 448 |
+
| **القرار النهائي** | T1484 ✅ | T1484 ✅ | غير موثوق |
|
| 449 |
+
| **وقت التحليل/قاعدة** | ~30-60 ثانية | ~2-5 ثوانٍ | < 1 ثانية |
|
| 450 |
+
| **الاستخدام** | إنتاج / عرض | تطوير محلي | اختبار فقط |
|
| 451 |
+
|
| 452 |
+
---
|
| 453 |
+
|
| 454 |
+
## 5. معمارية النظام الكاملة على Colab
|
| 455 |
+
|
| 456 |
+
```
|
| 457 |
+
┌─────────────────────────────────────────────────────┐
|
| 458 |
+
│ Google Colab VM │
|
| 459 |
+
│ │
|
| 460 |
+
│ ┌─────────────────────────────────┐ │
|
| 461 |
+
│ │ /content/murshid_backend/ │ │
|
| 462 |
+
│ │ │ │
|
| 463 |
+
│ │ FastAPI (uvicorn :8000) │ │
|
| 464 |
+
│ │ ├── /health │ │
|
| 465 |
+
│ │ ├── POST /rules/analyze │ │
|
| 466 |
+
│ │ ├── GET /results/{rule_id} │ │
|
| 467 |
+
│ │ ├── GET /queries/{tech_id} │ │
|
| 468 |
+
│ │ └── GET /api/db/... │ │
|
| 469 |
+
│ └───────────────┬─────────────────┘ │
|
| 470 |
+
│ │ │
|
| 471 |
+
│ ┌───────────────┴───────────┐ │
|
| 472 |
+
│ │ ML Models (GPU VRAM) │ │
|
| 473 |
+
│ │ ├── LLaMA 3 8B (4-bit) │ │
|
| 474 |
+
│ │ ├── SecureBERT+ │ │
|
| 475 |
+
│ │ └── LogReg Pipeline │ │
|
| 476 |
+
│ └───────────────────────────┘ │
|
| 477 |
+
│ │ │
|
| 478 |
+
│ ┌───────────────┴───────────┐ │
|
| 479 |
+
│ │ /content/murshid.db │ │
|
| 480 |
+
│ │ (SQLite — 6 جداول) │ │
|
| 481 |
+
│ └───────────────────────────┘ │
|
| 482 |
+
│ │
|
| 483 |
+
│ ┌───────────────────────────┐ │
|
| 484 |
+
│ │ cloudflared tunnel │ │
|
| 485 |
+
│ │ localhost:8000 → HTTPS │ │
|
| 486 |
+
│ └───────────────┬───────────┘ │
|
| 487 |
+
└──────────────────┼──────────────────────────────────┘
|
| 488 |
+
│
|
| 489 |
+
▼
|
| 490 |
+
https://xxxx.trycloudflare.com
|
| 491 |
+
│
|
| 492 |
+
▼
|
| 493 |
+
┌─────────────────────────┐
|
| 494 |
+
│ المتصفح / الفرونت │
|
| 495 |
+
│ index.html (React) │
|
| 496 |
+
└─────────────────────────┘
|
| 497 |
+
```
|
| 498 |
+
|
| 499 |
+
---
|
| 500 |
+
|
| 501 |
+
## 6. الأخطاء الشائعة وحلولها
|
| 502 |
+
|
| 503 |
+
| الخطأ | السبب | الحل |
|
| 504 |
+
|-------|-------|------|
|
| 505 |
+
| `ImportError: bitsandbytes>=0.46.1` | نسخة قديمة | شغّلي `!pip install -U bitsandbytes>=0.46.1` |
|
| 506 |
+
| `FileNotFoundError: murshid_backend` | ZIP غير مرفوع | ارفعي `murshid_backend_for_drive.zip` إلى Drive |
|
| 507 |
+
| `ERR_NGROK_4018` | ngrok يحتاج حساباً | استخدمي Cloudflare Tunnel (خلية 9) |
|
| 508 |
+
| `Cannot connect to backend` | CORS مغلق | `allow_origins=["*"]` في `main.py` |
|
| 509 |
+
| Server يستغرق > 15 دقيقة | تنزيل LLaMA بطيء | في الجلسة الثانية التنزيل من Cache |
|
| 510 |
+
| `InconsistentVersionWarning` | sklearn إصدار مختلف | تأكدي من `scikit-learn==1.6.1` |
|
| 511 |
+
|
| 512 |
+
---
|
| 513 |
+
|
| 514 |
+
## 7. الـ Endpoints المتاحة بعد التشغيل
|
| 515 |
+
|
| 516 |
+
| Method | Endpoint | الوصف |
|
| 517 |
+
|--------|----------|-------|
|
| 518 |
+
| `GET` | `/health` | حالة الخادم والنماذج |
|
| 519 |
+
| `GET` | `/api/stats` | إحصائيات Dashboard |
|
| 520 |
+
| `GET` | `/api/db/summary` | عدد الصفوف في الجداول |
|
| 521 |
+
| `GET` | `/api/db/rules` | جميع القواعد في DB |
|
| 522 |
+
| `GET` | `/api/db/mappings` | جميع المطابقات |
|
| 523 |
+
| `GET` | `/api/db/techniques` | تقنيات MITRE المخزّنة |
|
| 524 |
+
| `GET` | `/api/db/templates` | قوالب WQL |
|
| 525 |
+
| `POST` | `/api/db/import-excel` | استيراد Excel |
|
| 526 |
+
| `POST` | `/rules/analyze` | تحليل قاعدة XML (FULL pipeline) |
|
| 527 |
+
| `GET` | `/results/{rule_id}` | نتائج تقنية قاعدة محددة |
|
| 528 |
+
| `GET` | `/queries/{technique_id}` | استعلامات WQL لتقنية |
|
| 529 |
+
| `POST` | `/admin/templates` | إضافة قالب WQL |
|
| 530 |
+
| `PATCH` | `/admin/templates/{id}` | تعديل قالب |
|
| 531 |
+
| `GET` | `/docs` | Swagger UI التفاعلي |
|
| 532 |
+
|
| 533 |
+
---
|
| 534 |
+
|
| 535 |
+
## 8. ملاحظات للعرض التقديمي
|
| 536 |
+
|
| 537 |
+
1. **شغّلي الخلايا قبل العرض بـ 15 دقيقة** (وقت تحميل LLaMA)
|
| 538 |
+
2. **انسخي رابط Cloudflare** وتحققي منه في المتصفح
|
| 539 |
+
3. **الفرونت يُحدَّث تلقائياً** بالرابط الجديد في خلية 9
|
| 540 |
+
4. **كل جلسة Colab جديدة = رابط Cloudflare جديد** — كرّري الخطوات
|
| 541 |
+
5. **DB فارغة في كل جلسة** — حلّلي القواعد عبر Admin Panel أو خلية اختبار
|
| 542 |
+
|
| 543 |
+
---
|
| 544 |
+
|
| 545 |
+
*تاريخ الإنشاء: 8 أبريل 2026 | مشروع مُرشِد — CCIS, PNU*
|
Needed/murshid_label_columns.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
"T1047",
|
| 3 |
+
"T1055",
|
| 4 |
+
"T1059.001",
|
| 5 |
+
"T1070.004",
|
| 6 |
+
"T1078",
|
| 7 |
+
"T1083",
|
| 8 |
+
"T1095",
|
| 9 |
+
"T1098",
|
| 10 |
+
"T1105",
|
| 11 |
+
"T1110",
|
| 12 |
+
"T1112",
|
| 13 |
+
"T1114",
|
| 14 |
+
"T1176",
|
| 15 |
+
"T1190",
|
| 16 |
+
"T1484",
|
| 17 |
+
"T1498",
|
| 18 |
+
"T1499",
|
| 19 |
+
"T1529",
|
| 20 |
+
"T1531",
|
| 21 |
+
"T1562.001"
|
| 22 |
+
]
|
Needed/murshid_logreg_pipeline_manual_oof_pcatuned.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be629d9f6780456a9435f8be2655e3fa0a848fbe2a4f166813913331b4c43ba4
|
| 3 |
+
size 206584
|
Needed/murshid_logreg_thresholds_manual_oof_pcatuned.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:005a664d7faa22104e4a9e58ace6976628d1d00c1cabcaead1833ff792366c79
|
| 3 |
+
size 208
|
Needed/murshid_query_template_structure_clean_shared.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a1491c4dee86bbf29691b3c4254a344e2cb87eabbb77f04f49da09856cb1d145
|
| 3 |
+
size 20938
|
README.md
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Murshid - مُرشِد
|
| 3 |
+
emoji: 🛡️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# 🛡️ Murshid | مُرشِد
|
| 12 |
+
|
| 13 |
+
**From Alerts to Guidance: MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts**
|
| 14 |
+
|
| 15 |
+
REST API + Dashboard for analyzing Wazuh IDS rules and mapping them to MITRE ATT&CK techniques.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
|
| 19 |
+
- **Rule Analysis**: Parse Wazuh XML rules and classify MITRE ATT&CK techniques
|
| 20 |
+
- **WQL Queries**: Get pre-built Wazuh Query Language templates per technique
|
| 21 |
+
- **Dashboard**: Interactive web UI with statistics and DB viewer
|
| 22 |
+
- **ML Pipeline**: Logistic Regression with SecureBERT+ embeddings
|
| 23 |
+
|
| 24 |
+
## Tech Stack
|
| 25 |
+
|
| 26 |
+
- **FastAPI** — REST API
|
| 27 |
+
- **SQLite** — Database
|
| 28 |
+
- **Logistic Regression** — Primary classification model
|
| 29 |
+
- **SecureBERT+** — Text embeddings (optional, requires torch)
|
| 30 |
+
|
| 31 |
+
## API Endpoints
|
| 32 |
+
|
| 33 |
+
| Method | URL | Description |
|
| 34 |
+
|--------|-----|-------------|
|
| 35 |
+
| `GET` | `/health` | System health check |
|
| 36 |
+
| `POST` | `/rules/analyze` | Analyze a Wazuh XML rule |
|
| 37 |
+
| `GET` | `/results/{rule_id}` | Get stored results for a rule |
|
| 38 |
+
| `GET` | `/queries/{technique_id}` | Get WQL templates for a technique |
|
| 39 |
+
| `GET` | `/docs` | Interactive Swagger documentation |
|
murshid_backend/README.md
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Murshid Backend
|
| 2 |
+
|
| 3 |
+
REST API لمشروع "مرشد — من التنبيهات إلى التوجيه: ربط تقنيات MITRE ATT&CK لمحللي SOC"
|
| 4 |
+
|
| 5 |
+
## التقنيات
|
| 6 |
+
|
| 7 |
+
- **FastAPI** — REST API
|
| 8 |
+
- **MySQL** + **SQLAlchemy** — قاعدة البيانات
|
| 9 |
+
- **Alembic** — هجرة الجداول
|
| 10 |
+
- **Logistic Regression** — النموذج الأساسي في هذه المرحلة
|
| 11 |
+
- **SecureBERT+** — تضمينات نصية
|
| 12 |
+
- **Llama 3 8B** — تلخيص قواعد Wazuh
|
| 13 |
+
|
| 14 |
+
> المنطق مستخرج من `MurshidUIPipeline.ipynb` دون تعديله.
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## هيكل المشروع
|
| 19 |
+
|
| 20 |
+
```
|
| 21 |
+
murshid_backend/
|
| 22 |
+
app/
|
| 23 |
+
main.py ← نقطة تشغيل FastAPI
|
| 24 |
+
config.py
|
| 25 |
+
api/routes/
|
| 26 |
+
health.py ← GET /health
|
| 27 |
+
rules.py ← POST /rules/analyze + GET /results/{rule_id}
|
| 28 |
+
queries.py ← GET /queries/{technique_id} + Admin endpoints
|
| 29 |
+
services/
|
| 30 |
+
ml_service.py
|
| 31 |
+
rule_service.py
|
| 32 |
+
result_service.py
|
| 33 |
+
template_service.py
|
| 34 |
+
ml/
|
| 35 |
+
sanitizer.py ← تنظيف XML
|
| 36 |
+
summarizer.py ← تلخيص Llama
|
| 37 |
+
embedder.py ← SecureBERT+
|
| 38 |
+
logistic_model.py ← Logistic Regression inference
|
| 39 |
+
pipeline.py ← analyze_rule() الشامل
|
| 40 |
+
models/ ← SQLAlchemy ORM (6 جداول من ER Diagram)
|
| 41 |
+
schemas/ ← Pydantic schemas
|
| 42 |
+
repositories/ ← DB access layer
|
| 43 |
+
db/
|
| 44 |
+
base.py
|
| 45 |
+
session.py
|
| 46 |
+
alembic/
|
| 47 |
+
versions/0001_initial_schema.py
|
| 48 |
+
requirements.txt
|
| 49 |
+
.env.example
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
---
|
| 53 |
+
|
| 54 |
+
## جداول قاعدة البيانات (مستخرجة من ER Diagram §3.2.6)
|
| 55 |
+
|
| 56 |
+
| جدول | المصدر في التقرير |
|
| 57 |
+
|------|-------------------|
|
| 58 |
+
| `users` | User entity — username, email, password_hash, role |
|
| 59 |
+
| `mapping_jobs` | MappingJob entity — job_id, file_name, status, progress, timestamp |
|
| 60 |
+
| `rules` | Rule entity — rule_id, embedding_vector, job_id |
|
| 61 |
+
| `techniques` | Technique entity — technique_id, technique_name, tactic |
|
| 62 |
+
| `rule_technique_mappings` | RuleTechniqueMapping — rule_id, technique_id, confidence_score |
|
| 63 |
+
| `query_templates` | QueryTemplate — purpose, wql_query, note, is_active |
|
| 64 |
+
|
| 65 |
+
---
|
| 66 |
+
|
| 67 |
+
## الإعداد والتشغيل
|
| 68 |
+
|
| 69 |
+
### 1) متطلبات
|
| 70 |
+
|
| 71 |
+
- Python 3.10+
|
| 72 |
+
- MySQL 8+
|
| 73 |
+
- GPU موصى به لـ Llama 3 8B
|
| 74 |
+
|
| 75 |
+
### 2) تثبيت
|
| 76 |
+
|
| 77 |
+
```powershell
|
| 78 |
+
cd d:\GP\murshid_backend
|
| 79 |
+
python -m venv .venv
|
| 80 |
+
.\.venv\Scripts\activate
|
| 81 |
+
pip install -r requirements.txt
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
### 3) إعداد قاعدة البيانات
|
| 85 |
+
|
| 86 |
+
إنشاء قاعدة البيانات في MySQL:
|
| 87 |
+
```sql
|
| 88 |
+
CREATE DATABASE murshid_db CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
### 4) إعداد `.env`
|
| 92 |
+
|
| 93 |
+
```powershell
|
| 94 |
+
copy .env.example .env
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
عدّلي القيم:
|
| 98 |
+
```env
|
| 99 |
+
MURSHID_DB_URL=mysql+pymysql://root:YOUR_PASSWORD@localhost:3306/murshid_db
|
| 100 |
+
MURSHID_MODELS_DIR=d:/GP/Needed
|
| 101 |
+
HF_TOKEN=hf_xxxx
|
| 102 |
+
MURSHID_SKIP_LLM=false
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
### 5) تأكد من وجود ملفات النماذج في `d:\GP\Needed`
|
| 106 |
+
|
| 107 |
+
```
|
| 108 |
+
murshid_logreg_pipeline_manual_oof_pcatuned.joblib
|
| 109 |
+
murshid_logreg_thresholds_manual_oof_pcatuned.npy
|
| 110 |
+
murshid_label_columns.json
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
### 6) تشغيل Alembic (هجرة الجداول)
|
| 114 |
+
|
| 115 |
+
```powershell
|
| 116 |
+
alembic upgrade head
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
### 7) تشغيل الـ API
|
| 120 |
+
|
| 121 |
+
```powershell
|
| 122 |
+
uvicorn app.main:app --reload --host 127.0.0.1 --port 8000
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
---
|
| 126 |
+
|
| 127 |
+
## الـ Endpoints
|
| 128 |
+
|
| 129 |
+
| Method | URL | الوصف |
|
| 130 |
+
|--------|-----|--------|
|
| 131 |
+
| `GET` | `/health` | فحص حالة النظام والنماذج |
|
| 132 |
+
| `POST` | `/rules/analyze` | تحليل قاعدة Wazuh XML وحفظ النتائج |
|
| 133 |
+
| `GET` | `/results/{rule_id}` | استرجاع التقنيات المخزنة لمعرف القاعدة |
|
| 134 |
+
| `GET` | `/queries/{technique_id}` | جلب قوالب WQL لتقنية معينة |
|
| 135 |
+
| `POST` | `/admin/templates` | إضافة قالب WQL جديد (Admin) |
|
| 136 |
+
| `PATCH` | `/admin/templates/{id}` | تعديل أو تعطيل قالب (Admin) |
|
| 137 |
+
|
| 138 |
+
### مثال — تحليل قاعدة
|
| 139 |
+
|
| 140 |
+
```bash
|
| 141 |
+
curl -X POST http://127.0.0.1:8000/rules/analyze \
|
| 142 |
+
-H "Content-Type: application/json" \
|
| 143 |
+
-d '{"rule_xml": "<rule id=\"597\" level=\"5\"><description>Registry Key Entry Deleted.</description></rule>"}'
|
| 144 |
+
```
|
| 145 |
+
|
| 146 |
+
### التوثيق التفاعلي
|
| 147 |
+
|
| 148 |
+
افتحي: **http://127.0.0.1:8000/docs**
|
| 149 |
+
|
| 150 |
+
---
|
| 151 |
+
|
| 152 |
+
## ملاحظات
|
| 153 |
+
|
| 154 |
+
- الملف الأصلي `MurshidUIPipeline.ipynb` **لم يُعدَّل** — المنطق منسوخ إلى طبقة `app/ml/`.
|
| 155 |
+
- النموذج المعتمد في هذه المرحلة: **Logistic Regression** فقط.
|
| 156 |
+
- لتشغيل بدون GPU للاختبار فقط: ضعي `MURSHID_SKIP_LLM=true` في `.env` (لكن `/rules/analyze` ستعيد 503).
|
murshid_backend/TECHNICAL_REPORT.md
ADDED
|
@@ -0,0 +1,322 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# تقرير تقني مفصّل — مشروع مُرشِد (Murshid)
|
| 2 |
+
## From Alerts to Guidance: MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts
|
| 3 |
+
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
## 1. نظرة عامة
|
| 7 |
+
|
| 8 |
+
مُرشِد نظام ذكي يحوّل تنبيهات قواعد Wazuh XML إلى تقنيات MITRE ATT&CK مُرتّبة بدرجات ثقة، ويُنتج استعلامات تحقيق WQL جاهزة لمحللي SOC.
|
| 9 |
+
|
| 10 |
+
```
|
| 11 |
+
قاعدة Wazuh XML
|
| 12 |
+
↓
|
| 13 |
+
Sanitization (حذف if_sid, group, mitre)
|
| 14 |
+
↓
|
| 15 |
+
LLaMA 3 8B (تلخيص بجملة واحدة)
|
| 16 |
+
↓
|
| 17 |
+
SecureBERT+ (768-dim embedding)
|
| 18 |
+
↓
|
| 19 |
+
Logistic Regression + PCA (تصنيف)
|
| 20 |
+
↓
|
| 21 |
+
تقنيات MITRE ATT&CK + Confidence Scores
|
| 22 |
+
↓
|
| 23 |
+
قوالب WQL للتحقيق
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
---
|
| 27 |
+
|
| 28 |
+
## 2. هيكل المشروع الكامل
|
| 29 |
+
|
| 30 |
+
```
|
| 31 |
+
d:\GP\
|
| 32 |
+
├── MurshidUIPipeline.ipynb ← الدفتر الأصلي (لا يُعدَّل)
|
| 33 |
+
├── Needed\ ← ملفات النماذج المدرّبة
|
| 34 |
+
│ ├── murshid_logreg_pipeline_manual_oof_pcatuned.joblib
|
| 35 |
+
│ ├── murshid_logreg_thresholds_manual_oof_pcatuned.npy
|
| 36 |
+
│ ├── murshid_svmlinear_per_label_thresholds.joblib
|
| 37 |
+
│ └── murshid_label_columns.json (20 تقنية)
|
| 38 |
+
├── murshid_backend\ ← خدمة FastAPI
|
| 39 |
+
│ ├── app\
|
| 40 |
+
│ │ ├── main.py
|
| 41 |
+
│ │ ├── config.py
|
| 42 |
+
│ │ ├── api\routes\
|
| 43 |
+
│ │ │ ├── health.py GET /health
|
| 44 |
+
│ │ │ ├── rules.py POST /rules/analyze | GET /results/{rule_id}
|
| 45 |
+
│ │ │ ├── queries.py GET /queries/{technique_id} | POST,PATCH /admin/templates
|
| 46 |
+
│ │ │ ├── stats.py GET /api/stats
|
| 47 |
+
│ │ │ └── db_viewer.py GET /api/db/{summary|rules|mappings|...}
|
| 48 |
+
│ │ ├── ml\
|
| 49 |
+
│ │ │ ├── sanitizer.py تنظيف XML
|
| 50 |
+
│ │ │ ├── summarizer.py LLaMA inference
|
| 51 |
+
│ │ │ ├── embedder.py SecureBERT+ embeddings
|
| 52 |
+
│ │ │ ├── logistic_model.py LogReg inference (PRIMARY)
|
| 53 |
+
│ │ │ └── pipeline.py تنسيق المراحل (FULL|LOCAL|LITE)
|
| 54 |
+
│ │ ├── models\ SQLAlchemy ORM
|
| 55 |
+
│ │ │ ├── user.py
|
| 56 |
+
│ │ │ ├── mapping_job.py
|
| 57 |
+
│ │ │ ├── rule.py
|
| 58 |
+
│ │ │ ├── technique.py
|
| 59 |
+
│ │ │ ├── rule_technique_mapping.py
|
| 60 |
+
│ │ │ └── query_template.py
|
| 61 |
+
│ │ ├── schemas\ Pydantic schemas
|
| 62 |
+
│ │ ├── services\ Business logic
|
| 63 |
+
│ │ ├── repositories\ DB access
|
| 64 |
+
│ │ └── db\ SQLAlchemy session
|
| 65 |
+
│ ├── alembic\ Migrations
|
| 66 |
+
│ ├── murshid.db SQLite database
|
| 67 |
+
│ ├── .env
|
| 68 |
+
│ └── requirements.txt
|
| 69 |
+
└── murshid_frontend\ واجهة React
|
| 70 |
+
└── index.html
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
---
|
| 74 |
+
|
| 75 |
+
## 3. طبقة الباكند (FastAPI)
|
| 76 |
+
|
| 77 |
+
### 3.1 الـ Endpoints
|
| 78 |
+
|
| 79 |
+
| Method | URL | الوصف | Actor |
|
| 80 |
+
|--------|-----|--------|-------|
|
| 81 |
+
| `GET` | `/health` | حالة النظام + pipeline mode + ملفات النماذج | All |
|
| 82 |
+
| `GET` | `/api/stats` | إحصائيات Dashboard (KPIs + Technique Frequency) | All |
|
| 83 |
+
| `GET` | `/api/db/summary` | عدد الصفوف في كل جدول | Testing |
|
| 84 |
+
| `GET` | `/api/db/rules` | جميع القواعد المخزّنة | Testing |
|
| 85 |
+
| `GET` | `/api/db/mappings` | جميع مطابقات القواعد-التقنيات | Testing |
|
| 86 |
+
| `GET` | `/api/db/techniques` | جميع تقنيات MITRE المخزّنة | Testing |
|
| 87 |
+
| `GET` | `/api/db/templates` | جميع قوالب WQL | Testing |
|
| 88 |
+
| `POST` | `/rules/analyze` | تحليل قاعدة XML → تخزين النتائج | Admin |
|
| 89 |
+
| `GET` | `/results/{rule_id}` | استرجاع تقنيات قاعدة محددة (Figure 4-11/12) | SOC Analyst |
|
| 90 |
+
| `GET` | `/queries/{technique_id}` | قوالب WQL لتقنية محددة | SOC Analyst |
|
| 91 |
+
| `POST` | `/admin/templates` | إضافة قالب WQL جديد | Admin |
|
| 92 |
+
| `PATCH` | `/admin/templates/{id}` | تعديل/تعطيل قالب | Admin |
|
| 93 |
+
|
| 94 |
+
### 3.2 معمارية الطبقات
|
| 95 |
+
|
| 96 |
+
```
|
| 97 |
+
HTTP Request
|
| 98 |
+
│
|
| 99 |
+
▼
|
| 100 |
+
API Layer (FastAPI routes)
|
| 101 |
+
│ validates input (Pydantic)
|
| 102 |
+
▼
|
| 103 |
+
Service Layer
|
| 104 |
+
│ orchestrates business logic
|
| 105 |
+
▼
|
| 106 |
+
ML Layer Repository Layer
|
| 107 |
+
│ │
|
| 108 |
+
▼ ▼
|
| 109 |
+
Pipeline SQLAlchemy ORM
|
| 110 |
+
(sanitize→embed→classify) │
|
| 111 |
+
│ ▼
|
| 112 |
+
└──────────→ SQLite DB
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
### 3.3 قاعدة البيانات (SQLite + SQLAlchemy)
|
| 116 |
+
|
| 117 |
+
مستخرجة حرفياً من ER Diagram (§3.2.6 من التقرير):
|
| 118 |
+
|
| 119 |
+
| الجدول | الأعمدة الرئيسية | المصدر في التقرير |
|
| 120 |
+
|--------|------------------|-------------------|
|
| 121 |
+
| `users` | user_id, username, email, password_hash, role | User entity |
|
| 122 |
+
| `mapping_jobs` | job_id, user_id, file_name, status, progress, timestamp | MappingJob entity |
|
| 123 |
+
| `rules` | rule_id (PK), job_id, embedding_vector | Rule entity |
|
| 124 |
+
| `techniques` | technique_id (PK), technique_name, tactic | Technique entity |
|
| 125 |
+
| `rule_technique_mappings` | mapping_id, rule_id, technique_id, confidence_score | RuleTechniqueMapping |
|
| 126 |
+
| `query_templates` | template_id, technique_id, purpose, wql_query, note, is_active | QueryTemplate |
|
| 127 |
+
|
| 128 |
+
> Index على `rule_id` في `rule_technique_mappings` (Use Case 6 §3.2.7)
|
| 129 |
+
|
| 130 |
+
---
|
| 131 |
+
|
| 132 |
+
## 4. طبقة ML
|
| 133 |
+
|
| 134 |
+
### 4.1 مراحل الـ Pipeline (من الدفتر)
|
| 135 |
+
|
| 136 |
+
#### المرحلة 1: Sanitization
|
| 137 |
+
```python
|
| 138 |
+
# ml/sanitizer.py — من cell 10 في الدفتر
|
| 139 |
+
REMOVE_TAGS_ANYWHERE = {"mitre", "if_sid", "group", "if_group"}
|
| 140 |
+
# يُحذف: group tags, if_sid, mitre IDs, compliance tags
|
| 141 |
+
# يبقى: description, id, category, decoded_as, info
|
| 142 |
+
```
|
| 143 |
+
|
| 144 |
+
#### المرحلة 2: LLM Summarization (LLaMA 3 8B)
|
| 145 |
+
```python
|
| 146 |
+
# ml/summarizer.py — من cell 11 في الدفتر
|
| 147 |
+
# Input: sanitized XML
|
| 148 |
+
# Prompt: "Write EXACTLY ONE sentence describing the observable event pattern"
|
| 149 |
+
# Output: JSON {"summary": "Detects ..."}
|
| 150 |
+
# Constraints: 7-18 words, يبدأ بـ Detects/Monitors/...
|
| 151 |
+
```
|
| 152 |
+
|
| 153 |
+
#### المرحلة 3: Paragraph Construction
|
| 154 |
+
```python
|
| 155 |
+
# ml/embedder.py — من cell 12 في الدفتر
|
| 156 |
+
text = f"{summary}. {description}."
|
| 157 |
+
# مثال: "Detects deletion of global group. Windows: Security Enabled Global Group Deleted."
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
#### المرحلة 4: SecureBERT+ Embedding
|
| 161 |
+
```python
|
| 162 |
+
# ml/embedder.py — من cell 15 في الدفتر
|
| 163 |
+
# Model: ehsanaghaei/SecureBERT_Plus
|
| 164 |
+
# MAX_LEN: 512 tokens, chunks
|
| 165 |
+
# Pooling: Mean pooling across tokens → 768-dim vector
|
| 166 |
+
# Normalization: L2
|
| 167 |
+
```
|
| 168 |
+
|
| 169 |
+
#### المرحلة 5: Logistic Regression Inference
|
| 170 |
+
```python
|
| 171 |
+
# ml/logistic_model.py — من cell 18-19 في الدفتر
|
| 172 |
+
proba = logreg_model.predict_proba(X_user)
|
| 173 |
+
proba = proba.reshape(-1)
|
| 174 |
+
pred = (proba >= logreg_thr).astype(int)
|
| 175 |
+
conf = proba * 100
|
| 176 |
+
gap = proba - logreg_thr
|
| 177 |
+
# تُرجع جميع الـ 20 تقنية مرتّبة تنازلياً
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
### 4.2 أوضاع التشغيل
|
| 181 |
+
|
| 182 |
+
| الوضع | الشرط | الدقة | الاستخدام |
|
| 183 |
+
|-------|--------|-------|-----------|
|
| 184 |
+
| **FULL** | LLaMA + SecureBERT + LogReg | 100% (مطابق للدفتر) | Colab/GPU |
|
| 185 |
+
| **LOCAL** | SecureBERT + LogReg (بدون LLaMA) | ~95% (وصف بدون ملخص) | الجهاز المحلي |
|
| 186 |
+
| **LITE** | LogReg فقط (بدون torch) | منخفضة (عشوائي) | اختبار البنية فقط |
|
| 187 |
+
|
| 188 |
+
---
|
| 189 |
+
|
| 190 |
+
## 5. طبقة الفرونت (React + Tailwind + Chart.js)
|
| 191 |
+
|
| 192 |
+
### 5.1 الصفحات (CDN-based React, بدون Build Step)
|
| 193 |
+
|
| 194 |
+
| الصفحة | ID | المستخدم | الوصف |
|
| 195 |
+
|--------|-----|----------|--------|
|
| 196 |
+
| Login | — | All | تسجيل دخول + اختيار دور |
|
| 197 |
+
| Dashboard | `dashboard` | All | KPIs + MITRE Technique Frequency Chart |
|
| 198 |
+
| Rule Lookup | `rules` | SOC Analyst | بحث بـ Rule ID → Figure 4-11 + Figure 4-12 |
|
| 199 |
+
| نتائج DB | `dbviewer` | All | استعراض قاعدة البيانات للاختبار |
|
| 200 |
+
| Rule Mapping | `admin` | Admin | رفع XML + تحليل + جدول التقدم |
|
| 201 |
+
| WQL Templates | `templates` | Admin | إدارة قوالب الاستعلامات |
|
| 202 |
+
| Settings | `settings` | All | ملف شخصي + Dark Mode + ألوان |
|
| 203 |
+
|
| 204 |
+
### 5.2 الـ Figures كما في التقرير
|
| 205 |
+
|
| 206 |
+
| Figure | الصفحة | المكوّن |
|
| 207 |
+
|--------|--------|---------|
|
| 208 |
+
| Figure 4-10 | Rule Lookup | Search bar + Rule ID input |
|
| 209 |
+
| Figure 4-11 | Rule Lookup | `TechniqueDistributionChart` — Horizontal bar chart (Top 5, مُلوَّن H/M/L) |
|
| 210 |
+
| Figure 4-12 | Rule Lookup | Investigation Queries table (Primary + Secondary ≥50%) |
|
| 211 |
+
| Figure 4-13 | Admin | Rule Mapping Panel (paste XML + Submit) |
|
| 212 |
+
| Figure 4-14 | Admin | Mapping Progress Table (Job ID, Status, Progress) |
|
| 213 |
+
| Figure 4-9 | Dashboard | KPIs + Technique Frequency Bar Chart |
|
| 214 |
+
|
| 215 |
+
### 5.3 ربط الفرونت بالباكند
|
| 216 |
+
|
| 217 |
+
```javascript
|
| 218 |
+
const BASE = 'http://127.0.0.1:8000';
|
| 219 |
+
// CORS مُفعَّل في الباكند لـ http://localhost:5173 و http://127.0.0.1:5173
|
| 220 |
+
// الفرونت يُخدَّم مباشرةً من FastAPI عبر StaticFiles
|
| 221 |
+
```
|
| 222 |
+
|
| 223 |
+
---
|
| 224 |
+
|
| 225 |
+
## 6. مخطط تدفق البيانات الكامل
|
| 226 |
+
|
| 227 |
+
```
|
| 228 |
+
┌─────────────────────────────────────────┐
|
| 229 |
+
│ SOC Analyst / Admin │
|
| 230 |
+
│ (murshid_frontend/index.html) │
|
| 231 |
+
└────────────────┬────────────────────────┘
|
| 232 |
+
│ HTTP/JSON
|
| 233 |
+
▼
|
| 234 |
+
┌─────────────────────────────────────────┐
|
| 235 |
+
│ FastAPI (port 8000) │
|
| 236 |
+
│ │
|
| 237 |
+
│ /health → pipeline status │
|
| 238 |
+
│ POST /rules/analyze: │
|
| 239 |
+
│ 1. sanitizer.py → clean XML │
|
| 240 |
+
│ 2. summarizer.py → LLaMA summary │ ← FULL mode only
|
| 241 |
+
│ 3. embedder.py → 768-dim vector │
|
| 242 |
+
│ 4. logistic_model → proba + scores │
|
| 243 |
+
│ 5. rule_repo → save to DB │
|
| 244 |
+
│ │
|
| 245 |
+
│ GET /results/{id} → from DB │
|
| 246 |
+
│ GET /queries/{id} → WQL templates │
|
| 247 |
+
└────────────────┬────────────────────────┘
|
| 248 |
+
│ SQLAlchemy
|
| 249 |
+
▼
|
| 250 |
+
┌─────────────────────────────────────────┐
|
| 251 |
+
│ SQLite (murshid.db) │
|
| 252 |
+
│ rules | techniques | mappings │
|
| 253 |
+
│ query_templates | mapping_jobs │
|
| 254 |
+
└─────────────────────────────────────────┘
|
| 255 |
+
```
|
| 256 |
+
|
| 257 |
+
---
|
| 258 |
+
|
| 259 |
+
## 7. التشغيل
|
| 260 |
+
|
| 261 |
+
### المتطلبات
|
| 262 |
+
- Python 3.12 (عبر uv)
|
| 263 |
+
- ملفات النماذج في `d:\GP\Needed\`
|
| 264 |
+
- اتصال إنترنت (لـ SecureBERT+ من HuggingFace أول مرة)
|
| 265 |
+
|
| 266 |
+
### تشغيل الخادم
|
| 267 |
+
```powershell
|
| 268 |
+
cd d:\GP\murshid_backend
|
| 269 |
+
.venv\Scripts\python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000
|
| 270 |
+
```
|
| 271 |
+
|
| 272 |
+
### الروابط
|
| 273 |
+
| الرابط | الوصف |
|
| 274 |
+
|--------|--------|
|
| 275 |
+
| http://127.0.0.1:8000/index.html | الواجهة الرئيسية |
|
| 276 |
+
| http://127.0.0.1:8000/docs | Swagger API Documentation |
|
| 277 |
+
| http://127.0.0.1:8000/health | فحص حالة النظام |
|
| 278 |
+
| http://127.0.0.1:8000/api/db/summary | ملخص قاعدة البيانات |
|
| 279 |
+
|
| 280 |
+
### اختبار سريع
|
| 281 |
+
```powershell
|
| 282 |
+
# 1. تحليل قاعدة
|
| 283 |
+
$body = '{"rule_xml":"<rule id=\"597\"><description>Registry Key Entry Deleted.</description></rule>"}'
|
| 284 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/rules/analyze" -Method POST -ContentType "application/json" -Body $body
|
| 285 |
+
|
| 286 |
+
# 2. استرجاع النتائج
|
| 287 |
+
Invoke-RestMethod "http://127.0.0.1:8000/results/597"
|
| 288 |
+
|
| 289 |
+
# 3. إضافة قالب WQL
|
| 290 |
+
$t = '{"technique_id":"T1112","purpose":"Detect registry modification","wql_query":"agent.name:${HOST} AND rule.description:\"registry\"","note":"Replace ${HOST}"}'
|
| 291 |
+
Invoke-RestMethod -Uri "http://127.0.0.1:8000/admin/templates" -Method POST -ContentType "application/json" -Body $t
|
| 292 |
+
|
| 293 |
+
# 4. جلب الاستعلامات
|
| 294 |
+
Invoke-RestMethod "http://127.0.0.1:8000/queries/T1112"
|
| 295 |
+
```
|
| 296 |
+
|
| 297 |
+
---
|
| 298 |
+
|
| 299 |
+
## 8. الفرق بين FULL mode (Colab) و LOCAL mode (الجهاز)
|
| 300 |
+
|
| 301 |
+
| | Colab (FULL) | الجهاز المحلي (LOCAL) |
|
| 302 |
+
|--|-------------|----------------------|
|
| 303 |
+
| Input text | `"Detects deletion of a security-enabled global group. Windows: Security Enabled Global Group Deleted."` | `"Windows: Security Enabled Global Group Deleted"` |
|
| 304 |
+
| T1484 proba | **0.9476 (94.76%)** | **0.8929 (89.29%)** |
|
| 305 |
+
| سبب الفرق | LLaMA يُثري النص بسياق دلالي | الوصف فقط بدون إثراء |
|
| 306 |
+
| القرار الصحيح | T1484 ✅ | T1484 ✅ |
|
| 307 |
+
|
| 308 |
+
**الاستنتاج:** القرار النهائي صحيح في كلا الوضعين — الاختلاف في درجة الثقة فقط.
|
| 309 |
+
|
| 310 |
+
---
|
| 311 |
+
|
| 312 |
+
## 9. حالات الاستخدام المُنفَّذة (من التقرير)
|
| 313 |
+
|
| 314 |
+
| Use Case | الوصف | مُنفَّذ |
|
| 315 |
+
|----------|--------|---------|
|
| 316 |
+
| UC1 | View techniques and scores for a rule | ✅ `GET /results/{rule_id}` |
|
| 317 |
+
| UC2 | View WQL investigation queries | ✅ `GET /queries/{technique_id}` |
|
| 318 |
+
| UC3 | Copy and fill investigation query | ✅ زر Copy في الفرونت |
|
| 319 |
+
| UC4 | Upload Wazuh rule(s) | ✅ Admin Panel |
|
| 320 |
+
| UC5 | Process rule via ML pipeline | ✅ `POST /rules/analyze` |
|
| 321 |
+
| UC6 | Store mapped techniques in DB | ✅ تلقائي بعد analyze |
|
| 322 |
+
| UC7 | Manage WQL templates repository | ✅ `POST/PATCH /admin/templates` |
|
murshid_backend/alembic.ini
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[alembic]
|
| 2 |
+
script_location = alembic
|
| 3 |
+
prepend_sys_path = .
|
| 4 |
+
sqlalchemy.url = sqlite:///murshid.db
|
| 5 |
+
|
| 6 |
+
[loggers]
|
| 7 |
+
keys = root,sqlalchemy,alembic
|
| 8 |
+
|
| 9 |
+
[handlers]
|
| 10 |
+
keys = console
|
| 11 |
+
|
| 12 |
+
[formatters]
|
| 13 |
+
keys = generic
|
| 14 |
+
|
| 15 |
+
[logger_root]
|
| 16 |
+
level = WARN
|
| 17 |
+
handlers = console
|
| 18 |
+
qualname =
|
| 19 |
+
|
| 20 |
+
[logger_sqlalchemy]
|
| 21 |
+
level = WARN
|
| 22 |
+
handlers =
|
| 23 |
+
qualname = sqlalchemy.engine
|
| 24 |
+
|
| 25 |
+
[logger_alembic]
|
| 26 |
+
level = INFO
|
| 27 |
+
handlers =
|
| 28 |
+
qualname = alembic
|
| 29 |
+
|
| 30 |
+
[handler_console]
|
| 31 |
+
class = StreamHandler
|
| 32 |
+
args = (sys.stderr,)
|
| 33 |
+
level = NOTSET
|
| 34 |
+
formatter = generic
|
| 35 |
+
|
| 36 |
+
[formatter_generic]
|
| 37 |
+
format = %(levelname)-5.5s [%(name)s] %(message)s
|
| 38 |
+
datefmt = %H:%M:%S
|
murshid_backend/alembic/env.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
from logging.config import fileConfig
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from sqlalchemy import engine_from_config, pool
|
| 6 |
+
|
| 7 |
+
from alembic import context
|
| 8 |
+
|
| 9 |
+
# make app importable
|
| 10 |
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
| 11 |
+
|
| 12 |
+
from app.config import settings
|
| 13 |
+
from app.db.base import Base
|
| 14 |
+
import app.models # noqa: F401 — registers all models with Base.metadata
|
| 15 |
+
|
| 16 |
+
config = context.config
|
| 17 |
+
config.set_main_option("sqlalchemy.url", settings.murshid_db_url)
|
| 18 |
+
|
| 19 |
+
if config.config_file_name is not None:
|
| 20 |
+
fileConfig(config.config_file_name)
|
| 21 |
+
|
| 22 |
+
target_metadata = Base.metadata
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode: render SQL to the script output
    using only the configured URL, without opening a DB connection."""
    context.configure(
        url=config.get_main_option("sqlalchemy.url"),
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )
    with context.begin_transaction():
        context.run_migrations()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def run_migrations_online() -> None:
    """Run migrations in 'online' mode: open a real database connection
    and apply revisions through it."""
    ini_section = config.get_section(config.config_ini_section, {})
    connectable = engine_from_config(
        ini_section,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )
    with connectable.connect() as connection:
        context.configure(connection=connection, target_metadata=target_metadata)
        with context.begin_transaction():
            context.run_migrations()
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
if context.is_offline_mode():
|
| 50 |
+
run_migrations_offline()
|
| 51 |
+
else:
|
| 52 |
+
run_migrations_online()
|
murshid_backend/alembic/script.py.mako
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""${message}
|
| 2 |
+
|
| 3 |
+
Revision ID: ${up_revision}
|
| 4 |
+
Revises: ${down_revision | comma,n}
|
| 5 |
+
Create Date: ${create_date}
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
from typing import Sequence, Union
|
| 9 |
+
|
| 10 |
+
from alembic import op
|
| 11 |
+
import sqlalchemy as sa
|
| 12 |
+
${imports if imports else ""}
|
| 13 |
+
|
| 14 |
+
revision: str = ${repr(up_revision)}
|
| 15 |
+
down_revision: Union[str, None] = ${repr(down_revision)}
|
| 16 |
+
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
|
| 17 |
+
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def upgrade() -> None:
|
| 21 |
+
${upgrades if upgrades else "pass"}
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def downgrade() -> None:
|
| 25 |
+
${downgrades if downgrades else "pass"}
|
murshid_backend/alembic/versions/0001_initial_schema.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""initial schema — all 6 tables from ER Diagram §3.2.6
|
| 2 |
+
|
| 3 |
+
Revision ID: 0001
|
| 4 |
+
Revises:
|
| 5 |
+
Create Date: 2026-04-08
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import Sequence, Union
|
| 9 |
+
|
| 10 |
+
import sqlalchemy as sa
|
| 11 |
+
from alembic import op
|
| 12 |
+
|
| 13 |
+
revision: str = "0001"
|
| 14 |
+
down_revision: Union[str, None] = None
|
| 15 |
+
branch_labels: Union[str, Sequence[str], None] = None
|
| 16 |
+
depends_on: Union[str, Sequence[str], None] = None
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def upgrade() -> None:
    """Create all six tables from the ER diagram (§3.2.6) plus the lookup index.

    Tables are created parents-first so every foreign key target exists
    before it is referenced.
    """
    # Application users; `role` gates admin vs. analyst endpoints.
    op.create_table(
        "users",
        sa.Column("user_id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("username", sa.String(100), unique=True, nullable=False),
        sa.Column("email", sa.String(255), unique=True, nullable=False),
        sa.Column("password_hash", sa.String(255), nullable=False),
        sa.Column("role", sa.String(20), nullable=False, server_default="analyst"),
    )

    # One row per uploaded rule file; tracks async analysis progress.
    op.create_table(
        "mapping_jobs",
        sa.Column("job_id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("user_id", sa.Integer(), sa.ForeignKey("users.user_id"), nullable=False),
        sa.Column("file_name", sa.String(255), nullable=False),
        sa.Column("rules_count", sa.Integer(), server_default="0"),
        sa.Column("status", sa.String(20), nullable=False, server_default="pending"),
        sa.Column("progress", sa.Integer(), server_default="0"),
        sa.Column("timestamp", sa.DateTime(), server_default=sa.func.now()),
    )

    # Wazuh rules; PK is the rule's own string ID, not an autoincrement.
    # embedding_vector is stored serialized as text (nullable until computed).
    op.create_table(
        "rules",
        sa.Column("rule_id", sa.String(50), primary_key=True),
        sa.Column("job_id", sa.Integer(), sa.ForeignKey("mapping_jobs.job_id"), nullable=True),
        sa.Column("embedding_vector", sa.Text(), nullable=True),
    )

    # MITRE ATT&CK techniques; PK is the technique code (e.g. "T1112").
    op.create_table(
        "techniques",
        sa.Column("technique_id", sa.String(20), primary_key=True),
        sa.Column("technique_name", sa.String(255), nullable=False),
        sa.Column("tactic", sa.String(100), nullable=True),
    )

    # Many-to-many link rules ↔ techniques with a model confidence score.
    op.create_table(
        "rule_technique_mappings",
        sa.Column("mapping_id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("rule_id", sa.String(50), sa.ForeignKey("rules.rule_id"), nullable=False),
        sa.Column(
            "technique_id", sa.String(20), sa.ForeignKey("techniques.technique_id"), nullable=False
        ),
        sa.Column("confidence_score", sa.Float(), nullable=False),
    )
    # Index on rule_id — speeds up the per-rule results lookup (Use Case 6 §3.2.7)
    op.create_index("ix_rule_technique_rule_id", "rule_technique_mappings", ["rule_id"])

    # Curated WQL investigation queries per technique; is_active soft-disables.
    op.create_table(
        "query_templates",
        sa.Column("template_id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column(
            "technique_id", sa.String(20), sa.ForeignKey("techniques.technique_id"), nullable=False
        ),
        sa.Column("purpose", sa.String(255), nullable=True),
        sa.Column("wql_query", sa.Text(), nullable=False),
        sa.Column("note", sa.Text(), nullable=True),
        sa.Column("is_active", sa.Boolean(), nullable=False, server_default="1"),
    )
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def downgrade() -> None:
    """Drop all tables in reverse dependency order (children before parents),
    mirroring upgrade() exactly.

    Fix: removed the dead trailing `pass` statement — it followed real
    statements and had no effect.
    """
    op.drop_table("query_templates")
    # Index must go before its table on backends that don't drop it implicitly.
    op.drop_index("ix_rule_technique_rule_id", table_name="rule_technique_mappings")
    op.drop_table("rule_technique_mappings")
    op.drop_table("techniques")
    op.drop_table("rules")
    op.drop_table("mapping_jobs")
    op.drop_table("users")
|
murshid_backend/app/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Murshid backend package."""
|
murshid_backend/app/api/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""API layer — FastAPI routers."""
|
murshid_backend/app/api/routes/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Route modules."""
|
murshid_backend/app/api/routes/db_viewer.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GET /api/db/rules — all rules in DB
|
| 3 |
+
GET /api/db/mappings — all rule-technique mappings
|
| 4 |
+
GET /api/db/techniques — all techniques
|
| 5 |
+
GET /api/db/templates — all query templates
|
| 6 |
+
GET /api/db/summary — counts per table
|
| 7 |
+
POST /api/db/import-excel — import WQL templates from Excel file
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from fastapi import APIRouter, Depends, HTTPException, Query
|
| 11 |
+
from sqlalchemy import func
|
| 12 |
+
from sqlalchemy.orm import Session
|
| 13 |
+
|
| 14 |
+
from app.db.session import get_db
|
| 15 |
+
from app.models.mapping_job import MappingJob
|
| 16 |
+
from app.models.query_template import QueryTemplate
|
| 17 |
+
from app.models.rule import Rule
|
| 18 |
+
from app.models.rule_technique_mapping import RuleTechniqueMapping
|
| 19 |
+
from app.models.technique import Technique
|
| 20 |
+
|
| 21 |
+
router = APIRouter(prefix="/api/db", tags=["db-viewer"])
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@router.get("/summary")
def db_summary(db: Session = Depends(get_db)):
    """Return the row count of every table exposed by the DB viewer."""
    count_exprs = {
        "rules": func.count(Rule.rule_id),
        "techniques": func.count(Technique.technique_id),
        "rule_mappings": func.count(RuleTechniqueMapping.mapping_id),
        "query_templates": func.count(QueryTemplate.template_id),
        "mapping_jobs": func.count(MappingJob.job_id),
    }
    return {name: db.query(expr).scalar() for name, expr in count_exprs.items()}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
@router.get("/rules")
def all_rules(db: Session = Depends(get_db)):
    """List every stored rule: its id, owning job, and whether an embedding exists."""

    def serialize(rule):
        # Embedding is exposed only as a presence flag, never the raw vector.
        return {
            "rule_id": rule.rule_id,
            "job_id": rule.job_id,
            "has_embedding": rule.embedding_vector is not None,
        }

    return [serialize(rule) for rule in db.query(Rule).order_by(Rule.rule_id).all()]
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@router.get("/mappings")
def all_mappings(db: Session = Depends(get_db)):
    """List every rule→technique mapping, grouped by rule with strongest matches first."""
    ordering = (
        RuleTechniqueMapping.rule_id,
        RuleTechniqueMapping.confidence_score.desc(),
    )
    payload = []
    for mapping in db.query(RuleTechniqueMapping).order_by(*ordering).all():
        payload.append(
            {
                "mapping_id": mapping.mapping_id,
                "rule_id": mapping.rule_id,
                "technique_id": mapping.technique_id,
                # Score is exposed both as the raw 0–1 value and as a percentage.
                "confidence_score": round(mapping.confidence_score, 4),
                "confidence_pct": round(mapping.confidence_score * 100, 2),
            }
        )
    return payload
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
@router.get("/techniques")
def all_techniques(db: Session = Depends(get_db)):
    """List every MITRE technique stored in the database, ordered by id."""
    return [
        {
            "technique_id": tech.technique_id,
            "technique_name": tech.technique_name,
            "tactic": tech.tactic,
        }
        for tech in db.query(Technique).order_by(Technique.technique_id).all()
    ]
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
@router.get("/templates")
def all_templates(db: Session = Depends(get_db)):
    """List every WQL query template, ordered by technique then template id."""
    ordered = db.query(QueryTemplate).order_by(
        QueryTemplate.technique_id, QueryTemplate.template_id
    )
    out = []
    for tpl in ordered.all():
        out.append(
            {
                "template_id": tpl.template_id,
                "technique_id": tpl.technique_id,
                "purpose": tpl.purpose,
                "wql_query": tpl.wql_query,
                "note": tpl.note,
                "is_active": tpl.is_active,
            }
        )
    return out
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
@router.post("/import-excel")
def import_excel_templates(
    replace: bool = Query(False, description="Update existing templates if True"),
    db: Session = Depends(get_db),
):
    """
    Import WQL query templates from the Excel file:
        murshid_query_template_structure_clean_shared.xlsx

    The file is read from MURSHID_MODELS_DIR or the GP root folder.
    Pass ?replace=true to overwrite existing templates.

    Raises:
        HTTPException 404: source file missing, or the importer reported an error.
        HTTPException 500: any other importer failure.

    Fix: both handlers now chain the cause (`raise ... from e`) so the
    original traceback is preserved in logs instead of being discarded.
    """
    try:
        # Imported lazily so the Excel-reading dependency is only required
        # when this endpoint is actually invoked.
        from scripts.import_excel_templates import run

        result = run(db, replace=replace)
    except FileNotFoundError as e:
        raise HTTPException(status_code=404, detail=str(e)) from e
    except Exception as e:  # boundary handler: surface importer failures as HTTP 500
        raise HTTPException(status_code=500, detail=str(e)) from e

    # The importer can also signal a soft failure via an "error" key.
    if "error" in result:
        raise HTTPException(status_code=404, detail=result["error"])

    return result
|
murshid_backend/app/api/routes/health.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""GET /health — system readiness check with clear pipeline mode info."""
|
| 2 |
+
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from fastapi import APIRouter
|
| 6 |
+
|
| 7 |
+
from app.config import settings
|
| 8 |
+
from app.ml.pipeline import _store, is_ready
|
| 9 |
+
|
| 10 |
+
router = APIRouter(tags=["health"])
|
| 11 |
+
|
| 12 |
+
try:
|
| 13 |
+
import torch
|
| 14 |
+
_CUDA = torch.cuda.is_available()
|
| 15 |
+
_TORCH = True
|
| 16 |
+
_TORCH_ERR = None
|
| 17 |
+
except (ImportError, OSError) as _e:
|
| 18 |
+
_CUDA = False
|
| 19 |
+
_TORCH = False
|
| 20 |
+
_TORCH_ERR = str(_e)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _check_model_files() -> dict:
    """Report, per model artifact, whether the file exists under the models dir."""
    base = Path(settings.murshid_models_dir).resolve()
    artifact_names = {
        "logreg_joblib": settings.logreg_joblib,
        "logreg_thresholds": settings.logreg_thresholds_npy,
        "label_columns": settings.label_columns_json,
    }
    return {key: (base / rel).is_file() for key, rel in artifact_names.items()}
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@router.get("/health")
def health():
    """System readiness: derived pipeline mode, loaded components, model files."""
    model_files = _check_model_files()

    # Mode is inferred from which components actually loaded, checked from
    # the most capable configuration down to none.
    if _store.llama_model is not None:
        mode, mode_desc = "full", "LLaMA + SecureBERT+ + LogReg"
    elif _store.embedder is not None and _store.logreg is not None:
        mode, mode_desc = "local", "SecureBERT+ + LogReg (no LLaMA — using description as text)"
    elif _store.logreg is not None:
        mode, mode_desc = "lite", "LogReg only (no embedder — random vectors, testing only)"
    else:
        mode, mode_desc = "not_ready", "No ML models loaded"

    # Analysis only needs the classifier; embedder/LLM just improve quality.
    analyze_ok = _store.logreg is not None

    return {
        "status": "ok",
        "pipeline_ready": is_ready(),
        "pipeline_mode": mode,
        "pipeline_description": mode_desc,
        "analyze_available": analyze_ok,
        "components": {
            "llama_loaded": _store.llama_model is not None,
            "embedder_loaded": _store.embedder is not None,
            "logreg_loaded": analyze_ok,
            "torch_installed": _TORCH,
            "cuda_available": _CUDA,
            "torch_error": _TORCH_ERR,
        },
        "model_files": model_files,
        "all_model_files_present": all(model_files.values()),
        "models_dir": str(settings.murshid_models_dir.resolve()),
        "skip_llm_env": settings.murshid_skip_llm,
        "next_step": (
            "POST /rules/analyze is ready!"
            if analyze_ok
            else "Copy .joblib and .npy files to MURSHID_MODELS_DIR and restart."
        ),
    }
|
murshid_backend/app/api/routes/queries.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GET /queries/{technique_id} — SOC Analyst: fetch WQL templates.
|
| 3 |
+
POST /admin/templates — Admin: add new template.
|
| 4 |
+
PATCH /admin/templates/{template_id} — Admin: update / disable template.
|
| 5 |
+
|
| 6 |
+
Based on:
|
| 7 |
+
Use Case 2 (View Investigation WQL Queries) — §3.2.7
|
| 8 |
+
Use Case 7 (Manage static query templates) — §3.2.7
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 12 |
+
from sqlalchemy.orm import Session
|
| 13 |
+
|
| 14 |
+
from app.db.session import get_db
|
| 15 |
+
from app.schemas.query import QueryTemplateIn, QueryTemplateOut, QueryTemplateUpdate
|
| 16 |
+
from app.services.template_service import TemplateService
|
| 17 |
+
|
| 18 |
+
router = APIRouter(tags=["queries"])
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _get_template_service(db: Session = Depends(get_db)) -> TemplateService:
    """FastAPI dependency: build a TemplateService bound to the request's DB session."""
    return TemplateService(db=db)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ---------------------------------------------------------------------------
|
| 26 |
+
# GET /queries/{technique_id}
|
| 27 |
+
# ---------------------------------------------------------------------------
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@router.get("/queries/{technique_id}", response_model=list[QueryTemplateOut])
def get_queries(
    technique_id: str,
    svc: TemplateService = Depends(_get_template_service),
):
    """
    Returns all active WQL templates for the given MITRE technique.
    Use Case 2 — §3.2.7
    """
    found = svc.get_queries_for_technique(technique_id)
    if found:
        return [QueryTemplateOut(**item) for item in found]
    # No active templates for this technique → explicit 404 for the client.
    raise HTTPException(
        status_code=404,
        detail=f"No active query templates found for technique '{technique_id}'.",
    )
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# ---------------------------------------------------------------------------
|
| 49 |
+
# Admin endpoints
|
| 50 |
+
# ---------------------------------------------------------------------------
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@router.post("/admin/templates", response_model=QueryTemplateOut, status_code=201)
def add_template(
    body: QueryTemplateIn,
    svc: TemplateService = Depends(_get_template_service),
):
    """Admin: add a new WQL template. Use Case 7 — §3.2.7"""
    # Forward the validated payload fields to the service layer verbatim.
    created = svc.add_template(
        technique_id=body.technique_id,
        purpose=body.purpose,
        wql_query=body.wql_query,
        note=body.note,
    )
    return QueryTemplateOut(**created)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
@router.patch("/admin/templates/{template_id}", response_model=QueryTemplateOut)
def update_template(
    template_id: int,
    body: QueryTemplateUpdate,
    svc: TemplateService = Depends(_get_template_service),
):
    """Admin: update or disable a WQL template. Use Case 7 — §3.2.7"""
    # Only fields the caller actually set are forwarded (partial update).
    changes = body.model_dump(exclude_none=True)
    updated = svc.update_template(template_id, changes)
    if updated is None:
        raise HTTPException(status_code=404, detail=f"Template {template_id} not found.")
    return QueryTemplateOut(**updated)
|
murshid_backend/app/api/routes/rules.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
POST /rules/analyze — Admin: analyze a rule, persist results.
|
| 3 |
+
GET /results/{rule_id} — SOC Analyst: retrieve stored mappings.
|
| 4 |
+
|
| 5 |
+
Based on:
|
| 6 |
+
Use Case 4+5+6 (Upload, Process, Store) — §3.2.7
|
| 7 |
+
Use Case 1 (View techniques and scores) — §3.2.7
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from fastapi import APIRouter, Depends, HTTPException
|
| 11 |
+
from sqlalchemy.orm import Session
|
| 12 |
+
|
| 13 |
+
from app.db.session import get_db
|
| 14 |
+
from app.ml.pipeline import is_ready
|
| 15 |
+
from app.schemas.result import MappingResult, ResultsResponse
|
| 16 |
+
from app.schemas.rule import AnalyzeRequest, AnalyzeResponse, TechniqueResult
|
| 17 |
+
from app.services.ml_service import MLService
|
| 18 |
+
from app.services.result_service import ResultService
|
| 19 |
+
from app.services.rule_service import RuleService
|
| 20 |
+
|
| 21 |
+
router = APIRouter(tags=["rules"])
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _get_rule_service(db: Session = Depends(get_db)) -> RuleService:
    """FastAPI dependency: RuleService wired with a fresh MLService per request."""
    service = RuleService(db=db, ml=MLService())
    return service
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def _get_result_service(db: Session = Depends(get_db)) -> ResultService:
    """FastAPI dependency: ResultService bound to the request-scoped session."""
    service = ResultService(db=db)
    return service
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ---------------------------------------------------------------------------
|
| 33 |
+
# POST /rules/analyze
|
| 34 |
+
# ---------------------------------------------------------------------------
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@router.post("/rules/analyze", response_model=AnalyzeResponse, status_code=201)
def analyze_rule(
    body: AnalyzeRequest,
    svc: RuleService = Depends(_get_rule_service),
):
    """
    Runs the full ML pipeline on the submitted Wazuh rule XML and stores
    the results in the database.
    """
    if not is_ready():
        raise HTTPException(status_code=503, detail="ML pipeline not ready.")

    try:
        outcome = svc.analyze_and_persist(body.rule_xml)
    except ValueError as exc:
        # Malformed / incomplete rule XML supplied by the client.
        raise HTTPException(status_code=422, detail=str(exc)) from exc
    except RuntimeError as exc:
        # Models unavailable (not loaded, missing artifact files).
        raise HTTPException(status_code=503, detail=str(exc)) from exc
    except Exception as exc:
        # NOTE(review): echoing str(exc) in a 500 response can leak internal
        # details to clients — consider a generic message plus server-side log.
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    ranked = [TechniqueResult(**item) for item in outcome["results"]]
    hits = [t for t in ranked if t.predicted]

    return AnalyzeResponse(
        rule_id=outcome["rule_id"],
        sanitized_xml=outcome["sanitized_xml"],
        summary=outcome["summary"],
        text_for_embedding=outcome["text_for_embedding"],
        embedding_dim=outcome["embedding_dim"],
        pipeline_mode=outcome.get("pipeline_mode", "full"),
        detected=hits,
        all_results=ranked,
    )
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
# ---------------------------------------------------------------------------
|
| 74 |
+
# GET /results/{rule_id}
|
| 75 |
+
# ---------------------------------------------------------------------------
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@router.get("/results/{rule_id}", response_model=ResultsResponse)
def get_results(
    rule_id: str,
    svc: ResultService = Depends(_get_result_service),
):
    """
    Returns all stored MITRE ATT&CK techniques for a rule ID, sorted by confidence.
    Use Case 1 — §3.2.7
    - mappings: ALL techniques sorted by confidence desc (for Figure 4-11 Top 5 chart)
    - detected: primary + secondary (≥0.5) only (for Figure 4-12 WQL queries)
    """
    payload = svc.get_results_for_rule(rule_id)
    if payload is None:
        raise HTTPException(
            status_code=404,
            detail=f"No mapping results found for rule_id '{rule_id}'. "
            "Run POST /rules/analyze first.",
        )
    return ResultsResponse(
        rule_id=rule_id,
        mappings=[MappingResult(**row) for row in payload["mappings"]],
        detected=[MappingResult(**row) for row in payload["detected"]],
    )
|
murshid_backend/app/api/routes/stats.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""GET /api/stats — dashboard KPIs."""
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter, Depends
|
| 4 |
+
from sqlalchemy import func
|
| 5 |
+
from sqlalchemy.orm import Session
|
| 6 |
+
|
| 7 |
+
from app.db.session import get_db
|
| 8 |
+
from app.models.rule import Rule
|
| 9 |
+
from app.models.rule_technique_mapping import RuleTechniqueMapping
|
| 10 |
+
from app.models.query_template import QueryTemplate
|
| 11 |
+
from app.models.technique import Technique
|
| 12 |
+
|
| 13 |
+
router = APIRouter(prefix="/api", tags=["stats"])
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@router.get("/stats")
def get_stats(db: Session = Depends(get_db)):
    """Aggregate dashboard KPIs plus the ten most frequently mapped techniques."""
    # Reusable SQL expression for COUNT(mapping_id).
    mapping_count = func.count(RuleTechniqueMapping.mapping_id)

    total_rules = db.query(func.count(Rule.rule_id)).scalar() or 0
    total_mappings = db.query(mapping_count).scalar() or 0
    total_queries = (
        db.query(func.count(QueryTemplate.template_id))
        .filter(QueryTemplate.is_active.is_(True))
        .scalar()
        or 0
    )
    total_techniques = db.query(func.count(Technique.technique_id)).scalar() or 0

    top_techniques = (
        db.query(
            RuleTechniqueMapping.technique_id,
            mapping_count.label("count"),
        )
        .group_by(RuleTechniqueMapping.technique_id)
        .order_by(mapping_count.desc())
        .limit(10)
        .all()
    )

    return {
        "total_rules_mapped": total_rules,
        "total_techniques": total_techniques,
        "total_mappings": total_mappings,
        "total_queries": total_queries,
        "technique_frequency": [
            {"technique_id": row.technique_id, "count": row.count}
            for row in top_techniques
        ],
    }
|
murshid_backend/app/config.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
|
| 3 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 4 |
+
|
| 5 |
+
_GP_ROOT = Path(__file__).resolve().parent.parent.parent
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Settings(BaseSettings):
    """Runtime configuration; environment variables / .env override the defaults."""

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",  # tolerate unrelated variables in the environment
    )

    # NOTE(review): default DB credentials are local-dev placeholders — must be
    # overridden via MURSHID_DB_URL in any real deployment.
    murshid_db_url: str = "mysql+pymysql://root:password@localhost:3306/murshid_db"
    # Directory containing the trained model artifacts (.joblib/.npy/.json).
    murshid_models_dir: Path = _GP_ROOT / "Needed"
    hf_token: str | None = None      # Hugging Face token (needed to pull LLaMA)
    murshid_skip_llm: bool = False   # True → skip LLaMA summarization (LOCAL mode)
    # NOTE(review): "change_me" is a placeholder secret — override in production.
    secret_key: str = "change_me"

    # Model identifiers on the Hugging Face hub.
    llama_model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct"
    embed_model_id: str = "ehsanaghaei/SecureBERT_Plus"

    # Artifact file names, resolved relative to murshid_models_dir.
    logreg_joblib: str = "murshid_logreg_pipeline_manual_oof_pcatuned.joblib"
    logreg_thresholds_npy: str = "murshid_logreg_thresholds_manual_oof_pcatuned.npy"
    label_columns_json: str = "murshid_label_columns.json"


settings = Settings()  # module-level singleton imported throughout the app
|
murshid_backend/app/db/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Database layer."""
|
murshid_backend/app/db/base.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy.orm import DeclarativeBase
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class Base(DeclarativeBase):
    """Shared SQLAlchemy 2.x declarative base for all ORM models."""

    pass
|
murshid_backend/app/db/session.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections.abc import Generator
|
| 2 |
+
|
| 3 |
+
from sqlalchemy import create_engine
|
| 4 |
+
from sqlalchemy.orm import Session, sessionmaker
|
| 5 |
+
|
| 6 |
+
from app.config import settings
|
| 7 |
+
|
| 8 |
+
# SQLite (tests / local dev) needs check_same_thread=False and has no server
# connections worth pre-pinging or recycling; MySQL gets both safeguards.
_is_sqlite = settings.murshid_db_url.startswith("sqlite")

engine = create_engine(
    settings.murshid_db_url,
    connect_args={"check_same_thread": False} if _is_sqlite else {},
    pool_pre_ping=not _is_sqlite,     # detect stale MySQL connections
    pool_recycle=3600 if not _is_sqlite else -1,  # recycle hourly (MySQL wait_timeout)
)

SessionLocal = sessionmaker(bind=engine, autocommit=False, autoflush=False)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_db() -> Generator[Session, None, None]:
    """FastAPI dependency: yield a request-scoped Session, always closing it."""
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
murshid_backend/app/main.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Murshid Backend — FastAPI entrypoint.
|
| 3 |
+
|
| 4 |
+
Architecture:
|
| 5 |
+
API Layer → app/api/routes/
|
| 6 |
+
Service Layer→ app/services/
|
| 7 |
+
ML Layer → app/ml/
|
| 8 |
+
Repository → app/repositories/
|
| 9 |
+
Database → app/db/ (SQLAlchemy + Alembic, MySQL)
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
from contextlib import asynccontextmanager
|
| 15 |
+
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
from fastapi import FastAPI
|
| 19 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 20 |
+
from fastapi.staticfiles import StaticFiles
|
| 21 |
+
|
| 22 |
+
from app.api.routes import db_viewer, health, queries, rules, stats
|
| 23 |
+
from app.ml.pipeline import load_models, unload_models
|
| 24 |
+
|
| 25 |
+
_FRONTEND_DIR = Path(__file__).resolve().parent.parent.parent / "murshid_frontend"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """App lifespan: load ML models once at startup, release them on shutdown."""
    load_models()
    yield
    unload_models()
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
app = FastAPI(
    title="Murshid API",
    description=(
        "MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts. "
        "Transforms Wazuh IDS rules into actionable threat intelligence."
    ),
    version="1.0.0",
    lifespan=lifespan,  # loads/unloads ML models around the app lifetime
)

# NOTE(review): allow_origins=["*"] combined with allow_credentials=True —
# the CORS spec forbids a wildcard origin on credentialed requests; confirm
# Starlette's handling and restrict origins before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# API routers are registered before the "/" static mount so they take
# precedence over the catch-all frontend.
app.include_router(health.router)
app.include_router(stats.router)
app.include_router(db_viewer.router)
app.include_router(rules.router)
app.include_router(queries.router)

# Serve the SPA frontend only when the sibling directory exists (optional in dev).
if _FRONTEND_DIR.is_dir():
    app.mount("/", StaticFiles(directory=str(_FRONTEND_DIR), html=True), name="frontend")
|
murshid_backend/app/ml/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""ML layer — logic extracted from MurshidUIPipeline.ipynb without modifying the original."""
|
murshid_backend/app/ml/embedder.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SecureBERT+ embedder — extracted from MurshidUIPipeline.ipynb (cell 15).
|
| 3 |
+
Produces a 768-dim float32 embedding for a text paragraph.
|
| 4 |
+
Also provides build_text_for_embedding (cell 12).
|
| 5 |
+
Original file is NOT modified.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
from lxml import etree
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
import torch
|
| 15 |
+
from transformers import AutoModel, AutoTokenizer
|
| 16 |
+
_TORCH_OK = True
|
| 17 |
+
except (ImportError, OSError):
|
| 18 |
+
_TORCH_OK = False
|
| 19 |
+
|
| 20 |
+
from app.config import settings
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _norm_spaces(s: str) -> str:
|
| 24 |
+
return " ".join((s or "").split()).strip()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _strip_end_punct(s: str) -> str:
|
| 28 |
+
return (s or "").rstrip(". ").strip()
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def build_text_for_embedding(clean_rule: str, summary: str) -> str:
    """Combine LLM summary with rule description — cell 12 of notebook.

    Dedupe logic:
      * only one part present → return it unchanged
      * identical after case/end-punctuation folding → return the summary once
      * otherwise → "<summary>. <description>."

    Args:
        clean_rule: sanitized ``<rule>`` XML; its ``<description>`` text is used.
        summary: LLM-generated summary (may be empty).

    Returns:
        The combined, whitespace-normalized paragraph, or "" when both parts
        are empty.
    """
    rule_elem = etree.fromstring(clean_rule.strip())
    raw_desc = rule_elem.findtext("description") or ""
    description = _norm_spaces(raw_desc)
    summary = _norm_spaces(summary)
    # Fix: removed a redundant second _norm_spaces(description) call —
    # description is already normalized above.

    if not summary and not description:
        return ""
    if summary and not description:
        return summary
    if description and not summary:
        return description

    s0 = _strip_end_punct(summary).lower()
    d0 = _strip_end_punct(description).lower()

    # Same text modulo case/punctuation → keep only the summary.
    if s0 == d0:
        return _strip_end_punct(summary) + "."
    return f"{_strip_end_punct(summary)}. {_strip_end_punct(description)}."
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class SecureBERTEmbedder:
    """Mean-pooling embedder using ehsanaghaei/SecureBERT_Plus — cell 15."""

    MAX_LEN = 512       # encoder context window in tokens, incl. CLS/SEP
    BATCH_CHUNKS = 8    # number of chunks sent through the model per forward pass

    def __init__(self, model_id: str | None = None, device: str | None = None):
        """Load tokenizer + encoder; picks CUDA automatically when available.

        Raises:
            RuntimeError: when torch/transformers could not be imported.
        """
        if not _TORCH_OK:
            raise RuntimeError("torch/transformers not available — SecureBERTEmbedder cannot be initialised.")
        mid = model_id or settings.embed_model_id
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        # Deterministic cuDNN so repeated embeddings of the same text match.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        self.tokenizer = AutoTokenizer.from_pretrained(mid, use_fast=True)
        self.model = AutoModel.from_pretrained(mid).to(self.device)
        self.model.eval()
        self.cls_id = self.tokenizer.cls_token_id
        self.sep_id = self.tokenizer.sep_token_id
        # Fall back to SEP for padding when the tokenizer defines no pad token.
        self.pad_id = (
            self.tokenizer.pad_token_id
            if self.tokenizer.pad_token_id is not None
            else self.sep_id
        )

    def _chunk_text(self, text: str) -> list[list[int]]:
        """Split *text* into ≤MAX_LEN token-id sequences, each wrapped CLS…SEP."""
        token_ids = self.tokenizer.encode(text, add_special_tokens=False)
        chunk_size = self.MAX_LEN - 2  # leave room for CLS and SEP
        chunks = []
        for i in range(0, len(token_ids), chunk_size):
            piece = token_ids[i : i + chunk_size]
            chunks.append([self.cls_id] + piece + [self.sep_id])
        return chunks

    def embed_text(self, text: str) -> np.ndarray:
        """Return an L2-normalized float32 paragraph embedding for *text*.

        Each chunk is mask-aware mean-pooled over its tokens; the chunk
        vectors are then averaged into one paragraph vector.

        NOTE(review): empty *text* yields zero chunks, and np.vstack([]) below
        would raise — callers appear to pass non-empty text; confirm upstream.
        """
        chunks = self._chunk_text(text)
        all_embs: list[np.ndarray] = []

        for i in range(0, len(chunks), self.BATCH_CHUNKS):
            batch = chunks[i : i + self.BATCH_CHUNKS]
            max_len = max(len(x) for x in batch)
            input_ids, masks = [], []
            # Right-pad every chunk in the batch to the longest one.
            for x in batch:
                pad = max_len - len(x)
                input_ids.append(x + [self.pad_id] * pad)
                masks.append([1] * len(x) + [0] * pad)

            ids_t = torch.tensor(input_ids).to(self.device)
            mask_t = torch.tensor(masks).to(self.device)

            with torch.no_grad():
                out = self.model(input_ids=ids_t, attention_mask=mask_t)
                tok_emb = out.last_hidden_state
                # Masked mean pooling: padding positions contribute nothing.
                mask_exp = mask_t.unsqueeze(-1).expand(tok_emb.size()).float()
                summed = torch.sum(tok_emb * mask_exp, dim=1)
                denom = torch.clamp(mask_exp.sum(dim=1), min=1e-9)
                mean_pooled = summed / denom

            all_embs.append(mean_pooled.cpu().numpy())

        all_embs_np = np.vstack(all_embs)
        para_emb = all_embs_np.mean(axis=0)
        # Unit-normalize; epsilon guards against an all-zero vector.
        para_emb /= np.linalg.norm(para_emb) + 1e-12
        return para_emb.astype(np.float32)
|
murshid_backend/app/ml/logistic_model.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Logistic Regression — PRIMARY model per user decision.
|
| 3 |
+
|
| 4 |
+
Inference logic extracted VERBATIM from MurshidUIPipeline.ipynb (cell 18-19):
|
| 5 |
+
|
| 6 |
+
logreg_model = joblib.load(f"{BASE_PATH}/murshid_logreg_pipeline_manual_oof_pcatuned.joblib")
|
| 7 |
+
logreg_thr = np.load(f"{BASE_PATH}/murshid_logreg_thresholds_manual_oof_pcatuned.npy")
|
| 8 |
+
|
| 9 |
+
proba = logreg_model.predict_proba(X_user)
|
| 10 |
+
|
| 11 |
+
if isinstance(proba, list):
|
| 12 |
+
proba = np.column_stack([p[:, 1] for p in proba])
|
| 13 |
+
elif proba.ndim == 3:
|
| 14 |
+
proba = proba[:, :, 1]
|
| 15 |
+
|
| 16 |
+
proba = proba.reshape(-1)
|
| 17 |
+
|
| 18 |
+
pred_logreg = (proba >= logreg_thr).astype(int)
|
| 19 |
+
conf_logreg = proba * 100
|
| 20 |
+
gap_logreg = proba - logreg_thr
|
| 21 |
+
|
| 22 |
+
Original notebook file is NOT modified.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
from __future__ import annotations
|
| 26 |
+
|
| 27 |
+
import json
|
| 28 |
+
from pathlib import Path
|
| 29 |
+
|
| 30 |
+
import joblib
|
| 31 |
+
import numpy as np
|
| 32 |
+
|
| 33 |
+
from app.config import settings
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class LogisticRegressionModel:
    """
    Wraps the trained Logistic Regression pipeline + per-label thresholds.
    File structure (from notebook cell 18):
        logreg_model → sklearn Pipeline (PCA-tuned + OneVsRestClassifier(LogReg))
        logreg_thr   → np.ndarray shape (n_techniques,) per-label thresholds
    """

    def __init__(self, models_dir: Path | None = None) -> None:
        """Load pipeline, thresholds, and label names from *models_dir*.

        Raises:
            FileNotFoundError: when any of the three artifact files is absent.
            ValueError: when thresholds and label list disagree in length.
        """
        base = Path(models_dir or settings.murshid_models_dir).resolve()

        logreg_path = base / settings.logreg_joblib
        thr_path = base / settings.logreg_thresholds_npy
        labels_path = base / settings.label_columns_json

        # Fail fast with a clear message rather than erroring mid-load.
        for p in (logreg_path, thr_path, labels_path):
            if not p.is_file():
                raise FileNotFoundError(f"Missing model file: {p}")

        # --- notebook cell 18: load model + thresholds ---
        self._model = joblib.load(logreg_path)  # logreg_model
        self._thr = np.load(thr_path)           # logreg_thr

        with open(labels_path, encoding="utf-8") as f:
            self.technique_names: list[str] = json.load(f)

        # Sanity check: exactly one threshold per technique label.
        n = len(self.technique_names)
        if self._thr.shape[0] != n:
            raise ValueError(
                f"LogReg thresholds length {self._thr.shape[0]} != {n} labels"
            )

    # ------------------------------------------------------------------

    def predict(self, embedding_1d: np.ndarray) -> list[dict]:
        """
        Run LogReg inference exactly as in notebook cell 19.

        Returns list of dicts sorted by confidence_percent desc:
            technique_id, predicted, confidence_percent, proba, threshold, gap
        """
        X_user = embedding_1d.reshape(1, -1)

        # --- verbatim from notebook cell 19 ---
        proba = self._model.predict_proba(X_user)

        # Normalize predict_proba output: OneVsRest may return a list of
        # per-label (n, 2) arrays or a 3-D array; keep the positive class.
        if isinstance(proba, list):
            proba = np.column_stack([p[:, 1] for p in proba])
        elif proba.ndim == 3:
            proba = proba[:, :, 1]

        proba = proba.reshape(-1)

        pred_logreg = (proba >= self._thr).astype(int)
        conf_logreg = proba * 100
        gap_logreg = proba - self._thr
        # --- end verbatim ---

        results = [
            {
                "technique_id": self.technique_names[i],
                "predicted": bool(pred_logreg[i]),
                "confidence_percent": round(float(conf_logreg[i]), 2),
                "proba": round(float(proba[i]), 4),
                "threshold": round(float(self._thr[i]), 4),
                "gap": round(float(gap_logreg[i]), 4),
            }
            for i in range(len(self.technique_names))
        ]

        # sort: predicted first, then by confidence desc (notebook sort logic)
        return sorted(
            results,
            key=lambda r: (r["predicted"], r["confidence_percent"]),
            reverse=True,
        )
|
murshid_backend/app/ml/pipeline.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Full inference pipeline — combines sanitizer → summarizer → embedder → logistic_model.
|
| 3 |
+
Exposes analyze_rule(rule_xml) -> dict as the single callable for the service layer.
|
| 4 |
+
|
| 5 |
+
Modes:
|
| 6 |
+
FULL : LLaMA available + SecureBERT+ + LogReg (GPU/Colab required)
|
| 7 |
+
LOCAL : MURSHID_SKIP_LLM=true + SecureBERT+ + LogReg
|
| 8 |
+
→ skips LLaMA; uses <description> field as the paragraph text.
|
| 9 |
+
This allows POST /rules/analyze to work locally without a GPU.
|
| 10 |
+
LITE : torch not installed → uses a trivial bag-of-words fake embedding (testing only)
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import xml.etree.ElementTree as ET
|
| 16 |
+
from dataclasses import dataclass
|
| 17 |
+
from typing import Any
|
| 18 |
+
|
| 19 |
+
import numpy as np
|
| 20 |
+
|
| 21 |
+
from app.config import settings
|
| 22 |
+
from app.ml.logistic_model import LogisticRegressionModel
|
| 23 |
+
from app.ml.sanitizer import sanitize_rule_from_string
|
| 24 |
+
|
| 25 |
+
try:
|
| 26 |
+
import torch
|
| 27 |
+
from huggingface_hub import login as hf_login
|
| 28 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
| 29 |
+
from app.ml.embedder import SecureBERTEmbedder, build_text_for_embedding
|
| 30 |
+
from app.ml.summarizer import summarize_one_rule
|
| 31 |
+
_TORCH_AVAILABLE = True
|
| 32 |
+
_TORCH_ERROR: str | None = None
|
| 33 |
+
except (ImportError, OSError) as _e:
|
| 34 |
+
_TORCH_AVAILABLE = False
|
| 35 |
+
_TORCH_ERROR = str(_e)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
# Singleton container (loaded once at startup)
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
@dataclass
class _ModelStore:
    """Process-wide container for all loaded ML artifacts (singleton below)."""

    llama_model: Any | None = None        # LLaMA causal LM (FULL mode only)
    llama_tokenizer: Any | None = None    # tokenizer paired with llama_model
    llama_device: str = "cpu"             # "cuda" when a GPU was detected
    embedder: SecureBERTEmbedder | None = None   # SecureBERT+ mean-pool embedder
    logreg: LogisticRegressionModel | None = None  # LogReg pipeline + thresholds
    ready: bool = False                   # set True once load_models() completes


_store = _ModelStore()
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def load_models() -> None:
    """
    Load all models into _store.
    Call once at FastAPI startup (lifespan).

    Degrades gracefully: each component (LLaMA, SecureBERT+, LogReg) is loaded
    independently, warnings are printed for missing ones, and _store.ready is
    set True regardless — downstream code checks the individual fields.
    """
    # Authenticate with the HF hub first (required to download gated LLaMA).
    if _TORCH_AVAILABLE and settings.hf_token:
        hf_login(token=settings.hf_token, add_to_git_credential=False)

    if not settings.murshid_skip_llm:
        if not _TORCH_AVAILABLE:
            print("[Murshid] WARNING: torch not installed — skipping LLM load.")
        else:
            # 4-bit NF4 quantization so the 8B model fits on a single GPU.
            bnb_cfg = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
            )
            tok = AutoTokenizer.from_pretrained(settings.llama_model_id, use_fast=True)
            # LLaMA ships without a pad token; reuse EOS for padding.
            if tok.pad_token is None:
                tok.pad_token = tok.eos_token
            m = AutoModelForCausalLM.from_pretrained(
                settings.llama_model_id,
                quantization_config=bnb_cfg,
                device_map="auto",
                low_cpu_mem_usage=True,
                dtype=torch.float16,
            )
            m.config.pad_token_id = tok.pad_token_id
            m.eval()
            _store.llama_tokenizer = tok
            _store.llama_model = m
            _store.llama_device = "cuda" if torch.cuda.is_available() else "cpu"

    # SecureBERT+ embedder — optional: failures only degrade to LITE mode.
    if _TORCH_AVAILABLE:
        try:
            _store.embedder = SecureBERTEmbedder()
        except Exception as exc:
            print(f"[Murshid] WARNING: SecureBERT+ not loaded — {exc}")
            _store.embedder = None
    else:
        print("[Murshid] WARNING: torch not installed — embedder skipped.")
        _store.embedder = None

    # LogReg classifier — optional at load time; analyze_rule raises if absent.
    try:
        _store.logreg = LogisticRegressionModel()
    except FileNotFoundError as exc:
        print(f"[Murshid] WARNING: LogReg model files missing — {exc}")
        _store.logreg = None
    except Exception as exc:
        print(f"[Murshid] WARNING: LogReg not loaded — {exc}")
        _store.logreg = None

    # Ready means "load attempt finished", not "everything loaded".
    _store.ready = True
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def unload_models() -> None:
    """Drop every loaded model reference and mark the store not-ready."""
    for attr in ("llama_model", "llama_tokenizer", "embedder", "logreg"):
        setattr(_store, attr, None)
    _store.ready = False
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def is_ready() -> bool:
    """True once load_models() finished (even when some components failed)."""
    return _store.ready
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# ---------------------------------------------------------------------------
|
| 125 |
+
# Public function
|
| 126 |
+
# ---------------------------------------------------------------------------
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _extract_description(clean_xml: str) -> str:
|
| 130 |
+
"""Extract <description> text from sanitized rule XML."""
|
| 131 |
+
try:
|
| 132 |
+
elem = ET.fromstring(clean_xml.strip())
|
| 133 |
+
desc = elem.findtext("description") or ""
|
| 134 |
+
return " ".join(desc.split()).strip()
|
| 135 |
+
except ET.ParseError:
|
| 136 |
+
return ""
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def analyze_rule(rule_xml: str) -> dict:
    """
    Full pipeline: XML → sanitize → summarize → embed → LogReg → ranked results.

    Operates in three modes depending on environment:

    FULL mode (MURSHID_SKIP_LLM=false, GPU available):
        LLaMA generates a natural-language summary → SecureBERT+ embeds it → LogReg predicts.

    LOCAL mode (MURSHID_SKIP_LLM=true, torch installed):
        Skips LLaMA. Uses the rule's <description> field directly as the text.
        SecureBERT+ still embeds it properly → LogReg predicts.
        ⚠️ Accuracy slightly lower than FULL mode (no LLaMA enrichment).

    LITE mode (torch not installed):
        Uses a random unit-vector as a placeholder embedding.
        Results are meaningless — for structural testing only.

    Returns:
        {
          "sanitized_xml": str,
          "summary": str,            # LLaMA output OR description OR "(lite mode)"
          "text_for_embedding": str,
          "embedding_dim": int,
          "pipeline_mode": str,      # "full" | "local" | "lite"
          "results": [...],          # all techniques sorted by confidence desc
          "detected": [...],         # predicted == True only
        }

    Raises:
        RuntimeError: models not loaded, or the LogReg model files are missing.
        ValueError: input does not contain a complete <rule>...</rule> element.
    """
    if not _store.ready:
        raise RuntimeError("Models not loaded. Call load_models() first.")

    if "<rule" not in rule_xml or "</rule>" not in rule_xml:
        raise ValueError("Incomplete XML: must contain <rule> and </rule>.")

    if _store.logreg is None:
        raise RuntimeError(
            "LogReg model not loaded. "
            "Copy the .joblib and .npy files to MURSHID_MODELS_DIR and restart."
        )

    clean_xml = sanitize_rule_from_string(rule_xml)

    # ── Choose mode ────────────────────────────────────────────────────────────
    # FULL mode needs BOTH the LLaMA pair AND the embedder.  Previously the
    # embedder was dereferenced unchecked, so a loaded LLaMA plus a failed
    # SecureBERT+ load raised AttributeError; now that combination degrades
    # to LITE mode instead.
    if (
        _store.llama_model is not None
        and _store.llama_tokenizer is not None
        and _store.embedder is not None
    ):
        # FULL mode: LLaMA summary
        mode = "full"
        summary = summarize_one_rule(
            clean_xml,
            _store.llama_model,
            _store.llama_tokenizer,
            _store.llama_device,
        )
        text = build_text_for_embedding(clean_xml, summary)
        embedding: np.ndarray = _store.embedder.embed_text(text)

    elif _store.embedder is not None:
        # LOCAL mode: no LLaMA, use <description> as text
        mode = "local"
        desc = _extract_description(clean_xml)
        summary = desc or "No description available."
        text = desc or clean_xml[:300]
        embedding = _store.embedder.embed_text(text)

    else:
        # LITE mode: no embedder — deterministic pseudo-random unit vector
        # keyed on the text (structural testing only; results are meaningless).
        mode = "lite"
        desc = _extract_description(clean_xml)
        summary = f"(lite mode — no embedder) {desc}"
        text = desc or clean_xml[:300]
        dim = 768
        raw = np.random.default_rng(abs(hash(text)) % (2**32)).random(dim).astype(np.float32)
        embedding = raw / (np.linalg.norm(raw) + 1e-12)

    # ── Classify ───────────────────────────────────────────────────────────────
    all_results = _store.logreg.predict(embedding)
    detected = [r for r in all_results if r["predicted"]]

    return {
        "sanitized_xml": clean_xml,
        "summary": summary,
        "text_for_embedding": text,
        "embedding_dim": int(embedding.shape[0]),
        "pipeline_mode": mode,
        "results": all_results,
        "detected": detected,
    }
|
murshid_backend/app/ml/sanitizer.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Rule sanitizer — extracted from MurshidUIPipeline.ipynb (cell 10).
|
| 3 |
+
Removes: mitre, if_sid, group, if_group tags from Wazuh XML rule.
|
| 4 |
+
Original file is NOT modified.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import copy
|
| 10 |
+
import xml.etree.ElementTree as ET
|
| 11 |
+
|
| 12 |
+
REMOVE_TAGS_ANYWHERE: set[str] = {"mitre", "if_sid", "group", "if_group"}
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def _remove_tag_anywhere(root_elem: ET.Element, tag: str) -> None:
|
| 16 |
+
for parent in list(root_elem.iter()):
|
| 17 |
+
for child in list(parent):
|
| 18 |
+
if child.tag == tag:
|
| 19 |
+
parent.remove(child)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def sanitize_rule(rule_elem: ET.Element) -> ET.Element:
    """Return a deep copy of *rule_elem* with all blacklisted tags removed.

    The input element is never mutated; removal order over the tag set does
    not affect the result.
    """
    cleaned = copy.deepcopy(rule_elem)
    for tag in REMOVE_TAGS_ANYWHERE:
        _remove_tag_anywhere(cleaned, tag)
    return cleaned
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def sanitize_rule_from_string(rule_xml: str) -> str:
    """Parse a rule XML string, sanitize it, and serialize back to unicode XML.

    Raises xml.etree.ElementTree.ParseError on malformed input.
    """
    parsed = ET.fromstring(rule_xml.strip())
    return ET.tostring(sanitize_rule(parsed), encoding="unicode")
|
murshid_backend/app/ml/summarizer.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LLM summarizer — extracted from MurshidUIPipeline.ipynb (cells 11-12).
|
| 3 |
+
Converts sanitized Wazuh XML rule to a one-sentence behavior summary.
|
| 4 |
+
Original file is NOT modified.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import re
|
| 11 |
+
import unicodedata
|
| 12 |
+
|
| 13 |
+
import torch
|
| 14 |
+
|
| 15 |
+
# --------------------------------------------------------------------------
|
| 16 |
+
# Constants (identical to notebook)
|
| 17 |
+
# --------------------------------------------------------------------------
|
| 18 |
+
MAX_INPUT_TOKENS = 2048
|
| 19 |
+
MAX_NEW_TOKENS = 160
|
| 20 |
+
DO_SAMPLE = False
|
| 21 |
+
NUM_BEAMS = 4
|
| 22 |
+
MAX_RETRIES = 3
|
| 23 |
+
|
| 24 |
+
SYSTEM_INSTR = (
|
| 25 |
+
"You are a cybersecurity expert.\n"
|
| 26 |
+
"You will be provided with a Wazuh rule in XML format.\n"
|
| 27 |
+
"Write EXACTLY ONE sentence describing the observable event pattern the rule matches.\n\n"
|
| 28 |
+
"HARD CONSTRAINTS:\n"
|
| 29 |
+
'1) Output must be minified JSON only: {"summary":"..."}\n'
|
| 30 |
+
"2) ONE sentence only.\n"
|
| 31 |
+
"3) Start with one of: Detects, Monitors, Identifies, Flags, Reports, Tracks, Captures.\n"
|
| 32 |
+
"4) Use ONLY facts present in the XML. Describe the observable system event only.\n"
|
| 33 |
+
"5) Do NOT infer attacker intent, attack type, or technique.\n"
|
| 34 |
+
"6) Do NOT mention MITRE, ATT&CK, or attack technique names unless explicitly present in the XML.\n"
|
| 35 |
+
"7) Do NOT use speculative language: likely, potentially, possible, possibly, may indicate, or could indicate.\n"
|
| 36 |
+
"8) Length: 7 to 18 words.\n"
|
| 37 |
+
"9) SHOULD include a clear event type when possible.\n"
|
| 38 |
+
"10) Mention at least ONE concrete indicator if available (event_id, process name, file path,\n"
|
| 39 |
+
" registry key, service, protocol/port, URL pattern, command, username, IP).\n"
|
| 40 |
+
"If only a single indicator exists, still produce a complete behavior-focused sentence.\n"
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
REPAIR_HINT = (
|
| 44 |
+
"Your previous output was rejected.\n"
|
| 45 |
+
"Fix it to satisfy ALL constraints:\n"
|
| 46 |
+
'- Output MUST be minified JSON only: {"summary":"..."}\n'
|
| 47 |
+
"- One sentence only.\n"
|
| 48 |
+
"- Keep it behavior-focused.\n"
|
| 49 |
+
"- Include at least ONE concrete indicator if present in the XML.\n"
|
| 50 |
+
"- Do NOT add any extra text outside JSON.\n"
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
VERB_OK = ("Detects", "Monitors", "Identifies", "Flags", "Reports", "Tracks", "Captures")
|
| 54 |
+
JSON_OBJ_RE = re.compile(r"\{.*?\}", re.DOTALL)
|
| 55 |
+
BAD_INTRO_RE = re.compile(
|
| 56 |
+
r"^\s*(this\s+(wazuh\s+)?rule|the\s+rule|this\s+alert)\b", re.IGNORECASE
|
| 57 |
+
)
|
| 58 |
+
BAD_INTENT_RE = re.compile(r"\b(likely|potentially|possible|maybe)\b", re.IGNORECASE)
|
| 59 |
+
GENERIC_RE = re.compile(
|
| 60 |
+
r"\b(detects activity|detects suspicious activity|detects potentially suspicious activity|"
|
| 61 |
+
r"monitors activity|reports activity|detects an event pattern defined by the rule indicators)\b",
|
| 62 |
+
re.IGNORECASE,
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# --------------------------------------------------------------------------
|
| 67 |
+
# Helpers (identical to notebook)
|
| 68 |
+
# --------------------------------------------------------------------------
|
| 69 |
+
|
| 70 |
+
def _build_prompt(rule_xml: str, tokenizer, extra_hint: str = "") -> str:
    """Assemble the chat-template prompt: system constraints (+ optional repair
    hint) followed by the rule XML as the user turn."""
    system_text = SYSTEM_INSTR if not extra_hint else SYSTEM_INSTR + "\n" + extra_hint
    user_text = f"Wazuh rule XML:\n{rule_xml}\n\nReturn JSON only:"
    messages = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def _looks_broken_encoding(s: str) -> bool:
|
| 78 |
+
return any(m in s for m in ("Ã", "Ð", "Ñ", "â", "â")) if s else False
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _try_extract_json_summary(text: str) -> str | None:
    """Pull the "summary" value out of (possibly noisy) model output.

    Tries, in order: whole-string JSON parse, the first {...} blob found by
    regex, and finally a raw '"summary": "..."' regex on that blob.
    Returns None when nothing usable is found.
    """
    candidate = (text or "").strip()
    if not candidate:
        return None

    def _summary_of(obj) -> str | None:
        # Only accept a dict carrying a string "summary" value.
        if isinstance(obj, dict) and isinstance(obj.get("summary"), str):
            return obj["summary"].strip()
        return None

    # Fast path: the whole output already is the JSON envelope.
    if candidate.startswith("{") and '"summary"' in candidate:
        try:
            found = _summary_of(json.loads(candidate))
            if found is not None:
                return found
        except Exception:
            pass

    # Slow path: dig the first JSON-looking blob out of surrounding chatter.
    match = JSON_OBJ_RE.search(candidate)
    if match and '"summary"' in match.group(0):
        blob = match.group(0)
        try:
            found = _summary_of(json.loads(blob))
            if found is not None:
                return found
        except Exception:
            # Malformed JSON — last resort: scrape the value with a regex.
            rough = re.search(r'"summary"\s*:\s*"([^"]+)"', blob)
            if rough:
                return rough.group(1).strip()
    return None
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def _normalize_one_sentence(s: str) -> str:
    """Coerce raw model output into a single, verb-led, period-terminated sentence.

    Returns "" when the input collapses to nothing after cleanup, signalling
    the caller to retry or fall back to the rescue path.
    """
    # Collapse whitespace runs; NFKC folds width/compatibility character variants.
    s = re.sub(r"\s+", " ", (s or "").strip()).strip()
    s = unicodedata.normalize("NFKC", s)
    if not s:
        return ""
    # Strip forbidden intros such as "This rule ..." (behavior-first constraint).
    if BAD_INTRO_RE.match(s):
        s = BAD_INTRO_RE.sub("", s).lstrip(":,- ").strip()
    if not s:
        return ""
    # Force an approved leading verb.  NOTE(review): the conditional binds the
    # whole assignment — a one-character string collapses to "" here rather
    # than becoming "Detects x"; the next guard then returns "".
    if not any(s.startswith(v) for v in VERB_OK):
        s = "Detects " + (s[0].lower() + s[1:]) if len(s) > 1 else ""
    if not s:
        return ""
    # Keep only the first sentence; append a period when no terminator exists.
    m = re.search(r"[.!?](?:\s|$)", s)
    s = s[: m.end()].strip() if m else s + "."
    # De-duplicate accidental "Detects Detects ..." prefixes.
    s = re.sub(r"^(Detects\s+)+", "Detects ", s).strip()
    return re.sub(r"\s+", " ", s).strip()
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def _looks_truncated(s: str) -> bool:
|
| 126 |
+
return not s or s.strip().endswith(("(", ":", " -", ","))
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _has_behavior_signal(s: str) -> bool:
|
| 130 |
+
kws = ["create","delete","execute","spawn","launch","login","logon","authentication",
|
| 131 |
+
"connect","request","query","modify","registry","process","command","file",
|
| 132 |
+
"service","ip","url","dns","http","vpn","account"]
|
| 133 |
+
return any(k in s.lower() for k in kws)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def _has_indicator_signal(s: str) -> bool:
|
| 137 |
+
kws = [".exe",".dll",".ps1",".bat",".cmd","powershell","cmd.exe","reg.exe","rundll32",
|
| 138 |
+
"svchost","registry","temp","system32","event_id","http","dns","ip","url","port","key"]
|
| 139 |
+
return any(k in s.lower() for k in kws)
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def _is_bad(s: str) -> bool:
    """Reject a candidate summary that violates the hard constraints.

    Bad when: empty; forbidden intro/intent/generic phrasing; mojibake or
    truncation; word count outside 7-18; no behavior keyword; or leftover
    JSON envelope text.
    """
    if not s:
        return True
    if BAD_INTRO_RE.match(s) or BAD_INTENT_RE.search(s) or GENERIC_RE.search(s):
        return True
    if _looks_broken_encoding(s) or _looks_truncated(s):
        return True
    word_count = len(s.split())
    if not (7 <= word_count <= 18):
        return True
    if not _has_behavior_signal(s):
        return True
    # JSON scaffolding surviving cleanup means the model echoed the envelope.
    if s.startswith("{") and "summary" in s:
        return True
    return '"summary"' in s and "{" in s
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def _is_catastrophic(s: str) -> bool:
    """Completely unusable output: empty, mojibake, truncated, or under 3 words."""
    if not s:
        return True
    return _looks_broken_encoding(s) or _looks_truncated(s) or len(s.split()) < 3
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _score(s: str) -> int:
    """Heuristic quality score (0-10) used to rank candidate summaries."""
    total = 0
    if 7 <= len(s.split()) <= 18:   # within the required length band
        total += 3
    if _has_behavior_signal(s):     # names an observable behavior
        total += 3
    if _has_indicator_signal(s):    # cites a concrete indicator
        total += 2
    if not GENERIC_RE.search(s):    # avoids boilerplate phrasing
        total += 1
    if not BAD_INTENT_RE.search(s): # avoids speculative language
        total += 1
    return total
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def _rescue_finalize(s: str) -> str:
    """Last-resort repair: always return a usable one-sentence summary.

    Applied only when every generation attempt was rejected; progressively
    strips speculative wording, pads too-short sentences, and enforces the
    verb-first / ≤18-word / period-terminated contract.
    """
    s = _normalize_one_sentence(s)
    if not s:
        return "Detects rule-matched behavior."
    # Drop trailing speculative clauses, then any leftover speculative words.
    s = re.sub(r",\s*(possibly|potentially|maybe|may)\b.*$", "", s, flags=re.IGNORECASE).strip()
    s = re.sub(r"\b(possibly|potentially|maybe|may)\b", "", s, flags=re.IGNORECASE)
    s = re.sub(r"\s+", " ", s).strip()
    if len(s.split()) < 7:
        # Too short: substitute a canned sentence for a recognized indicator,
        # otherwise pad with generic filler to reach a complete sentence.
        low = s.lower()
        for kw, rep in [
            ("powershell", "Detects powershell.exe process execution."),
            ("cmd", "Detects cmd.exe process execution."),
            ("reg", "Detects reg.exe process execution."),
            ("svchost", "Detects svchost.exe process execution."),
        ]:
            if kw in low:
                s = rep
                break
        else:
            s = s.rstrip(".") + " matching rule indicators."
    if _looks_truncated(s):
        s = s.rstrip(".") + " matching rule indicators."
    # Ensure an approved leading verb (conditional binds the whole assignment).
    if not any(s.startswith(v) for v in VERB_OK):
        s = "Detects " + s[0].lower() + s[1:] if len(s) > 1 else "Detects rule-matched behavior."
    # Hard cap at 18 words, keeping the terminal period.
    words = s.split()
    if len(words) > 18:
        s = " ".join(words[:18]).rstrip(".") + "."
    return re.sub(r"\s+", " ", s if s.endswith(".") else s + ".").strip()
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
# --------------------------------------------------------------------------
|
| 199 |
+
# Public API
|
| 200 |
+
# --------------------------------------------------------------------------
|
| 201 |
+
|
| 202 |
+
def summarize_one_rule(rule_xml: str, model, tokenizer, device: str | None = None) -> str:
    """Generate a one-sentence summary for a sanitized Wazuh rule XML string.

    Retries up to MAX_RETRIES times (adding REPAIR_HINT from the second
    attempt on), keeps the best-scoring usable candidate, and falls back to
    _rescue_finalize() when every attempt is rejected.  Always returns a
    non-empty sentence.

    Args:
        rule_xml: sanitized rule XML (see sanitizer module).
        model / tokenizer: loaded HF causal LM pair.
        device: "cuda"/"cpu"; auto-detected when None.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # FIX: `pad_token_id or eos_token_id` treated a legitimate token id of 0
    # as "missing" and silently substituted the fallback — compare to None.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
    eos_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else pad_id

    best: str | None = None        # first fully-compliant candidate
    best_any: str | None = None    # best-scoring usable (non-catastrophic) one
    last_raw = ""
    last_cleaned = ""

    for attempt in range(1, MAX_RETRIES + 1):
        prompt = _build_prompt(
            rule_xml, tokenizer, extra_hint=REPAIR_HINT if attempt >= 2 else ""
        )
        inputs = tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_TOKENS
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=DO_SAMPLE,
                num_beams=NUM_BEAMS,
                pad_token_id=pad_id,
                eos_token_id=eos_id,
                repetition_penalty=1.05,
                no_repeat_ngram_size=3,
            )

        # Decode only the newly generated tail (skip the echoed prompt tokens).
        raw = tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True
        ).strip()
        last_raw = raw

        parsed = _try_extract_json_summary(raw)
        if parsed is None:
            continue

        cleaned = _normalize_one_sentence(parsed)
        last_cleaned = cleaned

        if cleaned and not _is_catastrophic(cleaned):
            # Track the best usable candidate even if it is not fully compliant.
            if best_any is None or _score(cleaned) > _score(best_any):
                best_any = cleaned

        if not _is_bad(cleaned):
            best = cleaned
            break

    if best is None:
        if best_any and not _is_catastrophic(best_any):
            best = best_any
        else:
            # Nothing usable at all — repair whatever we saw last.
            src = last_cleaned or _try_extract_json_summary(last_raw) or last_raw
            best = _rescue_finalize(src)

    return best
|
murshid_backend/app/ml/svm_model.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SVM classifier — PRIMARY model per the report (§3.1.3 + §4.1).
|
| 3 |
+
|
| 4 |
+
Report quote:
|
| 5 |
+
"the Support Vector Machine (SVM) was adopted as the core classifier"
|
| 6 |
+
"classification using SVM to predict the associated MITRE ATT&CK techniques"
|
| 7 |
+
|
| 8 |
+
Inference logic (verbatim from MurshidUIPipeline.ipynb cell 16+19):
|
| 9 |
+
scores = svm_model.named_steps["clf"].decision_function(
|
| 10 |
+
svm_model.named_steps["pca"].transform(X_user)
|
| 11 |
+
).reshape(-1)
|
| 12 |
+
pred = (scores >= thr_per_label).astype(int)
|
| 13 |
+
margins = scores - thr_per_label
|
| 14 |
+
conf = sigmoid(margins) * 100
|
| 15 |
+
|
| 16 |
+
Original notebook file is NOT modified.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
import json
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
|
| 24 |
+
import joblib
|
| 25 |
+
import numpy as np
|
| 26 |
+
|
| 27 |
+
from app.config import settings
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _sigmoid(x: np.ndarray) -> np.ndarray:
|
| 31 |
+
"""Probability calibration: sigmoid(margin) — notebook cell 17."""
|
| 32 |
+
x = np.clip(x, -30, 30)
|
| 33 |
+
return 1.0 / (1.0 + np.exp(-x))
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class SVMModel:
    """
    Wraps the trained LinearSVC pipeline with per-label thresholds.
    Structure of the .joblib pack (from notebook):
        svm_pack["model"] → sklearn Pipeline (PCA + LinearSVC)
        svm_pack["thresholds_per_label"] → np.ndarray shape (n_techniques,)
    """

    def __init__(self, models_dir: Path | None = None) -> None:
        """Load the SVM pack and label list from *models_dir* (settings default).

        Raises:
            FileNotFoundError: the .joblib pack or the labels JSON is missing.
            ValueError: thresholds length does not match the label count.
        """
        base = Path(models_dir or settings.murshid_models_dir).resolve()

        svm_path = base / settings.svm_joblib
        labels_path = base / settings.label_columns_json

        # Fail fast with a clear message before attempting joblib.load().
        for p in (svm_path, labels_path):
            if not p.is_file():
                raise FileNotFoundError(f"Missing model file: {p}")

        svm_pack = joblib.load(svm_path)
        self._model = svm_pack["model"]  # Pipeline(PCA → LinearSVC)
        self._thresholds = np.asarray(
            svm_pack["thresholds_per_label"], dtype=np.float64
        )

        # Ordered MITRE technique ids; index-aligned with thresholds/scores.
        with open(labels_path, encoding="utf-8") as f:
            self.technique_names: list[str] = json.load(f)

        # Sanity check: exactly one tuned threshold per technique label.
        n = len(self.technique_names)
        if self._thresholds.shape[0] != n:
            raise ValueError(
                f"SVM thresholds length {self._thresholds.shape[0]} != {n} labels"
            )

    # ------------------------------------------------------------------

    def predict(self, embedding_1d: np.ndarray) -> list[dict]:
        """
        Run SVM inference exactly as in the notebook.

        Returns list of dicts sorted by confidence_percent desc:
            technique_id, predicted, confidence_percent, score, threshold, margin
        """
        # Single sample → 2-D matrix as required by sklearn transform/predict.
        X = embedding_1d.reshape(1, -1)

        # Apply PCA then LinearSVC decision function (notebook cell 19)
        scores = self._model.named_steps["clf"].decision_function(
            self._model.named_steps["pca"].transform(X)
        ).reshape(-1)

        # Per-label tuned thresholds replace the default 0 decision boundary.
        pred = (scores >= self._thresholds).astype(int)
        margins = scores - self._thresholds
        conf = _sigmoid(margins) * 100  # calibrated confidence (%)

        results = [
            {
                "technique_id": self.technique_names[i],
                "predicted": bool(pred[i]),
                "confidence_percent": round(float(conf[i]), 2),
                "score": round(float(scores[i]), 4),
                "threshold": round(float(self._thresholds[i]), 4),
                "margin": round(float(margins[i]), 4),
            }
            for i in range(len(self.technique_names))
        ]

        return sorted(results, key=lambda r: r["confidence_percent"], reverse=True)
|
murshid_backend/app/models/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SQLAlchemy ORM models (tables defined exactly per ER Diagram §3.2.6 of the report)."""
|
| 2 |
+
from app.models.user import User
|
| 3 |
+
from app.models.mapping_job import MappingJob
|
| 4 |
+
from app.models.rule import Rule
|
| 5 |
+
from app.models.technique import Technique
|
| 6 |
+
from app.models.rule_technique_mapping import RuleTechniqueMapping
|
| 7 |
+
from app.models.query_template import QueryTemplate
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"User",
|
| 11 |
+
"MappingJob",
|
| 12 |
+
"Rule",
|
| 13 |
+
"Technique",
|
| 14 |
+
"RuleTechniqueMapping",
|
| 15 |
+
"QueryTemplate",
|
| 16 |
+
]
|
murshid_backend/app/models/mapping_job.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MappingJob entity — ER Diagram §3.2.6
|
| 3 |
+
Attributes: job_ID, file_name, timestamp, rules_count, status, progress
|
| 4 |
+
Linked to User via "uploads" relationship.
|
| 5 |
+
Also visible in Figure 4-14 (Mapping Progress Table).
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import enum
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
from sqlalchemy import DateTime, Enum, ForeignKey, Integer, String, func
|
| 12 |
+
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
| 13 |
+
|
| 14 |
+
from app.db.base import Base
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class JobStatus(str, enum.Enum):
    """Lifecycle states of a MappingJob; str mixin keeps values JSON/DB friendly."""

    pending = "pending"
    running = "running"
    done = "done"
    failed = "failed"
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class MappingJob(Base):
    """Batch upload of Wazuh rules submitted by a user for technique mapping.

    ER Diagram §3.2.6 entity; surfaced in the Mapping Progress Table (Fig. 4-14).
    """

    __tablename__ = "mapping_jobs"

    # Surrogate primary key (job_ID in the ER diagram).
    job_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    # Owning user — the "uploads" relationship.
    user_id: Mapped[int] = mapped_column(ForeignKey("users.user_id"), nullable=False)
    # Name of the uploaded rules file.
    file_name: Mapped[str] = mapped_column(String(255), nullable=False)
    # Number of rules contained in the upload.
    rules_count: Mapped[int] = mapped_column(Integer, default=0)
    status: Mapped[JobStatus] = mapped_column(
        Enum(JobStatus), nullable=False, default=JobStatus.pending
    )
    # Job progress counter — presumably percent complete (0-100); TODO confirm
    # against the progress UI.
    progress: Mapped[int] = mapped_column(Integer, default=0)
    # Creation time assigned by the database (server_default=func.now()).
    timestamp: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=func.now()
    )

    user: Mapped["User"] = relationship(back_populates="jobs")
    rules: Mapped[list["Rule"]] = relationship(back_populates="job")
|
murshid_backend/app/models/query_template.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
QueryTemplate entity — ER Diagram §3.2.6
|
| 3 |
+
Attributes: Template_ID, Purpose, wql_query, Note
|
| 4 |
+
Linked to Technique. Admin can add/update/disable (Use Case 7, §3.2.7).
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from sqlalchemy import Boolean, ForeignKey, String, Text
|
| 8 |
+
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
| 9 |
+
|
| 10 |
+
from app.db.base import Base
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class QueryTemplate(Base):
    """Reusable WQL hunting query bound to one ATT&CK technique (ER §3.2.6)."""

    __tablename__ = "query_templates"

    template_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    # Technique this template hunts for.
    technique_id: Mapped[str] = mapped_column(
        String(20), ForeignKey("techniques.technique_id"), nullable=False
    )
    # Short human-readable purpose of the query.
    purpose: Mapped[str | None] = mapped_column(String(255), nullable=True)
    # WQL with placeholders: ${HOST}, ${USER}, ${IP}
    wql_query: Mapped[str] = mapped_column(Text, nullable=False)
    # Free-form analyst note.
    note: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Admin can disable without deleting — Use Case 7
    is_active: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)

    technique: Mapped["Technique"] = relationship(back_populates="query_templates")
|
murshid_backend/app/models/rule.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Rule entity — ER Diagram §3.2.6
|
| 3 |
+
Attributes: Rule_ID, embedding_vector, job_ID (FK)
|
| 4 |
+
Rule_ID is the Wazuh rule ID string (e.g. "597").
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from sqlalchemy import ForeignKey, String, Text
|
| 8 |
+
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
| 9 |
+
|
| 10 |
+
from app.db.base import Base
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Rule(Base):
    """A single Wazuh rule, keyed by its native rule ID string (e.g. "597")."""

    __tablename__ = "rules"

    rule_id: Mapped[str] = mapped_column(String(50), primary_key=True)
    # Optional link to the upload batch that introduced this rule.
    job_id: Mapped[int | None] = mapped_column(
        ForeignKey("mapping_jobs.job_id"), nullable=True
    )
    # 768-dimensional float vector stored as JSON string; kept nullable for
    # rules where only the mapping result is persisted without the vector.
    embedding_vector: Mapped[str | None] = mapped_column(Text, nullable=True)

    job: Mapped["MappingJob | None"] = relationship(back_populates="rules")
    # Deleting a rule also deletes its technique mappings (delete-orphan cascade).
    technique_mappings: Mapped[list["RuleTechniqueMapping"]] = relationship(
        back_populates="rule", cascade="all, delete-orphan"
    )
|
murshid_backend/app/models/rule_technique_mapping.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RuleTechniqueMapping associative entity — ER Diagram §3.2.6
|
| 3 |
+
Attributes: Mapping_ID, Rule_ID (FK), Technique_ID (FK), confidence_score
|
| 4 |
+
Index on rule_id for fast lookup — mentioned explicitly in Use Case 6 (§3.2.7).
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from sqlalchemy import Float, ForeignKey, Index, Integer, String
|
| 8 |
+
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
| 9 |
+
|
| 10 |
+
from app.db.base import Base
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class RuleTechniqueMapping(Base):
|
| 14 |
+
__tablename__ = "rule_technique_mappings"
|
| 15 |
+
|
| 16 |
+
mapping_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
|
| 17 |
+
rule_id: Mapped[str] = mapped_column(
|
| 18 |
+
String(50), ForeignKey("rules.rule_id"), nullable=False
|
| 19 |
+
)
|
| 20 |
+
technique_id: Mapped[str] = mapped_column(
|
| 21 |
+
String(20), ForeignKey("techniques.technique_id"), nullable=False
|
| 22 |
+
)
|
| 23 |
+
confidence_score: Mapped[float] = mapped_column(Float, nullable=False)
|
| 24 |
+
|
| 25 |
+
rule: Mapped["Rule"] = relationship(back_populates="technique_mappings")
|
| 26 |
+
technique: Mapped["Technique"] = relationship(back_populates="rule_mappings")
|
| 27 |
+
|
| 28 |
+
__table_args__ = (
|
| 29 |
+
# "creates an index on rule_id for efficient lookup" — Use Case 6
|
| 30 |
+
Index("ix_rule_technique_rule_id", "rule_id"),
|
| 31 |
+
)
|
murshid_backend/app/models/technique.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Technique entity — ER Diagram §3.2.6
|
| 3 |
+
Attributes: Technique_ID, technique_name, tactic
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from sqlalchemy import String
|
| 7 |
+
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
| 8 |
+
|
| 9 |
+
from app.db.base import Base
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class Technique(Base):
    """MITRE ATT&CK technique reference row (ER §3.2.6)."""

    __tablename__ = "techniques"

    # ATT&CK technique identifier string (primary key, max 20 chars).
    technique_id: Mapped[str] = mapped_column(String(20), primary_key=True)
    technique_name: Mapped[str] = mapped_column(String(255), nullable=False)
    # Parent tactic name; nullable when not recorded.
    tactic: Mapped[str | None] = mapped_column(String(100), nullable=True)

    rule_mappings: Mapped[list["RuleTechniqueMapping"]] = relationship(
        back_populates="technique"
    )
    query_templates: Mapped[list["QueryTemplate"]] = relationship(
        back_populates="technique"
    )
|
murshid_backend/app/models/user.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
User entity — ER Diagram §3.2.6
|
| 3 |
+
Attributes: User_ID, username, email, password_hash, role
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import enum
|
| 7 |
+
|
| 8 |
+
from sqlalchemy import Enum, String
|
| 9 |
+
from sqlalchemy.orm import Mapped, mapped_column, relationship
|
| 10 |
+
|
| 11 |
+
from app.db.base import Base
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class UserRole(str, enum.Enum):
|
| 15 |
+
admin = "admin"
|
| 16 |
+
analyst = "analyst"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class User(Base):
|
| 20 |
+
__tablename__ = "users"
|
| 21 |
+
|
| 22 |
+
user_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
|
| 23 |
+
username: Mapped[str] = mapped_column(String(100), unique=True, nullable=False)
|
| 24 |
+
email: Mapped[str] = mapped_column(String(255), unique=True, nullable=False)
|
| 25 |
+
password_hash: Mapped[str] = mapped_column(String(255), nullable=False)
|
| 26 |
+
role: Mapped[UserRole] = mapped_column(
|
| 27 |
+
Enum(UserRole), nullable=False, default=UserRole.analyst
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
jobs: Mapped[list["MappingJob"]] = relationship(back_populates="user")
|
murshid_backend/app/repositories/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Repository layer — thin DB access wrappers."""
|
murshid_backend/app/repositories/job_repo.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CRUD for MappingJob table."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from datetime import datetime, timezone
|
| 6 |
+
|
| 7 |
+
from sqlalchemy.orm import Session
|
| 8 |
+
|
| 9 |
+
from app.models.mapping_job import JobStatus, MappingJob
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def create_job(db: Session, *, user_id: int, file_name: str, rules_count: int = 0) -> MappingJob:
    """Insert a new MappingJob in `pending` state and flush so its PK is assigned.

    The job starts at progress 0 with a UTC creation timestamp; the caller
    owns the transaction (only a flush is performed, no commit).
    """
    created_at = datetime.now(tz=timezone.utc)
    new_job = MappingJob(
        user_id=user_id,
        file_name=file_name,
        rules_count=rules_count,
        status=JobStatus.pending,
        progress=0,
        timestamp=created_at,
    )
    db.add(new_job)
    db.flush()  # assigns the autoincrement job_id without committing
    return new_job
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def update_job_status(
    db: Session,
    job_id: int,
    *,
    status: JobStatus,
    progress: int | None = None,
) -> MappingJob | None:
    """Set a job's status (and optionally its progress).

    Returns the updated MappingJob, or None when `job_id` does not exist.
    Changes are flushed but not committed.
    """
    target = db.get(MappingJob, job_id)
    if target is not None:
        target.status = status
        # progress=None means "leave the current value alone"
        if progress is not None:
            target.progress = progress
        db.flush()
    return target
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def get_job(db: Session, job_id: int) -> MappingJob | None:
    """Fetch a MappingJob by primary key; None when it does not exist."""
    job = db.get(MappingJob, job_id)
    return job
|
murshid_backend/app/repositories/rule_repo.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CRUD for Rule and RuleTechniqueMapping tables."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
from sqlalchemy.orm import Session
|
| 9 |
+
|
| 10 |
+
from app.models.rule import Rule
|
| 11 |
+
from app.models.rule_technique_mapping import RuleTechniqueMapping
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def upsert_rule(
    db: Session,
    *,
    rule_id: str,
    job_id: int | None = None,
    embedding: np.ndarray | None = None,
) -> Rule:
    """Fetch-or-create the Rule row for `rule_id`, optionally updating fields.

    `job_id` and `embedding` are only written when supplied (None means
    "leave unchanged"). The embedding is persisted as a JSON array string.
    Flushes but does not commit.
    """
    existing = db.get(Rule, rule_id)
    if existing is None:
        existing = Rule(rule_id=rule_id)
        db.add(existing)
    if job_id is not None:
        existing.job_id = job_id
    if embedding is not None:
        # Serialize the vector to JSON text so it is portable across backends.
        existing.embedding_vector = json.dumps(embedding.tolist())
    db.flush()
    return existing
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def save_technique_mappings(
    db: Session,
    *,
    rule_id: str,
    results: list[dict],
) -> list[RuleTechniqueMapping]:
    """
    Persist ALL (rule_id, technique_id, confidence_score) rows sorted by confidence.
    Deletes existing mappings first so re-runs are idempotent.
    Saves ALL techniques (not just detected ones) so Figure 4-11 can show Top 5.
    """
    # Wipe any previous mappings for this rule so repeated runs are idempotent.
    db.query(RuleTechniqueMapping).filter(
        RuleTechniqueMapping.rule_id == rule_id
    ).delete(synchronize_session=False)

    ordered = sorted(results, key=lambda item: item["confidence_percent"], reverse=True)

    rows = [
        RuleTechniqueMapping(
            rule_id=rule_id,
            technique_id=entry["technique_id"],
            # Convert the 0-100 percentage into a 0-1 score for storage.
            confidence_score=entry["confidence_percent"] / 100.0,
        )
        for entry in ordered
    ]
    db.add_all(rows)
    db.flush()
    return rows
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def get_mappings_for_rule(
    db: Session, rule_id: str
) -> list[RuleTechniqueMapping]:
    """All technique mappings for one rule, highest confidence first."""
    query = db.query(RuleTechniqueMapping)
    query = query.filter(RuleTechniqueMapping.rule_id == rule_id)
    query = query.order_by(RuleTechniqueMapping.confidence_score.desc())
    return query.all()
|
murshid_backend/app/repositories/template_repo.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CRUD for Technique and QueryTemplate tables."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from sqlalchemy.orm import Session
|
| 6 |
+
|
| 7 |
+
from app.models.query_template import QueryTemplate
|
| 8 |
+
from app.models.technique import Technique
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# --------------------------------------------------------------------------
|
| 12 |
+
# Techniques
|
| 13 |
+
# --------------------------------------------------------------------------
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_or_create_technique(
    db: Session, *, technique_id: str, technique_name: str = "", tactic: str | None = None
) -> Technique:
    """Return the Technique for `technique_id`, inserting a new row if none exists.

    When no display name is supplied, the ID itself is used as the name.
    Always flushes (matching the write path) but never commits.
    """
    technique = db.get(Technique, technique_id)
    if technique is None:
        fields = {
            "technique_id": technique_id,
            # Fall back to the raw ID when the caller gave no name.
            "technique_name": technique_name or technique_id,
            "tactic": tactic,
        }
        technique = Technique(**fields)
        db.add(technique)
    db.flush()
    return technique
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def get_technique(db: Session, technique_id: str) -> Technique | None:
    """Primary-key lookup; None when the technique is not stored."""
    found = db.get(Technique, technique_id)
    return found
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# --------------------------------------------------------------------------
|
| 36 |
+
# Query templates
|
| 37 |
+
# --------------------------------------------------------------------------
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def get_templates_for_technique(
    db: Session, technique_id: str
) -> list[QueryTemplate]:
    """Active query templates attached to one technique (inactive rows excluded)."""
    query = (
        db.query(QueryTemplate)
        .filter(QueryTemplate.technique_id == technique_id)
        .filter(QueryTemplate.is_active.is_(True))
    )
    return query.all()
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def create_template(
    db: Session,
    *,
    technique_id: str,
    purpose: str | None,
    wql_query: str,
    note: str | None,
) -> QueryTemplate:
    """Insert a new QueryTemplate (active by default) and flush to assign its PK."""
    template = QueryTemplate(
        technique_id=technique_id,
        purpose=purpose,
        wql_query=wql_query,
        note=note,
        is_active=True,
    )
    db.add(template)
    db.flush()  # populate the autoincrement id without committing
    return template
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def update_template(
    db: Session,
    template_id: int,
    *,
    purpose: str | None = None,
    wql_query: str | None = None,
    note: str | None = None,
    is_active: bool | None = None,
) -> QueryTemplate | None:
    """Partially update a QueryTemplate; None-valued kwargs are left untouched.

    Returns the updated template, or None when `template_id` does not exist.
    Note: fields cannot be cleared to None through this function by design.
    """
    template = db.get(QueryTemplate, template_id)
    if template is None:
        return None
    # Apply only the fields the caller actually provided, in declaration order.
    changes = {
        "purpose": purpose,
        "wql_query": wql_query,
        "note": note,
        "is_active": is_active,
    }
    for attr, value in changes.items():
        if value is not None:
            setattr(template, attr, value)
    db.flush()
    return template
|
murshid_backend/app/schemas/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Pydantic schemas for API request/response validation."""
|