diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..ae6cd4f88468716a796742c05e7d196422013b20
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,18 @@
+__pycache__
+*.pyc
+*.pyo
+.venv
+venv
+**/.env
+.env.local
+*.db
+*.log
+.git
+.gitignore
+*.zip
+MurshidBackend_Colab.ipynb
+MurshidBackend_Colab_Report.md
+interface_pictures/
+murshid_backend/.venv
+murshid_backend/__pycache__
+murshid_backend/TECHNICAL_REPORT.md
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..7fbd2c18033f804f7f89e0fab8e904767abfb0ca
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,3 @@
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.xlsx filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..9a17f72da2c6ac77675830444782013dde54f79e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+__pycache__/
+*.pyc
+*.pyo
+.venv/
+venv/
+*.db
+*.log
+**/.env
+.env.local
+murshid_backend_for_drive.zip
+interface_pictures/
diff --git a/DEPLOY_GUIDE.md b/DEPLOY_GUIDE.md
new file mode 100644
index 0000000000000000000000000000000000000000..43852eb112797ba2966ce91250449d6ad09ec459
--- /dev/null
+++ b/DEPLOY_GUIDE.md
@@ -0,0 +1,103 @@
+# 🚀 دليل النشر على Hugging Face Spaces
+
+## المتطلبات
+- ุญุณุงุจ ุนูู [Hugging Face](https://huggingface.co/) (ู
ุฌุงูู)
+- [Git](https://git-scm.com/) ู
ุซุจูุช ุนูู ุฌูุงุฒู
+
+---
+
+## ุงูุฎุทูุงุช
+
+### 1. ุฅูุดุงุก Space ุฌุฏูุฏ
+
+1. ุงุฐูุจ ุฅูู: https://huggingface.co/new-space
+2. **Space name**: `murshid`
+3. **SDK**: ุงุฎุชุฑ **Docker**
+4. **Visibility**: Public (ู
ุฌุงูู) ุฃู Private
+5. ุงุถุบุท **Create Space**
+
+### 2. ุฑูุน ุงูู
ุดุฑูุน
+
+```powershell
+cd d:\murishd
+
+# ุชููุฆุฉ Git (ุฅุฐุง ูู
ููู ู
ูุฌูุฏุงู)
+git init
+
+# ุฅุถุงูุฉ ุงูู remote (ุบููุฑ YOUR_USERNAME ุจุงุณู
ุญุณุงุจู)
+git remote add space https://huggingface.co/spaces/YOUR_USERNAME/murshid
+
+# ุฅุถุงูุฉ ุงูู
ููุงุช ูุงูุฑูุน
+git lfs install
+git add .
+git commit -m "Initial deployment"
+git push space main
+```
+
+> โ ๏ธ ุฅุฐุง ุทูุจ ููู
ุฉ ู
ุฑูุฑุ ุงุณุชุฎุฏู
**Access Token** ู
ู:
+> https://huggingface.co/settings/tokens
+
+### 3. ุฅุนุฏุงุฏ ุงูู
ุชุบูุฑุงุช ุงูุจูุฆูุฉ (Secrets)
+
+ุงุฐูุจ ุฅูู ุฅุนุฏุงุฏุงุช ุงูู Space: `Settings โ Variables and secrets`
+
+ุฃุถู ูุฐู ุงูู
ุชุบูุฑุงุช:
+
+| ุงูุงุณู
| ุงูููู
ุฉ | ุงูููุน |
+|-------|--------|-------|
+| `MURSHID_DB_URL` | `sqlite:////app/data/murshid.db` | Variable |
+| `MURSHID_MODELS_DIR` | `/app/Needed` | Variable |
+| `MURSHID_SKIP_LLM` | `true` | Variable |
+| `SECRET_KEY` | (ุงุฎุชุฑ ููู
ุฉ ุณุฑ ุนุดูุงุฆูุฉ) | **Secret** |
+| `HF_TOKEN` | (ุงุฎุชูุงุฑู โ ูู ุชุจุบู Llama) | **Secret** |
+
+### 4. ุงูุชุธุฑ ุงูุจูุงุก
+
+- HF Spaces ูุจูู ุงูู Docker image ุชููุงุฆูุงู
+- ูุฃุฎุฐ **3-5 ุฏูุงุฆู** ููุจูุงุก ุงูุฃูู
+- ุจุนุฏ ุงููุฌุงุญุ ุงูุฑุงุจุท ูููู:
+ ```
+ https://YOUR_USERNAME-murshid.hf.space
+ ```
+
+---
+
+## ุงูุฑูุงุจุท ุจุนุฏ ุงููุดุฑ
+
+| ุงูุฑุงุจุท | ุงููุตู |
+|--------|-------|
+| `https://YOUR_USERNAME-murshid.hf.space` | ุงููุงุฌูุฉ ุงูุฑุฆูุณูุฉ |
+| `https://YOUR_USERNAME-murshid.hf.space/docs` | ุชูุซูู Swagger |
+| `https://YOUR_USERNAME-murshid.hf.space/health` | ูุญุต ุงูุญุงูุฉ |
+
+---
+
+## ู
ูุงุญุธุงุช
+
+### ุงููุถุน ุงูุญุงูู (LITE mode)
+- ุงูู
ุดุฑูุน ููุดุฑ ุจูุถุน **LITE** (ุจุฏูู torch/SecureBERT+)
+- ุชุญููู ุงูููุงุนุฏ ูุนู
ู ููู ุจุฏูุฉ ุฃูู (embeddings ุนุดูุงุฆูุฉ)
+- ู
ูุงุณุจ ูุงุฎุชุจุงุฑ ุงููุงุฌูุฉ ูุงูู API
+
+### ููุชุฑููุฉ ุฅูู LOCAL mode (SecureBERT+ ุจุฏูู Llama)
+ุนุฏูู `Dockerfile` ูุฃุฒู ุงูุชุนููู ู
ู ุณุทุฑ torch:
+```dockerfile
+RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && pip install --no-cache-dir transformers sentencepiece
+```
+> โ ๏ธ ูุฐุง ูุฒูุฏ ุญุฌู
ุงูุตูุฑุฉ ~800MB ููุญุชุงุฌ ุฐุงูุฑุฉ ุฃูุซุฑ
+
+### ููุชุฑููุฉ ุฅูู FULL mode (ู
ุน Llama 3)
+- ุบููุฑ ุงูู Space ุฅูู **GPU (T4)** ู
ู ุงูุฅุนุฏุงุฏุงุช ($0.60/ุณุงุนุฉ)
+- ุนุฏูู `MURSHID_SKIP_LLM=false`
+- ุฃุถู `HF_TOKEN` ูู ุงูู Secrets
+- ุงุณุชุฎุฏู
`requirements.txt` ุงููุงู
ู ุจุฏู `requirements_light.txt`
+
+---
+
+## ุงุณุชูุดุงู ุงูุฃุฎุทุงุก
+
+| ุงูู
ุดููุฉ | ุงูุญู |
+|---------|------|
+| Build ูุดู | ุชุญูู ู
ู ุงูู Logs ูู ุชุจููุจ ุงูู Space |
+| 502 Bad Gateway | ุงูุชุธุฑ ุฏูููุฉ โ ุงูุฎุงุฏู
ูุจุฏุฃ |
+| DB ุฎุทุฃ | ุชุญูู ู
ู `MURSHID_DB_URL` ูู ุงูู
ุชุบูุฑุงุช |
+| Frontend ูุง ูุชุตู | ุงูู BASE URL ุฃุตุจุญ ุชููุงุฆู (`window.location.origin`) |
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6fab11db62f4f3e595d30e761e30284c3cdf321c
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,49 @@
+FROM python:3.11-slim
+
+# System deps
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ build-essential libxml2-dev libxslt1-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Create non-root user (HF Spaces requirement)
+RUN useradd -m -u 1000 appuser
+
+WORKDIR /app
+
+# Copy requirements first for layer caching
+COPY murshid_backend/requirements_light.txt ./requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt \
+ && pip install --no-cache-dir openpyxl aiofiles scikit-learn
+
+# Optional: install torch CPU-only for LOCAL mode (SecureBERT+ embeddings)
+# Uncomment the next line if you want LOCAL mode (adds ~800MB to image)
+# RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && pip install --no-cache-dir transformers sentencepiece
+
+# Copy backend code
+COPY murshid_backend/ ./murshid_backend/
+
+# Copy model files
+COPY Needed/ ./Needed/
+
+# Copy frontend
+COPY murshid_frontend/ ./murshid_frontend/
+
+# Create writable directory for SQLite DB
+RUN mkdir -p /app/data && chown -R appuser:appuser /app
+
+# Setup environment
+ENV MURSHID_DB_URL=sqlite:////app/data/murshid.db
+ENV MURSHID_MODELS_DIR=/app/Needed
+ENV MURSHID_SKIP_LLM=true
+ENV SECRET_KEY=murshid_hf_space_2026
+ENV PORT=7860
+
+# Run DB migrations + import templates + start server
+COPY start.sh ./start.sh
+RUN chmod +x start.sh
+
+USER appuser
+
+EXPOSE 7860
+
+CMD ["./start.sh"]
diff --git a/MurshidBackend_Colab.ipynb b/MurshidBackend_Colab.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..447346989a45fec55b47f11fe7405490972d70a1
--- /dev/null
+++ b/MurshidBackend_Colab.ipynb
@@ -0,0 +1,967 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# ๐ก๏ธ Murshid Backend โ Full Mode on Colab\n",
+ "\n",
+ "**ู
ูุฑุดูุฏ | From Alerts to Guidance: MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts**\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## ๐ ุงูู
ููุงุช ุงูู
ุทููุจุฉ ุนูู Google Drive\n",
+ "\n",
+ "```\n",
+ "MyDrive/\n",
+ "โโโ murshid_backend_for_drive.zip โ ุงุฑูุนูู ุซู
ุดุบููู ุงูุฎููุฉ 2b ูุงุณุชุฎุฑุงุฌู\n",
+ "โ ุฃู\n",
+ "โโโ murshid_backend/ โ ุฅุฐุง ุงุณุชุฎุฑุฌุชู ู
ุณุจูุงู\n",
+ "โ โโโ app/\n",
+ "โ โโโ alembic/\n",
+ "โ โโโ scripts/\n",
+ "โ โโโ alembic.ini\n",
+ "โ โโโ requirements.txt\n",
+ "โ\n",
+ "โโโ Needed/\n",
+ " โโโ murshid_logreg_pipeline_manual_oof_pcatuned.joblib\n",
+ " โโโ murshid_logreg_thresholds_manual_oof_pcatuned.npy\n",
+ " โโโ murshid_label_columns.json\n",
+ " โโโ murshid_query_template_structure_clean_shared.xlsx\n",
+ "```\n",
+ "\n",
+ "## ุชุนููู
ุงุช ุงูุชุดุบูู\n",
+ "\n",
+ "### ุงูู
ุชุทูุจุงุช ูุจู ุงูุชุดุบูู\n",
+ "1. โ
**GPU ู
ููุนููู:** `Runtime โ Change runtime type โ T4 GPU`\n",
+ "2. โ
**Google Drive ู
ูุชููุตู** (ูุญุชูู ู
ุฌูุฏ `Needed` ุจู
ููุงุช ุงููู
ุงุฐุฌ)\n",
+ "3. โ
**ู
ุฌูุฏ `murshid_backend`** ุนูู Drive ุฃู ุฑูุนู ูุฏููุงู\n",
+ "\n",
+ "### ุงูู
ููุงุช ุงูู
ุทููุจุฉ ูู Google Drive\n",
+ "```\n",
+ "MyDrive/\n",
+ "โโโ Needed/\n",
+ "โ โโโ murshid_logreg_pipeline_manual_oof_pcatuned.joblib\n",
+ "โ โโโ murshid_logreg_thresholds_manual_oof_pcatuned.npy\n",
+ "โ โโโ murshid_label_columns.json\n",
+ "โ โโโ murshid_query_template_structure_clean_shared.xlsx\n",
+ "โโโ murshid_backend/ โ ู
ุฌูุฏ ุงูุจุงููุฏ ูุงู
ูุงู\n",
+ "```\n",
+ "\n",
+ "### ุชุฑุชูุจ ุงูุชุดุบูู\n",
+ "**ุดุบููู ุงูุฎูุงูุง ุจุงูุชุฑุชูุจ ู
ู ุงูุฃุนูู ููุฃุณูู โ ูุง ุชุชุฎุทูู ุฃู ุฎููุฉ**\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ุงูุฎููุฉ 1: ุงูุชุญูู ู
ู GPU\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "\n",
+ "print('CUDA available:', torch.cuda.is_available())\n",
+ "if torch.cuda.is_available():\n",
+ " print('GPU:', torch.cuda.get_device_name(0))\n",
+ " print('Memory:', round(torch.cuda.get_device_properties(0).total_memory / 1e9, 1), 'GB')\n",
+ "else:\n",
+ " print('โ ๏ธ ูุง ููุฌุฏ GPU โ ุบููุฑู Runtime ุฅูู T4 ู
ู ุงููุงุฆู
ุฉ ุฃุนูุงู')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ุงูุฎููุฉ 2: ุชุญู
ูู Google Drive\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ุงูุฎููุฉ 3: ุชุฌููุฒ ุงูุจุงููุฏ ูู /content\n",
+ "\n",
+ "> ุชููู
ูุฐู ุงูุฎููุฉ ุชููุงุฆูุงู ุจู:\n",
+ "> 1. ุงุณุชุฎุฑุงุฌ ZIP ู
ู Drive (ุฅุฐุง ูุงู ZIP ู
ูุฌูุฏุงู ููู
ููุณุชุฎุฑุฌ ุจุนุฏ)\n",
+ "> 2. ูุณุฎ ู
ุฌูุฏ `murshid_backend` ุฅูู `/content` (ุฃุณุฑุน ูููุฑุงุกุฉ)\n",
+ "> 3. ุถุจุท Python path\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print('(ูุฐู ุงูุฎููุฉ ูุงุฑุบุฉ โ ุงูููุฏ ุงูุชูู ุฅูู ุงูุฎููุฉ 3 ุฃุฏูุงู)')\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from google.colab import drive\n",
+ "import os\n",
+ "\n",
+ "drive.mount('/content/drive')\n",
+ "\n",
+ "# โ๏ธ ุนุฏููู ูุฐุง ุงูู
ุณุงุฑ ุฅุฐุง ูุงู ู
ุฌูุฏู ู
ุฎุชููุงู\n",
+ "NEEDED_PATH = '/content/drive/MyDrive/Needed'\n",
+ "BACKEND_PATH = '/content/drive/MyDrive/murshid_backend'\n",
+ "ZIP_PATH = '/content/drive/MyDrive/murshid_backend_for_drive.zip'\n",
+ "\n",
+ "print('=' * 55)\n",
+ "print('๐ Checking Google Drive files...')\n",
+ "print('=' * 55)\n",
+ "\n",
+ "# โโ ุงูุชุญูู ู
ู ู
ููุงุช Needed โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "print('\\n๐ Needed/ (model files):')\n",
+ "required_files = {\n",
+ " 'murshid_logreg_pipeline_manual_oof_pcatuned.joblib': 'LogReg model',\n",
+ " 'murshid_logreg_thresholds_manual_oof_pcatuned.npy': 'LogReg thresholds',\n",
+ " 'murshid_label_columns.json': 'Technique names',\n",
+ "}\n",
+ "\n",
+ "models_ok = True\n",
+ "for fname, desc in required_files.items():\n",
+ " path = f'{NEEDED_PATH}/{fname}'\n",
+ " exists = os.path.isfile(path)\n",
+ " size = f'{os.path.getsize(path)/1024:.0f} KB' if exists else ''\n",
+ " status = 'โ
' if exists else 'โ'\n",
+ " print(f' {status} {fname} {size}')\n",
+ " if not exists:\n",
+ " models_ok = False\n",
+ "\n",
+ "excel_path = f'{NEEDED_PATH}/murshid_query_template_structure_clean_shared.xlsx'\n",
+ "excel_ok = os.path.isfile(excel_path)\n",
+ "print(f' {\"โ
\" if excel_ok else \"โ ๏ธ \"} murshid_query_template_structure_clean_shared.xlsx (optional)')\n",
+ "\n",
+ "# โโ ุงูุชุญูู ู
ู ุงูุจุงููุฏ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "print('\\n๐ murshid_backend/ (backend code):')\n",
+ "backend_ok = os.path.isdir(BACKEND_PATH)\n",
+ "zip_ok = os.path.isfile(ZIP_PATH)\n",
+ "\n",
+ "if backend_ok:\n",
+ " fcount = sum(len(f) for _, _, f in os.walk(BACKEND_PATH))\n",
+ " print(f' โ
murshid_backend/ ({fcount} files)')\n",
+ "elif zip_ok:\n",
+ " zsize = f'{os.path.getsize(ZIP_PATH)/1024:.0f} KB'\n",
+ " print(f' ๐ฆ murshid_backend_for_drive.zip ({zsize}) โ ุณููุณุชุฎุฑุฌ ุชููุงุฆูุงู ูู ุงูุฎููุฉ 3')\n",
+ "else:\n",
+ " print(f' โ murshid_backend/ ุบูุฑ ู
ูุฌูุฏ')\n",
+ " print(f' โ murshid_backend_for_drive.zip ุบูุฑ ู
ูุฌูุฏ')\n",
+ " print(f'\\n โ ๏ธ ุงุฑูุนู murshid_backend_for_drive.zip ุฅูู:')\n",
+ " print(f' Google Drive โ My Drive')\n",
+ "\n",
+ "# โโ ู
ูุฎุต โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "print('\\n' + '=' * 55)\n",
+ "if models_ok and (backend_ok or zip_ok):\n",
+ " print('โ
ูู ุดูุก ุฌุงูุฒ โ ุชุงุจุนู ุชุดุบูู ุงูุฎูุงูุง')\n",
+ "elif not models_ok:\n",
+ " print('โ ู
ููุงุช ุงููู
ุงุฐุฌ ู
ูููุฏุฉ ู
ู Needed/ โ ูุฌุจ ุฑูุนูุง ุฃููุงู')\n",
+ "else:\n",
+ " print('โ ู
ููุงุช ุงูุจุงููุฏ ู
ูููุฏุฉ โ ุงุฑูุนู ZIP ุฃููุงู')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ุงูุฎููุฉ 3: ูุณุฎ ุงูุจุงููุฏ ุฅูู /content\n",
+ "\n",
+ "> ูุณุฎ ุงูู
ููุงุช ู
ู Drive ุฅูู `/content` ูุชุณุฑูุน ุงููุฑุงุกุฉ\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import shutil, os, zipfile, sys\n",
+ "\n",
+ "DRIVE_BASE = '/content/drive/MyDrive'\n",
+ "ZIP_PATH = f'{DRIVE_BASE}/murshid_backend_for_drive.zip'\n",
+ "BACKEND_DRIVE= f'{DRIVE_BASE}/murshid_backend'\n",
+ "BACKEND_LOCAL= '/content/murshid_backend'\n",
+ "\n",
+ "# โโ ุงูุฎุทูุฉ 1: ุงุณุชุฎุฑุงุฌ ZIP ู
ู Drive ุฅุฐุง ูุฒู
โโโโโโโโโโโโโโโโโโโโ\n",
+ "if not os.path.isdir(BACKEND_DRIVE):\n",
+ " if os.path.isfile(ZIP_PATH):\n",
+ " print(f'๐ฆ ZIP found โ extracting to Drive...')\n",
+ " with zipfile.ZipFile(ZIP_PATH, 'r') as z:\n",
+ " z.extractall(DRIVE_BASE)\n",
+ " print(f'โ
Extracted to {BACKEND_DRIVE}')\n",
+ " else:\n",
+ " print('โ ERROR: ู
ุฌูุฏ murshid_backend ุบูุฑ ู
ูุฌูุฏ ุนูู Drive')\n",
+ " print(f' ุงูู
ุทููุจ: {BACKEND_DRIVE}')\n",
+ " print(f' ุฃู ุฑูุน: {ZIP_PATH}')\n",
+ " raise FileNotFoundError(f'Backend not found. Upload murshid_backend_for_drive.zip to Google Drive MyDrive.')\n",
+ "else:\n",
+ " print(f'โ
murshid_backend found on Drive: {BACKEND_DRIVE}')\n",
+ "\n",
+ "# โโ ุงูุฎุทูุฉ 2: ูุณุฎ ุฅูู /content (ุฃุณุฑุน ุจูุซูุฑ ู
ู Drive ุฃุซูุงุก ุงูุชุดุบูู) โ\n",
+ "if os.path.exists(BACKEND_LOCAL):\n",
+ " shutil.rmtree(BACKEND_LOCAL)\n",
+ "\n",
+ "shutil.copytree(\n",
+ " BACKEND_DRIVE,\n",
+ " BACKEND_LOCAL,\n",
+ " ignore=shutil.ignore_patterns('__pycache__', '*.pyc', '.venv', '*.db', '*.log')\n",
+ ")\n",
+ "\n",
+ "# โโ ุงูุฎุทูุฉ 3: ุฅุถุงูุฉ ููู Python path โโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "if BACKEND_LOCAL not in sys.path:\n",
+ " sys.path.insert(0, BACKEND_LOCAL)\n",
+ "\n",
+ "os.chdir(BACKEND_LOCAL)\n",
+ "\n",
+ "# โโ ุชุญูู โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "file_count = sum(len(files) for _, _, files in os.walk(BACKEND_LOCAL))\n",
+ "print(f'โ
Backend ready at {BACKEND_LOCAL} ({file_count} files)')\n",
+ "print(f'โ
Working dir: {os.getcwd()}')\n",
+ "\n",
+ "# ุนุฑุถ ุงููููู\n",
+ "print('\\nStructure:')\n",
+ "for item in sorted(os.listdir(BACKEND_LOCAL)):\n",
+ " full = os.path.join(BACKEND_LOCAL, item)\n",
+ " if os.path.isdir(full):\n",
+ " sub_count = len(os.listdir(full))\n",
+ " print(f' ๐ {item}/ ({sub_count} items)')\n",
+ " else:\n",
+ " size = os.path.getsize(full)\n",
+ " print(f' ๐ {item} ({size:,} bytes)')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ุงูุฎููุฉ 4: ุชุซุจูุช ุงูู
ุชุทูุจุงุช\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print('๐ฆ Installing requirements...')\n",
+ "\n",
+ "# โโ ุงูุญุฒู
ุงูุฃุณุงุณูุฉ ููุจุงููุฏ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "!pip install -q \\\n",
+ " fastapi==0.115.0 \\\n",
+ " \"uvicorn[standard]==0.32.0\" \\\n",
+ " pydantic==2.9.0 \\\n",
+ " pydantic-settings==2.6.0 \\\n",
+ " python-dotenv==1.0.0 \\\n",
+ " sqlalchemy==2.0.0 \\\n",
+ " alembic==1.13.0 \\\n",
+ " aiofiles \\\n",
+ " scikit-learn==1.6.1 \\\n",
+ " joblib \\\n",
+ " lxml \\\n",
+ " openpyxl \\\n",
+ " nest-asyncio \\\n",
+ " pyngrok\n",
+ "\n",
+ "# โโ bitsandbytes: ู
ุทููุจ ูุชุญู
ูู LLaMA ุจู 4-bit ุนูู GPU โโโโโโโโโ\n",
+ "print('๐ฆ Installing bitsandbytes (required for LLaMA 4-bit)...')\n",
+ "!pip install -q -U \"bitsandbytes>=0.46.1\"\n",
+ "\n",
+ "# โโ accelerate: ู
ุทููุจ ูู device_map=\"auto\" โโโโโโโโโโโโโโโโโโโโ\n",
+ "!pip install -q -U accelerate\n",
+ "\n",
+ "# โโ ุชุญูู ู
ู ุงูุชุซุจูุช โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "import importlib\n",
+ "for pkg in ['bitsandbytes', 'accelerate', 'fastapi', 'sklearn']:\n",
+ " try:\n",
+ " mod = importlib.import_module(pkg if pkg != 'sklearn' else 'sklearn')\n",
+ " ver = getattr(mod, '__version__', '?')\n",
+ " print(f' โ
{pkg}=={ver}')\n",
+ " except ImportError:\n",
+ " print(f' โ {pkg} โ ูุดู ุงูุชุซุจูุช')\n",
+ "\n",
+ "print('\\nโ
All requirements installed')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ุงูุฎููุฉ 5: ุฅุนุฏุงุฏ ู
ูู .env\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "# โ๏ธ ุถุนู HF Token ููุง ุฅุฐุง ูู
ุชูุถูููู ุนุจุฑ Colab Secrets\n",
+ "HF_TOKEN = os.environ.get('HF_TOKEN', 'ุงุฏุฎู ุงูุชููู')\n",
+ "\n",
+ "env_content = f\"\"\"# Auto-generated .env for Colab FULL mode\n",
+ "MURSHID_DB_URL=sqlite:////content/murshid.db\n",
+ "MURSHID_MODELS_DIR={NEEDED_PATH}\n",
+ "HF_TOKEN={HF_TOKEN}\n",
+ "MURSHID_SKIP_LLM=false\n",
+ "SECRET_KEY=murshid_colab_2026\n",
+ "LLAMA_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct\n",
+ "EMBED_MODEL_ID=ehsanaghaei/SecureBERT_Plus\n",
+ "LOGREG_JOBLIB=murshid_logreg_pipeline_manual_oof_pcatuned.joblib\n",
+ "LOGREG_THRESHOLDS_NPY=murshid_logreg_thresholds_manual_oof_pcatuned.npy\n",
+ "LABEL_COLUMNS_JSON=murshid_label_columns.json\n",
+ "\"\"\"\n",
+ "\n",
+ "env_path = '/content/murshid_backend/.env'\n",
+ "with open(env_path, 'w') as f:\n",
+ " f.write(env_content)\n",
+ "\n",
+ "print('โ
.env created at', env_path)\n",
+ "print('\\nContents:')\n",
+ "with open(env_path) as f:\n",
+ " for line in f:\n",
+ " if 'TOKEN' in line or 'SECRET' in line:\n",
+ " key = line.split('=')[0]\n",
+ " print(f' {key}=****')\n",
+ " else:\n",
+ " print(' ', line.rstrip())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ุงูุฎููุฉ 6: ุชูุฌูุฑ ูุงุนุฏุฉ ุงูุจูุงูุงุช (Alembic)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import subprocess, os\n",
+ "\n",
+ "os.chdir('/content/murshid_backend')\n",
+ "\n",
+ "result = subprocess.run(\n",
+ " ['python', '-m', 'alembic', 'upgrade', 'head'],\n",
+ " capture_output=True, text=True\n",
+ ")\n",
+ "\n",
+ "print(result.stdout)\n",
+ "if result.stderr:\n",
+ " print(result.stderr)\n",
+ "\n",
+ "import os\n",
+ "db_exists = os.path.isfile('/content/murshid.db')\n",
+ "print('โ
Database ready:', '/content/murshid.db' if db_exists else 'โ ูู
ูููุดุฃ')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ุงูุฎููุฉ 7: ุงุณุชูุฑุงุฏ ููุงูุจ WQL ู
ู Excel\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.insert(0, '/content/murshid_backend')\n",
+ "os.chdir('/content/murshid_backend')\n",
+ "\n",
+ "excel_path = f'{NEEDED_PATH}/murshid_query_template_structure_clean_shared.xlsx'\n",
+ "\n",
+ "if os.path.isfile(excel_path):\n",
+ " from app.db.session import SessionLocal\n",
+ " from scripts.import_excel_templates import run as import_excel\n",
+ "\n",
+ " db = SessionLocal()\n",
+ " try:\n",
+ " result = import_excel(db, replace=False)\n",
+ " print('โ
Excel import result:')\n",
+ " for k, v in result.items():\n",
+ " print(f' {k}: {v}')\n",
+ " finally:\n",
+ " db.close()\n",
+ "else:\n",
+ " print(f'โ ๏ธ Excel file not found at: {excel_path}')\n",
+ " print(' ูู
ููู ุงูู
ุชุงุจุนุฉ โ ุงูููุงูุจ ุณุชูุถุงู ูุงุญูุงู ูุฏููุงู')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ุงูุฎููุฉ 8: ุชุดุบูู FastAPI + ngrok\n",
+ "\n",
+ "> โณ ูุฐู ุงูุฎููุฉ ุชุฃุฎุฐ **5-10 ุฏูุงุฆู** ูุชุญู
ูู LLaMA (4.5GB) ู SecureBERT+\n",
+ "\n",
+ "> ๐ **ุงูุฑุงุจุท ุงูุนุงู
ุณูุธูุฑ ูู ุงูููุงูุฉ** โ ุงูุณุฎูู ูููุฑููุช\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import subprocess, time, os, sys, urllib.request\n",
+ "import nest_asyncio\n",
+ "nest_asyncio.apply()\n",
+ "\n",
+ "os.chdir('/content/murshid_backend')\n",
+ "\n",
+ "# โโโ ุงูุชุญูู ู
ู bitsandbytes ูุจู ุชุดุบูู ุงูุฎุงุฏู
โโโโโโโโโโโโโโโโโ\n",
+ "try:\n",
+ " import bitsandbytes as bnb\n",
+ " print(f'โ
bitsandbytes {bnb.__version__}')\n",
+ "except ImportError:\n",
+ " print('โ bitsandbytes ุบูุฑ ู
ุซุจูุช โ ุดุบููู ุงูุฎููุฉ 4 ุฃููุงู')\n",
+ " raise\n",
+ "\n",
+ "# โโโ ุชุดุบูู uvicorn โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "log_path = '/content/murshid_server.log'\n",
+ "log_file = open(log_path, 'w')\n",
+ "\n",
+ "server_proc = subprocess.Popen(\n",
+ " [\n",
+ " 'python', '-m', 'uvicorn', 'app.main:app',\n",
+ " '--host', '0.0.0.0',\n",
+ " '--port', '8000',\n",
+ " '--log-level', 'info'\n",
+ " ],\n",
+ " cwd='/content/murshid_backend',\n",
+ " stdout=log_file,\n",
+ " stderr=subprocess.STDOUT\n",
+ ")\n",
+ "\n",
+ "print('โณ Loading LLaMA 3 8B + SecureBERT+...')\n",
+ "print(' ุฌุงุฑู ุงูุชุญู
ูู โ ุงูุชุธุฑู ุญุชู ุชุธูุฑ ุงูุฑุณุงูุฉ ุงูููุงุฆูุฉ')\n",
+ "\n",
+ "# โโโ ุงูุชุธุงุฑ ุฐูู ู
ุน ุนุฑุถ ุงูููุฌ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "started = False\n",
+ "last_log_size = 0\n",
+ "\n",
+ "for i in range(180): # 15 ุฏูููุฉ ูุญุฏ ุฃูุตู\n",
+ " time.sleep(5)\n",
+ "\n",
+ " # ุชุญูู ุฅุฐุง ุจุฏุฃ ุงูุฎุงุฏู
\n",
+ " try:\n",
+ " resp = urllib.request.urlopen('http://localhost:8000/health', timeout=3)\n",
+ " if resp.status == 200:\n",
+ " started = True\n",
+ " break\n",
+ " except Exception:\n",
+ " pass\n",
+ "\n",
+ " # ุนุฑุถ ุงูููุฌ ุงูุฌุฏูุฏ ูู 30 ุซุงููุฉ\n",
+ " if i % 6 == 0:\n",
+ " elapsed = (i + 1) * 5\n",
+ " log_file.flush()\n",
+ " try:\n",
+ " with open(log_path) as f:\n",
+ " log_content = f.read()\n",
+ " new_content = log_content[last_log_size:]\n",
+ " last_log_size = len(log_content)\n",
+ "\n",
+ " # ุชุญูู ู
ู ุฎุทุฃ ู
ุจูุฑ\n",
+ " if 'ERROR' in new_content or 'ImportError' in new_content:\n",
+ " print(f'\\nโ ุฎุทุฃ ูู ุงูุฎุงุฏู
ุนูุฏ {elapsed}s:')\n",
+ " # ุนุฑุถ ุขุฎุฑ 1000 ุญุฑู ู
ู ุงูููุฌ\n",
+ " print(log_content[-1500:])\n",
+ " server_proc.terminate()\n",
+ " log_file.close()\n",
+ " raise RuntimeError('Server failed to start. See log above.')\n",
+ "\n",
+ " # ุนุฑุถ ู
ุง ุชู
ุชุญู
ููู\n",
+ " if 'Loaded' in new_content or 'loaded' in new_content or 'Application' in new_content:\n",
+ " for line in new_content.strip().split('\\n'):\n",
+ " if any(k in line for k in ['INFO', 'Loaded', 'loaded', 'Application', 'WARNING']):\n",
+ " print(f' {line.strip()}')\n",
+ " else:\n",
+ " mins = elapsed // 60\n",
+ " secs = elapsed % 60\n",
+ " print(f' โณ {mins}m {secs}s โ ูุฌุฑู ุชุญู
ูู ุงููู
ุงุฐุฌ...')\n",
+ " except RuntimeError:\n",
+ " raise\n",
+ " except Exception:\n",
+ " print(f' โณ {elapsed}s elapsed...')\n",
+ "\n",
+ "log_file.flush()\n",
+ "log_file.close()\n",
+ "\n",
+ "if not started:\n",
+ " print('\\nโ Server did not start after 15 minutes.')\n",
+ " print('โโโ ุขุฎุฑ ุณุทูุฑ ุงูููุฌ โโโ')\n",
+ " with open(log_path) as f:\n",
+ " print(f.read()[-3000:])\n",
+ "else:\n",
+ " print('\\nโ
Server started successfully!')\n",
+ "\n",
+ " # โโโ Cloudflare Tunnel (ู
ุฌุงูู โ ุจุฏูู ุญุณุงุจ) โโโโโโโโโโโโโโโโโโ\n",
+ " import subprocess, re, threading, time\n",
+ "\n",
+ " # ุชุซุจูุช cloudflared\n",
+ " subprocess.run(\n",
+ " ['wget', '-q', 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64',\n",
+ " '-O', '/usr/local/bin/cloudflared'],\n",
+ " check=True\n",
+ " )\n",
+ " subprocess.run(['chmod', '+x', '/usr/local/bin/cloudflared'], check=True)\n",
+ " print('โ
cloudflared installed')\n",
+ "\n",
+ " # ุชุดุบูู ุงูููู\n",
+ " cf_log = open('/content/cloudflared.log', 'w')\n",
+ " cf_proc = subprocess.Popen(\n",
+ " ['cloudflared', 'tunnel', '--url', 'http://localhost:8000'],\n",
+ " stdout=cf_log, stderr=subprocess.STDOUT\n",
+ " )\n",
+ "\n",
+ " # ุงูุชุธุงุฑ ุธููุฑ ุงูุฑุงุจุท ูู ุงูููุฌ\n",
+ " public_url = None\n",
+ " for _ in range(30):\n",
+ " time.sleep(2)\n",
+ " cf_log.flush()\n",
+ " try:\n",
+ " with open('/content/cloudflared.log') as f:\n",
+ " content = f.read()\n",
+ " match = re.search(r'https://[a-z0-9\\-]+\\.trycloudflare\\.com', content)\n",
+ " if match:\n",
+ " public_url = match.group(0)\n",
+ " break\n",
+ " except Exception:\n",
+ " pass\n",
+ "\n",
+ " if public_url:\n",
+ " print('\\n' + '='*60)\n",
+ " print('๐ PUBLIC URL (ุงูุฑุงุจุท ุงูุนุงู
โ Cloudflare):')\n",
+ " print(f' {public_url}')\n",
+ " print('='*60)\n",
+ " print(f'๐ Swagger: {public_url}/docs')\n",
+ " print(f'๐ Health: {public_url}/health')\n",
+ " print(f'๐๏ธ DB Summary: {public_url}/api/db/summary')\n",
+ " print('='*60)\n",
+ " print('\\n๐ ุงูุณุฎู ูุฐุง ุงูุณุทุฑ ูุงูุตููู ูู ุงููุฑููุช (index.html):')\n",
+ " print(f\" const BASE = '{public_url}';\")\n",
+ " else:\n",
+ " print('โ ๏ธ Cloudflare tunnel URL not found, check /content/cloudflared.log')\n",
+ " with open('/content/cloudflared.log') as f:\n",
+ " print(f.read()[-1000:])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# โโโ ุชุดุบูู Cloudflare Tunnel ุจุดูู ู
ููุตู (ุฅุฐุง ูุดู ู
ุน ุงูุฎููุฉ 8) โ\n",
+ "# ุดุบููู ูุฐู ุงูุฎููุฉ ููุท ุฅุฐุง ูุงู ุงูุฎุงุฏู
ูุนู
ู ููู ุงูู tunnel ูุดู\n",
+ "\n",
+ "import subprocess, re, time, os\n",
+ "\n",
+ "# ุชุซุจูุช cloudflared ุฅุฐุง ูู
ููุซุจููุช\n",
+ "if not os.path.isfile('/usr/local/bin/cloudflared'):\n",
+ " subprocess.run(\n",
+ " ['wget', '-q',\n",
+ " 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64',\n",
+ " '-O', '/usr/local/bin/cloudflared'],\n",
+ " check=True\n",
+ " )\n",
+ " subprocess.run(['chmod', '+x', '/usr/local/bin/cloudflared'], check=True)\n",
+ " print('โ
cloudflared installed')\n",
+ "else:\n",
+ " print('โ
cloudflared already installed')\n",
+ "\n",
+ "# ุชุดุบูู ุงูููู\n",
+ "cf_log_path = '/content/cloudflared.log'\n",
+ "cf_log = open(cf_log_path, 'w')\n",
+ "cf_proc = subprocess.Popen(\n",
+ " ['cloudflared', 'tunnel', '--url', 'http://localhost:8000'],\n",
+ " stdout=cf_log, stderr=subprocess.STDOUT\n",
+ ")\n",
+ "\n",
+ "print('โณ Opening Cloudflare tunnel...')\n",
+ "\n",
+ "public_url = None\n",
+ "for _ in range(30):\n",
+ " time.sleep(2)\n",
+ " cf_log.flush()\n",
+ " try:\n",
+ " with open(cf_log_path) as f:\n",
+ " content = f.read()\n",
+ " match = re.search(r'https://[a-z0-9\\-]+\\.trycloudflare\\.com', content)\n",
+ " if match:\n",
+ " public_url = match.group(0)\n",
+ " break\n",
+ " except Exception:\n",
+ " pass\n",
+ "\n",
+ "if public_url:\n",
+ " print('\\n' + '='*60)\n",
+ " print(f'๐ PUBLIC URL: {public_url}')\n",
+ " print(f'๐ Swagger: {public_url}/docs')\n",
+ " print(f'๐ Health: {public_url}/health')\n",
+ " print('='*60)\n",
+ " print('\\n๐ ุงูุตูู ูุฐุง ุงูุณุทุฑ ูู index.html:')\n",
+ " print(f\" const BASE = '{public_url}';\")\n",
+ "else:\n",
+ " print('โ ูู
ููุนุซุฑ ุนูู URL. ุงูููุฌ:')\n",
+ " with open(cf_log_path) as f:\n",
+ " print(f.read())\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ุงูุฎููุฉ 9: ุฑุจุท ุงููุฑููุช ุจู Cloudflare URL\n",
+ "\n",
+ "ุจุนุฏ ุชุดุบูู ุงูุฎููุฉ ุงูุณุงุจูุฉุ ุณุชุธูุฑ ุฑุณุงูุฉ ู
ุซู:\n",
+ "```\n",
+ "๐ PUBLIC URL: https://xxxx-xxxx.trycloudflare.com\n",
+ "```\n",
+ "\n",
+ "**ุงูุฎููุฉ ุฃุฏูุงู ุชูุญุฏูุซ ุงููุฑููุช ุชููุงุฆูุงู** โ ุฃู ูู
ููู ุงูุชุนุฏูู ูุฏููุงู ูู `index.html`:\n",
+ "```javascript\n",
+ "const BASE = 'https://xxxx-xxxx.trycloudflare.com';\n",
+ "```\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import subprocess, re, time, os\n",
+ "\n",
+ "# โโ ุงูุฎุทูุฉ 1: ุชุซุจูุช cloudflared โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "if not os.path.isfile('/usr/local/bin/cloudflared'):\n",
+ " subprocess.run([\n",
+ " 'wget', '-q',\n",
+ " 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64',\n",
+ " '-O', '/usr/local/bin/cloudflared'\n",
+ " ], check=True)\n",
+ " subprocess.run(['chmod', '+x', '/usr/local/bin/cloudflared'], check=True)\n",
+ " print('โ
cloudflared installed')\n",
+ "else:\n",
+ " print('โ
cloudflared ready')\n",
+ "\n",
+ "# โโ ุงูุฎุทูุฉ 2: ุชุดุบูู ุงูููู โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "cf_log_path = '/content/cf.log'\n",
+ "cf_log = open(cf_log_path, 'w')\n",
+ "subprocess.Popen(\n",
+ " ['cloudflared', 'tunnel', '--url', 'http://localhost:8000'],\n",
+ " stdout=cf_log, stderr=subprocess.STDOUT\n",
+ ")\n",
+ "\n",
+ "print('โณ Opening Cloudflare tunnel...')\n",
+ "\n",
+ "# โโ ุงูุฎุทูุฉ 3: ุงูุชุธุงุฑ ุงูุฑุงุจุท โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "public_url = None\n",
+ "for _ in range(30):\n",
+ " time.sleep(2)\n",
+ " cf_log.flush()\n",
+ " with open(cf_log_path) as f:\n",
+ " content = f.read()\n",
+ " match = re.search(r'https://[a-z0-9\\-]+\\.trycloudflare\\.com', content)\n",
+ " if match:\n",
+ " public_url = match.group(0)\n",
+ " break\n",
+ "\n",
+ "if not public_url:\n",
+ " print('โ Tunnel failed. Log:')\n",
+ " with open(cf_log_path) as f: print(f.read())\n",
+ "else:\n",
+ " # โโ ุงูุฎุทูุฉ 4: ุชุญุฏูุซ index.html ุชููุงุฆูุงู โโโโโโโโโโโโโโโโโ\n",
+ " frontend_path = '/content/drive/MyDrive/murshid_frontend/index.html'\n",
+ "\n",
+ " if os.path.isfile(frontend_path):\n",
+ " with open(frontend_path, 'r', encoding='utf-8') as f:\n",
+ " html = f.read()\n",
+ " html_updated = re.sub(r\"const BASE = '[^']*';\",\n",
+ " f\"const BASE = '{public_url}';\", html)\n",
+ " with open(frontend_path, 'w', encoding='utf-8') as f:\n",
+ " f.write(html_updated)\n",
+ " print(f'โ
index.html updated automatically')\n",
+ " else:\n",
+ " print(f'โ ๏ธ index.html not found โ ุนุฏูููู ูุฏููุงู')\n",
+ "\n",
+ " print('\\n' + '='*60)\n",
+ " print(f'๐ PUBLIC URL: {public_url}')\n",
+ " print(f'๐ Swagger: {public_url}/docs')\n",
+ " print(f'๐ Health: {public_url}/health')\n",
+ " print(f'๐ฅ๏ธ Frontend: {public_url}/index.html')\n",
+ " print('='*60)\n",
+ " print(f\"\\n๐ const BASE = '{public_url}';\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ุงูุฎููุฉ 10: ุงุฎุชุจุงุฑ ุงูู API\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import urllib.request, json\n",
+ "\n",
+ "# โโโ Health Check โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "with urllib.request.urlopen('http://localhost:8000/health') as r:\n",
+ " health = json.load(r)\n",
+ "\n",
+ "print('=== Health Check ===')\n",
+ "print(f\" status: {health['status']}\")\n",
+ "print(f\" pipeline_mode: {health['pipeline_mode']}\")\n",
+ "print(f\" llama_loaded: {health['components']['llama_loaded']}\")\n",
+ "print(f\" embedder_loaded: {health['components']['embedder_loaded']}\")\n",
+ "print(f\" logreg_loaded: {health['components']['logreg_loaded']}\")\n",
+ "print(f\" cuda_available: {health['components']['cuda_available']}\")\n",
+ "\n",
+ "mode = health.get('pipeline_mode', 'unknown')\n",
+ "if mode == 'full':\n",
+ " print('\\nโ
FULL mode โ ูุชุงุฆุฌ ู
ุทุงุจูุฉ 100% ููุฏูุชุฑ')\n",
+ "elif mode == 'local':\n",
+ " print('\\nโ ๏ธ LOCAL mode โ LLaMA ูู
ููุญู
ูููุ ุชุญููู ู
ู MURSHID_SKIP_LLM=false')\n",
+ "else:\n",
+ " print('\\nโ LITE mode โ ุชุญููู ู
ู ุชุซุจูุช torch ูุงููู
ุงุฐุฌ')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# โโโ ุชุญููู ูุงุนุฏุฉ ุงุฎุชุจุงุฑ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "import urllib.request, json\n",
+ "\n",
+ "test_rule = '''\n",
+ " 18201\n",
+ " ^634$|^4730$\n",
+ " Windows: Security Enabled Global Group Deleted\n",
+ " T1484\n",
+ " group_deleted,win_group_deleted\n",
+ "'''\n",
+ "\n",
+ "payload = json.dumps({'rule_xml': test_rule}).encode()\n",
+ "req = urllib.request.Request(\n",
+ " 'http://localhost:8000/rules/analyze',\n",
+ " data=payload,\n",
+ " headers={'Content-Type': 'application/json'},\n",
+ " method='POST'\n",
+ ")\n",
+ "\n",
+ "with urllib.request.urlopen(req) as r:\n",
+ " result = json.load(r)\n",
+ "\n",
+ "print('=== Analyze Result ===')\n",
+ "print(f\" rule_id: {result['rule_id']}\")\n",
+ "print(f\" pipeline_mode: {result['pipeline_mode']}\")\n",
+ "print(f\" summary: {result['summary']}\")\n",
+ "print(f\"\\n TOP 5 Techniques:\")\n",
+ "print(f\" {'Technique':<15} {'Conf%':>8} {'Proba':>8} {'Thr':>6} {'Gap':>8} {'Pred':>6}\")\n",
+ "print(f\" {'-'*55}\")\n",
+ "for r in result['all_results'][:5]:\n",
+ " pred = 'โ
' if r['predicted'] else ' '\n",
+ " print(f\" {pred} {r['technique_id']:<13} {r['confidence_percent']:>7.2f}%\"\n",
+ " f\" {r['proba']:>8.4f} {r['threshold']:>6.2f} {r['gap']:>+8.4f}\")\n",
+ "\n",
+ "print(f\"\\n Detected: {len(result['detected'])} technique(s)\")\n",
+ "for d in result['detected']:\n",
+ " print(f\" โ
{d['technique_id']} โ {d['confidence_percent']}%\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# โโโ ููุงูุจ WQL ููุชูููุฉ ุงูู
ูุชุดูุฉ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "if result['detected']:\n",
+ " top_technique = result['detected'][0]['technique_id']\n",
+ "\n",
+ " with urllib.request.urlopen(f'http://localhost:8000/queries/{top_technique}') as r:\n",
+ " queries = json.load(r)\n",
+ "\n",
+ " print(f'=== WQL Templates for {top_technique} ===')\n",
+ " for i, q in enumerate(queries, 1):\n",
+ " print(f\"\\n [{i}] {q.get('purpose', 'N/A')}\")\n",
+ " print(f\" Query: {q['wql_query'][:120]}...\")\n",
+ " print(f\" Note: {q.get('note', 'N/A')}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ุงูุฎููุฉ 11: ุชุตุฏูุฑ ุงููุชุงุฆุฌ (ุงุฎุชูุงุฑู)\n",
+ "\n",
+ "ูุญูุธ ุงููุชุงุฆุฌ ุจุตูุบุฉ JSON ูุงุณุชุฎุฏุงู
ูุง ูุงุญูุงู ุนูู ุงูุฌูุงุฒ ุงูู
ุญูู\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# โโโ ุชุญููู ูุงุฆู
ุฉ ู
ู ุงูููุงุนุฏ ูุชุตุฏูุฑูุง โโโโโโโโโโโโโโโโโโโโโโโ\n",
+ "import urllib.request, json, os\n",
+ "\n",
+ "# โ๏ธ ุฃุถููู Rule IDs ุงูุชู ุชุฑูุฏูู ุชุญููููุง\n",
+ "# ูู
ููู ูุฑุงุกุชูุง ู
ู ู
ูู\n",
+ "test_ids_path = f'{NEEDED_PATH}/test_rule_ids.json'\n",
+ "\n",
+ "if os.path.isfile(test_ids_path):\n",
+ " with open(test_ids_path) as f:\n",
+ " rule_ids = json.load(f)\n",
+ " print(f'Loaded {len(rule_ids)} rule IDs from test_rule_ids.json')\n",
+ "else:\n",
+ " # ููุงุนุฏ ุชุฌุฑูุจูุฉ\n",
+ " rule_ids = ['18205']\n",
+ " print('Using default test rule')\n",
+ "\n",
+ "print(f'Processing {len(rule_ids)} rules...')\n",
+ "\n",
+ "export_results = []\n",
+ "\n",
+ "for rule_id in rule_ids:\n",
+ " try:\n",
+ " with urllib.request.urlopen(f'http://localhost:8000/results/{rule_id}') as r:\n",
+ " data = json.load(r)\n",
+ " data['source'] = 'colab_full_mode'\n",
+ " export_results.append(data)\n",
+ " detected = len(data.get('detected', []))\n",
+ " top = data['mappings'][0] if data['mappings'] else {}\n",
+ " print(f\" โ
{rule_id}: {top.get('technique_id','?')} ({top.get('confidence_percent','?')}%) โ {detected} detected\")\n",
+ " except Exception as e:\n",
+ " print(f\" โ ๏ธ {rule_id}: {e}\")\n",
+ "\n",
+ "# ุญูุธ ุงููุชุงุฆุฌ\n",
+ "export_path = f'{NEEDED_PATH}/murshid_full_results.json'\n",
+ "with open(export_path, 'w', encoding='utf-8') as f:\n",
+ " json.dump(export_results, f, ensure_ascii=False, indent=2)\n",
+ "\n",
+ "print(f'\\nโ
Exported {len(export_results)} results to:')\n",
+ "print(f' {export_path}')\n",
+ "print('\\nูู
ููู ุงูุขู ุงุณุชูุฑุงุฏ ูุฐุง ุงูู
ูู ูู ุงูุจุงููุฏ ุงูู
ุญูู')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ุงูุฎููุฉ 12: ุฅููุงู ุงูุฎุงุฏู
(ุนูุฏ ุงูุงูุชูุงุก)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# ุฅููุงู ุงูุฎุงุฏู
ูุฅุบูุงู ngrok\n",
+ "try:\n",
+ " from pyngrok import ngrok\n",
+ " ngrok.kill()\n",
+ " print('โ
ngrok tunnel closed')\n",
+ "except Exception:\n",
+ " pass\n",
+ "\n",
+ "try:\n",
+ " server_proc.terminate()\n",
+ " print('โ
Server stopped')\n",
+ "except Exception:\n",
+ " pass"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "## ู
ูุงุญุธุงุช ู
ูู
ุฉ\n",
+ "\n",
+ "### ุฅุฐุง ุงููุทุน ุงูุงุชุตุงู ุจู Colab\n",
+ "- ุงูุฎุงุฏู
ูุชููู ุชููุงุฆูุงู\n",
+ "- ุฃุนูุฏู ุชุดุบูู ุงูุฎูุงูุง ู
ู ุงูุฎููุฉ 8\n",
+ "- ุฑุงุจุท ngrok ุณูุชุบููุฑ โ ุนุฏููู ุงููุฑููุช ุจุงูุฑุงุจุท ุงูุฌุฏูุฏ\n",
+ "\n",
+ "### ุฅุฐุง ุธูุฑ ุฎุทุฃ ูู LLaMA\n",
+ "- ุชุฃูุฏู ุฃู ูุฏูู ุตูุงุญูุฉ ุงููุตูู ูููู
ูุฐุฌ: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct\n",
+ "- ุชุฃูุฏู ู
ู ุตุญุฉ HF_TOKEN\n",
+ "\n",
+ "### ุงูู
ูุงุฑูุฉ ู
ุน ุงูุฌูุงุฒ ุงูู
ุญูู\n",
+ "| | Colab (FULL) | ุงูุฌูุงุฒ ุงูู
ุญูู (LOCAL) |\n",
+ "|--|-------------|----------------------|\n",
+ "| LLaMA | โ
| โ |\n",
+ "| T1484 confidence | **94.76%** | 89.29% |\n",
+ "| ุงููุฑุงุฑ ุงูููุงุฆู | T1484 โ
| T1484 โ
|\n",
+ "\n",
+ "### ููุนุฑุถ ุงูุชูุฏูู
ู\n",
+ "1. ุดุบููู ุงูุฎูุงูุง 1-8 ู
ุณุจูุงู (ูุจู ุงูุนุฑุถ ุจู 15 ุฏูููุฉ)\n",
+ "2. ุงูุณุฎู ุฑุงุจุท ngrok\n",
+ "3. ุนุฏููู ุงููุฑููุช\n",
+ "4. ุงูุชุญู `https://xxxx.ngrok-free.app/index.html`\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "machine_shape": "hm",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/MurshidBackend_Colab_Report.md b/MurshidBackend_Colab_Report.md
new file mode 100644
index 0000000000000000000000000000000000000000..8365bffddb4ec81a48ef79b3a9a415ede46f3de7
--- /dev/null
+++ b/MurshidBackend_Colab_Report.md
@@ -0,0 +1,545 @@
+# ุชูุฑูุฑ ุชููู: ุขููุฉ ุนู
ู MurshidBackend_Colab.ipynb
+
+## ู
ุดุฑูุน ู
ูุฑุดูุฏ | From Alerts to Guidance
+### MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts
+
+---
+
+## 1. ูุธุฑุฉ ุนุงู
ุฉ
+
+`MurshidBackend_Colab.ipynb` ูู ุฏูุชุฑ Jupyter ู
ูุตู
ููู
ูุชุดุบูู ุงูุจุงููุฏ ุงููุงู
ู ูู
ุดุฑูุน ู
ูุฑุดูุฏ ุนูู ุจูุฆุฉ **Google Colab** ุจุงุณุชุฎุฏุงู
**GPU (Tesla T4)**ุ ู
ู
ุง ููุชูุญ ุชุดุบูู ูู
ูุฐุฌ **LLaMA 3 8B** ุจุชูู
ูู
4-bit ูุชูููุฏ ู
ูุฎุตุงุช ุฏูุงููุฉ ุบููุฉ ูููุงุนุฏ Wazuh XMLุ ูุฐูู ุนูู ุนูุณ ุงูุจูุฆุฉ ุงูู
ุญููุฉ ุงูุชู ุชุนู
ู ุจุฏูู LLaMA (LOCAL mode).
+
+### ุงููุฏู ุงูุฑุฆูุณู
+ุชุดุบูู **FULL mode** ููู pipeline:
+```
+ูุงุนุฏุฉ Wazuh XML
+ โ
+ LLaMA 3 8B โโโ ู
ูุฎุต ุฏูุงูู ุบูู (GPU)
+ โ
+ SecureBERT+ โโโ 768-dim embedding
+ โ
+ Logistic Regression โโโ confidence scores ููู ุชูููุฉ
+ โ
+ FastAPI + SQLite โโโ ุชุฎุฒูู ูุฎุฏู
ุฉ ุงููุชุงุฆุฌ
+ โ
+ Cloudflare Tunnel โโโ ุฑุงุจุท ุนุงู
ูููุฑููุช
+```
+
+---
+
+## 2. ุงูู
ุชุทูุจุงุช ูุจู ุงูุชุดุบูู
+
+### 2.1 ุฅุนุฏุงุฏ Google Colab
+| ุงูู
ุชุทูุจ | ุงูุชูุงุตูู |
+|---------|----------|
+| **GPU** | Tesla T4 โ ูููุนููู ู
ู: `Runtime โ Change runtime type โ T4 GPU` |
+| **ุงูุฐุงูุฑุฉ** | High RAM (machine_shape: "hm") |
+| **ุงูุฅูุชุฑูุช** | ู
ูุนููู ูุชูุฒูู ุงููู
ุงุฐุฌ ู
ู Hugging Face |
+
+### 2.2 ุงูู
ููุงุช ุงูู
ุทููุจุฉ ุนูู Google Drive
+```
+MyDrive/
+โโโ murshid_backend_for_drive.zip โ ู
ููุงุช ุงูุจุงููุฏ ู
ุถุบูุทุฉ (44 KB)
+โ ุฃู
+โโโ murshid_backend/ โ ุงูู
ุฌูุฏ ู
ุณุชุฎุฑุฌ ู
ุณุจูุงู
+โ โโโ app/
+โ โ โโโ main.py
+โ โ โโโ config.py
+โ โ โโโ api/routes/
+โ โ โโโ ml/
+โ โ โโโ models/
+โ โ โโโ services/
+โ โ โโโ repositories/
+โ โโโ alembic/
+โ โโโ scripts/
+โ โโโ alembic.ini
+โ โโโ requirements.txt
+โ
+โโโ Needed/
+ โโโ murshid_logreg_pipeline_manual_oof_pcatuned.joblib โ ูู
ูุฐุฌ LogReg
+ โโโ murshid_logreg_thresholds_manual_oof_pcatuned.npy โ ุนุชุจุงุช ุงูุชูุจุค
+ โโโ murshid_label_columns.json โ ุฃุณู
ุงุก ุงูุชูููุงุช ุงูู 20
+ โโโ murshid_query_template_structure_clean_shared.xlsx โ 60 ูุงูุจ WQL
+```
+
+### 2.3 Hugging Face Token
+ู
ุทููุจ ูููุตูู ุฅูู ูู
ูุฐุฌ `meta-llama/Meta-Llama-3-8B-Instruct`:
+- ููุถุงู ูู `Colab Secrets` ุจุงุณู
`HF_TOKEN`
+- ุฃู ู
ุจุงุดุฑุฉู ูู ุฎููุฉ 5 ู
ู ุงูุฏูุชุฑ
+
+---
+
+## 3. ุดุฑุญ ุงูุฎูุงูุง ุจุงูุชูุตูู
+
+### ุงูุฎููุฉ 1: ุงูุชุญูู ู
ู GPU
+
+**ุงููุฏู:** ุงูุชุฃูุฏ ู
ู ูุฌูุฏ GPU ูุจู ุงูุจุฏุก.
+
+```python
+import torch
+print('CUDA available:', torch.cuda.is_available())
+print('GPU:', torch.cuda.get_device_name(0))
+print('Memory:', round(torch.cuda.get_device_properties(0).total_memory / 1e9, 1), 'GB')
+```
+
+**ุงูู
ุฎุฑุฌ ุงูู
ุชููุน:**
+```
+CUDA available: True
+GPU: Tesla T4
+Memory: 15.8 GB
+```
+
+**ู
ุงุฐุง ูุญุฏุซ ุฅุฐุง ูู
ููู ููุงู GPUุ**
+- LLaMA ูู ููุญู
ููู (ูุญุชุงุฌ CUDA)
+- ุงูุฎุงุฏู
ุณูุนู
ู ุจู LOCAL mode ููุท (ุจุฏูู ุชูุฎูุต)
+
+---
+
+### ุงูุฎููุฉ 2: ุชุญู
ูู Google Drive ูุงูุชุญูู ู
ู ุงูู
ููุงุช
+
+**ุงููุฏู:** ุฑุจุท Colab ุจู Google Drive ูุงูุชุญูู ู
ู ูุฌูุฏ ุฌู
ูุน ุงูู
ููุงุช ุงูู
ุทููุจุฉ.
+
+```python
+from google.colab import drive
+drive.mount('/content/drive')
+
+NEEDED_PATH = '/content/drive/MyDrive/Needed'
+BACKEND_PATH = '/content/drive/MyDrive/murshid_backend'
+ZIP_PATH = '/content/drive/MyDrive/murshid_backend_for_drive.zip'
+```
+
+**ู
ุง ูุชุญูู ู
ูู:**
+| ุงูู
ูู | ุงูููุน | ุงูุญุงูุฉ |
+|-------|-------|--------|
+| `murshid_logreg_pipeline_manual_oof_pcatuned.joblib` | ุฅูุฒุงู
ู | โ
/ โ |
+| `murshid_logreg_thresholds_manual_oof_pcatuned.npy` | ุฅูุฒุงู
ู | โ
/ โ |
+| `murshid_label_columns.json` | ุฅูุฒุงู
ู | โ
/ โ |
+| `murshid_query_template_structure_clean_shared.xlsx` | ุงุฎุชูุงุฑู | โ
/ โ ๏ธ |
+| `murshid_backend/` ุฃู `.zip` | ุฅูุฒุงู
ู | โ
/ โ |
+
+---
+
+### ุงูุฎููุฉ 3: ุชุฌููุฒ ุงูุจุงููุฏ ูู /content
+
+**ุงููุฏู:** ููู ู
ููุงุช ุงูุจุงููุฏ ู
ู Drive ุฅูู `/content` ูุชุณุฑูุน ุงููุฑุงุกุฉ (Drive ุฃุจุทุฃ ูู I/O).
+
+**ุงูู
ูุทู ุงูุฐูู:**
+```
+ูู murshid_backend/ ู
ูุฌูุฏ ุนูู Driveุ
+ โ ูุนู
โ ุงูุณุฎ ู
ุจุงุดุฑุฉู ุฅูู /content
+ โ ูุง
+ูู murshid_backend_for_drive.zip ู
ูุฌูุฏุ
+ โ ูุนู
โ ุงุณุชุฎุฑุฌู ุฅูู Drive ุฃููุงู ุซู
ุงูุณุฎ
+ โ ูุง
+โ โ ุฎุทุฃ: "ุงุฑูุนู ZIP ุฅูู Google Drive"
+```
+
+**ุงูุฎุทูุงุช ุงูู
ููููุฐุฉ:**
+1. **ุงุณุชุฎุฑุงุฌ ZIP** (ุฅุฐุง ูุฒู
) ุฅูู `MyDrive/`
+2. **ูุณุฎ** `murshid_backend/` ุฅูู `/content/murshid_backend/` (ุจุฏูู pycache ูู
ููุงุช ู
ุคูุชุฉ)
+3. **ุฅุถุงูุฉ** `/content/murshid_backend` ุฅูู `sys.path`
+4. **ุชุบููุฑ** working directory ุฅูู `/content/murshid_backend`
+
+**ูู
ุงุฐุง ุงููุณุฎ ุฅูู /contentุ**
+- Drive ูุนุชู
ุฏ ุนูู FUSE mount = ุจุทูุก ูููุฑุงุกุฉ ุงูู
ุชูุฑุฑุฉ
+- `/content` ุนูู SSD ู
ุญูู ููู VM = ุฃุณุฑุน ุจู 5-10x
+
+---
+
+### ุงูุฎููุฉ 4: ุชุซุจูุช ุงูู
ุชุทูุจุงุช
+
+**ุงููุฏู:** ุชุซุจูุช ุฌู
ูุน ุงูู
ูุชุจุงุช ุงููุงุฒู
ุฉ ูุชุดุบูู ุงูุจุงููุฏ.
+
+**ุงูู
ูุชุจุงุช ุงูู
ุซุจููุชุฉ:**
+
+| ุงูู
ูุชุจุฉ | ุงูุฅุตุฏุงุฑ | ุงูุบุฑุถ |
+|---------|---------|--------|
+| `fastapi` | 0.115.0 | ุฅุทุงุฑ API |
+| `uvicorn` | 0.32.0 | ุฎุงุฏู
ASGI |
+| `pydantic` | 2.9.0 | ุชุญูู ู
ู ุงูุจูุงูุงุช |
+| `sqlalchemy` | 2.0.0 | ORM |
+| `alembic` | 1.13.0 | ูุฌุฑุฉ DB |
+| `scikit-learn` | **1.6.1** | ูู
ูุฐุฌ LogReg (ูุทุงุจู ุจูุฆุฉ ุงูุชุฏุฑูุจ) |
+| `bitsandbytes` | โฅ0.46.1 | ุชูู
ูู
LLaMA 4-bit |
+| `accelerate` | ุขุฎุฑ ูุณุฎุฉ | `device_map="auto"` ููู GPU |
+| `openpyxl` | ุขุฎุฑ ูุณุฎุฉ | ูุฑุงุกุฉ ู
ูู Excel |
+| `lxml` | ุขุฎุฑ ูุณุฎุฉ | ู
ุนุงูุฌุฉ XML |
+| `pyngrok` | ุขุฎุฑ ูุณุฎุฉ | (ุงุญุชูุงุทู โ ุบูุฑ ู
ุณุชุฎุฏู
) |
+
+> **ู
ูุงุญุธุฉ ู
ูู
ุฉ:** `scikit-learn==1.6.1` ู
ุญุฏููุฏ ุจุฏูุฉ ูุฃู ู
ููุงุช joblib ุฏูุฑููุจุช ุจูุฐู ุงููุณุฎุฉ โ ุงุณุชุฎุฏุงู
ูุณุฎุฉ ู
ุฎุชููุฉ ูููุชุฌ ุชุญุฐูุฑุงุช `InconsistentVersionWarning`.
+
+---
+
+### ุงูุฎููุฉ 5: ุฅุนุฏุงุฏ ู
ูู .env
+
+**ุงููุฏู:** ุฅูุดุงุก ู
ูู ุงูุฅุนุฏุงุฏุงุช ูุชุดุบูู FULL mode.
+
+**ู
ุญุชูู ุงูู
ูู ุงูู
ูููููุฏ:**
+```env
+MURSHID_DB_URL=sqlite:////content/murshid.db
+MURSHID_MODELS_DIR=/content/drive/MyDrive/Needed
+HF_TOKEN=****
+MURSHID_SKIP_LLM=false โ ู
ูุชุงุญ FULL mode
+SECRET_KEY=murshid_colab_2026
+LLAMA_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct
+EMBED_MODEL_ID=ehsanaghaei/SecureBERT_Plus
+LOGREG_JOBLIB=murshid_logreg_pipeline_manual_oof_pcatuned.joblib
+LOGREG_THRESHOLDS_NPY=murshid_logreg_thresholds_manual_oof_pcatuned.npy
+LABEL_COLUMNS_JSON=murshid_label_columns.json
+```
+
+**ุงููุฑู ุจูู FULL ู LOCAL mode:**
+| ุงูู
ุชุบูุฑ | FULL mode | LOCAL mode |
+|---------|-----------|------------|
+| `MURSHID_SKIP_LLM` | `false` | `true` |
+| LLaMA ููุญู
ูููุ | โ
ูุนู
| โ ูุง |
+| ุฌูุฏุฉ ุงูุชูุฎูุต | ุนุงููุฉ | ุงููุตู ุงูุฎุงู
ููุท |
+| T1484 confidence (ู
ุซุงู) | **94.76%** | 89.29% |
+
+---
+
+### ุงูุฎููุฉ 6: ุชูุฌูุฑ ูุงุนุฏุฉ ุงูุจูุงูุงุช (Alembic)
+
+**ุงููุฏู:** ุฅูุดุงุก ุฌุฏุงูู ูุงุนุฏุฉ ุงูุจูุงูุงุช SQLite.
+
+```bash
+python -m alembic upgrade head
+```
+
+**ุงูุฌุฏุงูู ุงูู
ููุดุฃุฉ (ู
ู migration 0001):**
+
+| ุงูุฌุฏูู | ุงูุบุฑุถ | ู
ุตุฏุฑู ูู ุงูุชูุฑูุฑ |
+|--------|--------|-----------------|
+| `users` | ู
ุณุชุฎุฏู
ู ุงููุธุงู
(admin/analyst) | ER Diagram ยง3.2.6 |
+| `mapping_jobs` | ูุธุงุฆู ู
ุนุงูุฌุฉ ู
ููุงุช ุงูููุงุนุฏ | ER Diagram ยง3.2.6 |
+| `rules` | ููุงุนุฏ Wazuh ุงูู
ูุญููููุฉ | ER Diagram ยง3.2.6 |
+| `techniques` | ุชูููุงุช MITRE ATT&CK | ER Diagram ยง3.2.6 |
+| `rule_technique_mappings` | ุฑุจุท ุงูููุงุนุฏ ุจุงูุชูููุงุช + confidence | ER Diagram ยง3.2.6 |
+| `query_templates` | ููุงูุจ WQL ููุชุญููู | ER Diagram ยง3.2.6 |
+
+> **ู
ูุงุญุธุฉ:** ูุงุนุฏุฉ ุงูุจูุงูุงุช ูู `/content/murshid.db` โ ุชููุดุฃ ู
ู ุฌุฏูุฏ ูู ูู ุฌูุณุฉ Colab.
+
+---
+
+### ุงูุฎููุฉ 7: ุงุณุชูุฑุงุฏ ููุงูุจ WQL ู
ู Excel
+
+**ุงููุฏู:** ุชุญู
ูู 60 ูุงูุจ WQL ู
ู ู
ูู Excel ุฅูู ูุงุนุฏุฉ ุงูุจูุงูุงุช.
+
+**ุงูุจูุงูุงุช ุงูู
ุณุชูุฑุฏุฉ:**
+
+| ุงูุฅุญุตุงุฆูุฉ | ุงูููู
ุฉ |
+|-----------|--------|
+| ุฅุฌู
ุงูู ุงูุชูููุงุช | 20 ุชูููุฉ |
+| ุฅุฌู
ุงูู ุงูููุงูุจ | 60 ูุงูุจ (3 ููู ุชูููุฉ) |
+| ุงูุชูููุงุช ุงูู
ุดู
ููุฉ | T1047, T1055, T1059.001, T1070.004, T1078, T1083, T1095, T1098, T1105, T1110, T1112, T1114, T1176, T1190, T1484, T1498, T1499, T1529, T1531, T1562.001 |
+
+**ู
ุซุงู ุนูู ูุงูุจ WQL (T1484):**
+```
+Template 1: Host pivot
+ agent.name:${HOST} AND win.system.eventID:(4728 OR 4729 ...) AND @timestamp:[now-24h TO now]
+
+Template 2: Actor pivot
+ win.eventdata.SubjectUserName:${USER} AND win.system.eventID:(...) AND @timestamp:[now-24h TO now]
+
+Template 3: High-impact target change
+ win.system.eventID:(...) AND win.eventdata.TargetUserName:("Domain Admins" OR ...) AND @timestamp:[now-24h TO now]
+```
+
+**ู
ูุน ุงูุชูุฑุงุฑ:**
+- ูุชุญูู ู
ู ูุฌูุฏ (`technique_id` + `purpose`) ูุจู ุงูุฅุถุงูุฉ
+- `replace=False` ุจุดูู ุงูุชุฑุงุถู (ูุง ููุนูุฏ ุงููุชุงุจุฉ)
+
+---
+
+### ุงูุฎููุฉ 8: ุชุดุบูู FastAPI + Cloudflare Tunnel
+
+**ุงููุฏู:** ุงูุฎููุฉ ุงูุฑุฆูุณูุฉ โ ุชูุดุบูู ุงูุจุงููุฏ ูุชููุดุฆ ุฑุงุจุทุงู ุนุงู
ุงู.
+
+#### 8.1 ุงูุชุญูู ู
ู bitsandbytes
+```python
+import bitsandbytes as bnb
+print(f'โ
bitsandbytes {bnb.__version__}')
+```
+> ุฅุฐุง ูุดู: ููููู ุงูุชุดุบูู ููุฑุงู ู
ุน ุฑุณุงูุฉ ูุงุถุญุฉ.
+
+#### 8.2 ุชุดุบูู uvicorn
+```bash
+python -m uvicorn app.main:app --host 0.0.0.0 --port 8000 --log-level info
+```
+- `--host 0.0.0.0`: ูุณุชู
ุน ุนูู ูู ุงููุงุฌูุงุช (ู
ุทููุจ ููู tunnel)
+- ุงูููุฌ ููุญูุธ ูู `/content/murshid_server.log`
+
+#### 8.3 ุชุญู
ูู ุงููู
ุงุฐุฌ (lifespan)
+ุนูุฏ ุจุฏุก ุงูุฎุงุฏู
ุชูููููุฐ `load_models()` ุจูุฐุง ุงูุชุฑุชูุจ:
+
+```
+1. hf_login(token) โ 1-2 ุซุงููุฉ
+2. LLaMA 3 8B-Instruct (4-bit NF4) โ 5-8 ุฏูุงุฆู (4.5 GB)
+ - BitsAndBytesConfig: load_in_4bit=True
+ - bnb_4bit_quant_type="nf4"
+ - bnb_4bit_compute_dtype=float16
+3. SecureBERT+ (ehsanaghaei) โ 1-2 ุฏูููุฉ
+ - AutoModel + AutoTokenizer
+ - mean pooling 768-dim
+4. LogisticRegressionModel โ < 1 ุซุงููุฉ
+ - joblib.load (Pipeline: PCA + OneVsRestClassifier)
+ - np.load thresholds
+```
+
+#### 8.4 ุงูุงูุชุธุงุฑ ุงูุฐูู
+```python
+for i in range(180): # 15 ุฏูููุฉ ูุญุฏ ุฃูุตู
+ time.sleep(5)
+ # ูุญุต /health ูู 5 ุซูุงูู
+ # ุนุฑุถ ุงูููุฌ ูู 30 ุซุงููุฉ
+ # ูุดู ู
ุจูุฑ ููุฃุฎุทุงุก (ERROR, ImportError)
+```
+
+#### 8.5 Cloudflare Tunnel
+```bash
+wget cloudflared-linux-amd64 โ /usr/local/bin/cloudflared
+cloudflared tunnel --url http://localhost:8000
+```
+- ูุง ูุญุชุงุฌ ุญุณุงุจุงู ุฃู ุชูููุงู
+- ูููุชุฌ ุฑุงุจุทุงู ู
ุซู: `https://xxxx.trycloudflare.com`
+- ุตุงูุญ ุทูุงู ุฌูุณุฉ Colab
+
+---
+
+### ุงูุฎููุฉ 9: ุฑุจุท ุงููุฑููุช ุชููุงุฆูุงู
+
+**ุงููุฏู:** ุชุญุฏูุซ `index.html` ุจุงูุฑุงุจุท ุงูุฌุฏูุฏ ู
ู Cloudflare ุชููุงุฆูุงู.
+
+```python
+# ุงุณุชุฎุฑุงุฌ ุงูุฑุงุจุท
+match = re.search(r'https://[a-z0-9\-]+\.trycloudflare\.com', content)
+public_url = match.group(0)
+
+# ุชุญุฏูุซ index.html ุนูู Drive
+html = re.sub(
+ r"const BASE = '[^']*';",
+ f"const BASE = '{public_url}';",
+ html
+)
+```
+
+**ุงููุชูุฌุฉ:**
+```javascript
+// ูุจู
+const BASE = 'http://127.0.0.1:8000';
+
+// ุจุนุฏ
+const BASE = 'https://xxxx.trycloudflare.com';
+```
+
+---
+
+
+
+### ุงูุฎููุฉ 10: ุงุฎุชุจุงุฑ ุงูู API
+
+**ุงููุฏู:** ุงูุชุญูู ู
ู ุนู
ู ูู ู
ููู.
+
+#### 10.1 Health Check
+```python
+urllib.request.urlopen('http://localhost:8000/health')
+```
+
+**ุงูู
ุฎุฑุฌ ุงูู
ุชููุน (FULL mode):**
+```json
+{
+ "pipeline_mode": "full",
+ "pipeline_description": "LLaMA + SecureBERT+ + LogReg",
+ "components": {
+ "llama_loaded": true,
+ "embedder_loaded": true,
+ "logreg_loaded": true,
+ "cuda_available": true
+ },
+ "all_model_files_present": true
+}
+```
+
+#### 10.2 ุชุญููู ูุงุนุฏุฉ ุงุฎุชุจุงุฑ
+```python
+rule_xml = '...'
+POST http://localhost:8000/rules/analyze
+```
+
+**ุงูู pipeline ุฎุทูุฉ ุจุฎุทูุฉ:**
+
+```
+XML Input (rule 18205)
+ โ
+sanitize_rule_from_string()
+ - ุญุฐู: mitre, if_sid, group, if_group
+ โ
+summarize_one_rule() [LLaMA]
+ - Input: sanitized XML
+ - Output: "Detects the deletion of a security-enabled global group on a Windows system."
+ โ
+build_text_for_embedding()
+ - text = summary + ". " + description
+ - "Detects the deletion of a security-enabled global group on a Windows system. Windows: Security Enabled Global Group Deleted."
+ โ
+SecureBERTEmbedder.embed_text()
+ - Chunks (256 tokens max)
+ - mean pooling per chunk
+ - average chunks โ 768-dim vector
+ - L2 normalize
+ โ
+LogisticRegressionModel.predict()
+ - predict_proba(X_user)
+ - pred = (proba >= logreg_thr)
+ - conf = proba * 100
+ - gap = proba - logreg_thr
+ โ
+save_technique_mappings() [DB]
+ - ุญูุธ 20 ุชูููุฉ ู
ุน confidence
+ โ
+JSON Response
+```
+
+**ุงูู
ุฎุฑุฌ ูููุงุนุฏุฉ 18205:**
+```
+Technique Pred Conf% Proba Thr Gap
+T1484 โ
94.76 0.9476 0.74 +0.2076 โ Primary
+T1531 โ 27.92 0.2792 ... ...
+T1070.004 โ 21.03 0.2103 ... ...
+T1098 โ 10.65 0.1065 ... ...
+T1112 โ 9.27 0.0927 ... ...
+```
+
+---
+ุงูุฎุทูุงุช ุงููุงุฏู
ุฉ ููู
ูุฏ ุงูู
ุญูู (LOCAL Mode) ุบูุฑ ุถุฑูุฑูู
+
+### ุงูุฎููุฉ 11: ุชุตุฏูุฑ ุงููุชุงุฆุฌ (ุงุฎุชูุงุฑู)
+
+**ุงููุฏู:** ุชุตุฏูุฑ ูุชุงุฆุฌ ุงูููุงุนุฏ ุงูู
ูุญููููุฉ ุฅูู JSON ูุงุณุชุฎุฏุงู
ูุง ูุงุญูุงู ุนูู ุงูุฌูุงุฒ ุงูู
ุญูู.
+
+```python
+export_path = f'{NEEDED_PATH}/murshid_full_results.json'
+json.dump(export_results, f, ensure_ascii=False, indent=2)
+```
+
+**ุงูุงุณุชุฎุฏุงู
:** ููู
ูููู ุงุณุชูุฑุงุฏ ูุชุงุฆุฌ FULL mode ูู ุงูุจุงููุฏ ุงูู
ุญูู ุจุฏูู GPU.
+
+---
+
+### ุงูุฎููุฉ 12: ุฅููุงู ุงูุฎุงุฏู
+
+```python
+cf_proc.terminate() # ุฅุบูุงู Cloudflare tunnel
+server_proc.terminate() # ุฅููุงู uvicorn
+```
+
+---
+
+## 4. ู
ูุงุฑูุฉ ุฃูุถุงุน ุงูุชุดุบูู
+
+| | FULL mode (Colab) | LOCAL mode (ุงูุฌูุงุฒ) | LITE mode |
+|--|-------------------|---------------------|-----------|
+| **LLaMA** | โ
| โ | โ |
+| **SecureBERT+** | โ
| โ
| โ |
+| **LogReg** | โ
| โ
| โ
|
+| **GPU** | Tesla T4 | ูุง ููุฒู
| ูุง ููุฒู
|
+| **Embedding** | ูุต ู
ูุซุฑู ุจู LLaMA | ูุตู ุงููุงุนุฏุฉ ููุท | ุนุดูุงุฆู |
+| **T1484 confidence** | **94.76%** | 89.29% | ุบูุฑ ู
ูุซูู |
+| **ุงููุฑุงุฑ ุงูููุงุฆู** | T1484 โ
| T1484 โ
| ุบูุฑ ู
ูุซูู |
+| **ููุช ุงูุชุญููู/ูุงุนุฏุฉ** | ~30-60 ุซุงููุฉ | ~2-5 ุซูุงูู | < 1 ุซุงููุฉ |
+| **ุงูุงุณุชุฎุฏุงู
** | ุฅูุชุงุฌ / ุนุฑุถ | ุชุทููุฑ ู
ุญูู | ุงุฎุชุจุงุฑ ููุท |
+
+---
+
+## 5. ู
ุนู
ุงุฑูุฉ ุงููุธุงู
ุงููุงู
ูุฉ ุนูู Colab
+
+```
+โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+โ Google Colab VM โ
+โ โ
+โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
+โ โ /content/murshid_backend/ โ โ
+โ โ โ โ
+โ โ FastAPI (uvicorn :8000) โ โ
+โ โ โโโ /health โ โ
+โ โ โโโ POST /rules/analyze โ โ
+โ โ โโโ GET /results/{rule_id} โ โ
+โ โ โโโ GET /queries/{tech_id} โ โ
+โ โ โโโ GET /api/db/... โ โ
+โ โโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโ โ
+โ โ โ
+โ โโโโโโโโโโโโโโโโโดโโโโโโโโโโโโ โ
+โ โ ML Models (GPU VRAM) โ โ
+โ โ โโโ LLaMA 3 8B (4-bit) โ โ
+โ โ โโโ SecureBERT+ โ โ
+โ โ โโโ LogReg Pipeline โ โ
+โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
+โ โ โ
+โ โโโโโโโโโโโโโโโโโดโโโโโโโโโโโโ โ
+โ โ /content/murshid.db โ โ
+โ โ (SQLite โ 6 ุฌุฏุงูู) โ โ
+โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
+โ โ
+โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
+โ โ cloudflared tunnel โ โ
+โ โ localhost:8000 โ HTTPS โ โ
+โ โโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโ โ
+โโโโโโโโโโโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ โ
+ โผ
+ https://xxxx.trycloudflare.com
+ โ
+ โผ
+ โโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ โ ุงูู
ุชุตูุญ / ุงููุฑููุช โ
+ โ index.html (React) โ
+ โโโโโโโโโโโโโโโโโโโโโโโโโโโ
+```
+
+---
+
+## 6. ุงูุฃุฎุทุงุก ุงูุดุงุฆุนุฉ ูุญููููุง
+
+| ุงูุฎุทุฃ | ุงูุณุจุจ | ุงูุญู |
+|-------|-------|------|
+| `ImportError: bitsandbytes>=0.46.1` | ูุณุฎุฉ ูุฏูู
ุฉ | ุดุบููู `!pip install -U bitsandbytes>=0.46.1` |
+| `FileNotFoundError: murshid_backend` | ZIP ุบูุฑ ู
ุฑููุน | ุงุฑูุนู `murshid_backend_for_drive.zip` ุฅูู Drive |
+| `ERR_NGROK_4018` | ngrok ูุญุชุงุฌ ุญุณุงุจุงู | ุงุณุชุฎุฏู
ู Cloudflare Tunnel (ุฎููุฉ 8) |
+| `Cannot connect to backend` | CORS ู
ุบูู | `allow_origins=["*"]` ูู `main.py` |
+| Server ูุณุชุบุฑู > 15 ุฏูููุฉ | ุชูุฒูู LLaMA ุจุทูุก | ูู ุงูุฌูุณุฉ ุงูุซุงููุฉ ุงูุชูุฒูู ู
ู Cache |
+| `InconsistentVersionWarning` | sklearn ุฅุตุฏุงุฑ ู
ุฎุชูู | ุชุฃูุฏู ู
ู `scikit-learn==1.6.1` |
+
+---
+
+## 7. ุงูู Endpoints ุงูู
ุชุงุญุฉ ุจุนุฏ ุงูุชุดุบูู
+
+| Method | Endpoint | ุงููุตู |
+|--------|----------|-------|
+| `GET` | `/health` | ุญุงูุฉ ุงูุฎุงุฏู
ูุงููู
ุงุฐุฌ |
+| `GET` | `/api/stats` | ุฅุญุตุงุฆูุงุช Dashboard |
+| `GET` | `/api/db/summary` | ุนุฏุฏ ุงูุตููู ูู ุงูุฌุฏุงูู |
+| `GET` | `/api/db/rules` | ุฌู
ูุน ุงูููุงุนุฏ ูู DB |
+| `GET` | `/api/db/mappings` | ุฌู
ูุน ุงูู
ุทุงุจูุงุช |
+| `GET` | `/api/db/techniques` | ุชูููุงุช MITRE ุงูู
ุฎุฒููุฉ |
+| `GET` | `/api/db/templates` | ููุงูุจ WQL |
+| `POST` | `/api/db/import-excel` | ุงุณุชูุฑุงุฏ Excel |
+| `POST` | `/rules/analyze` | ุชุญููู ูุงุนุฏุฉ XML (FULL pipeline) |
+| `GET` | `/results/{rule_id}` | ูุชุงุฆุฌ ุชูููุฉ ูุงุนุฏุฉ ู
ุญุฏุฏุฉ |
+| `GET` | `/queries/{technique_id}` | ุงุณุชุนูุงู
ุงุช WQL ูุชูููุฉ |
+| `POST` | `/admin/templates` | ุฅุถุงูุฉ ูุงูุจ WQL |
+| `PATCH` | `/admin/templates/{id}` | ุชุนุฏูู ูุงูุจ |
+| `GET` | `/docs` | Swagger UI ุงูุชูุงุนูู |
+
+---
+
+## 8. ู
ูุงุญุธุงุช ููุนุฑุถ ุงูุชูุฏูู
ู
+
+1. **ุดุบููู ุงูุฎูุงูุง ูุจู ุงูุนุฑุถ ุจู 15 ุฏูููุฉ** (ููุช ุชุญู
ูู LLaMA)
+2. **ุงูุณุฎู ุฑุงุจุท Cloudflare** ูุชุญููู ู
ูู ูู ุงูู
ุชุตูุญ
+3. **ุงููุฑููุช ููุญุฏููุซ ุชููุงุฆูุงู** ุจุงูุฑุงุจุท ุงูุฌุฏูุฏ ูู ุฎููุฉ 9
+4. **ูู ุฌูุณุฉ Colab ุฌุฏูุฏุฉ = ุฑุงุจุท Cloudflare ุฌุฏูุฏ** โ ูุฑูุฑู ุงูุฎุทูุงุช
+5. **DB ูุงุฑุบุฉ ูู ูู ุฌูุณุฉ** โ ุญูููู ุงูููุงุนุฏ ุนุจุฑ Admin Panel ุฃู ุฎููุฉ ุงุฎุชุจุงุฑ
+
+---
+
+*ุชุงุฑูุฎ ุงูุฅูุดุงุก: 8 ุฃุจุฑูู 2026 | ู
ุดุฑูุน ู
ูุฑุดูุฏ โ CCIS, PNU*
diff --git a/Needed/murshid_label_columns.json b/Needed/murshid_label_columns.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a3d6e9d00e91408fa9b1d28f1d33a50992cd8a6
--- /dev/null
+++ b/Needed/murshid_label_columns.json
@@ -0,0 +1,22 @@
+[
+ "T1047",
+ "T1055",
+ "T1059.001",
+ "T1070.004",
+ "T1078",
+ "T1083",
+ "T1095",
+ "T1098",
+ "T1105",
+ "T1110",
+ "T1112",
+ "T1114",
+ "T1176",
+ "T1190",
+ "T1484",
+ "T1498",
+ "T1499",
+ "T1529",
+ "T1531",
+ "T1562.001"
+]
\ No newline at end of file
diff --git a/Needed/murshid_logreg_pipeline_manual_oof_pcatuned.joblib b/Needed/murshid_logreg_pipeline_manual_oof_pcatuned.joblib
new file mode 100644
index 0000000000000000000000000000000000000000..5ece55c1a1bd7e50f60b14abf3819d8e9a0a96fd
--- /dev/null
+++ b/Needed/murshid_logreg_pipeline_manual_oof_pcatuned.joblib
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be629d9f6780456a9435f8be2655e3fa0a848fbe2a4f166813913331b4c43ba4
+size 206584
diff --git a/Needed/murshid_logreg_thresholds_manual_oof_pcatuned.npy b/Needed/murshid_logreg_thresholds_manual_oof_pcatuned.npy
new file mode 100644
index 0000000000000000000000000000000000000000..9cf4604dd051177673475df7c2a8223394f9b99e
--- /dev/null
+++ b/Needed/murshid_logreg_thresholds_manual_oof_pcatuned.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:005a664d7faa22104e4a9e58ace6976628d1d00c1cabcaead1833ff792366c79
+size 208
diff --git a/Needed/murshid_query_template_structure_clean_shared.xlsx b/Needed/murshid_query_template_structure_clean_shared.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..92b947fcb2e29b202201cabad674ad44111b9667
--- /dev/null
+++ b/Needed/murshid_query_template_structure_clean_shared.xlsx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a1491c4dee86bbf29691b3c4254a344e2cb87eabbb77f04f49da09856cb1d145
+size 20938
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bcd145c353c7da7aefdb630be9ca4fc74b6e25e2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,39 @@
+---
+title: Murshid - ู
ูุฑุดูุฏ
+emoji: ๐ก๏ธ
+colorFrom: blue
+colorTo: indigo
+sdk: docker
+pinned: false
+license: mit
+---
+
+# ๐ก๏ธ Murshid | ู
ูุฑุดูุฏ
+
+**From Alerts to Guidance: MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts**
+
+REST API + Dashboard for analyzing Wazuh IDS rules and mapping them to MITRE ATT&CK techniques.
+
+## Features
+
+- **Rule Analysis**: Parse Wazuh XML rules and classify MITRE ATT&CK techniques
+- **WQL Queries**: Get pre-built Wazuh Query Language templates per technique
+- **Dashboard**: Interactive web UI with statistics and DB viewer
+- **ML Pipeline**: Logistic Regression with SecureBERT+ embeddings
+
+## Tech Stack
+
+- **FastAPI** โ REST API
+- **SQLite** โ Database
+- **Logistic Regression** โ Primary classification model
+- **SecureBERT+** โ Text embeddings (optional, requires torch)
+
+## API Endpoints
+
+| Method | URL | Description |
+|--------|-----|-------------|
+| `GET` | `/health` | System health check |
+| `POST` | `/rules/analyze` | Analyze a Wazuh XML rule |
+| `GET` | `/results/{rule_id}` | Get stored results for a rule |
+| `GET` | `/queries/{technique_id}` | Get WQL templates for a technique |
+| `GET` | `/docs` | Interactive Swagger documentation |
diff --git a/murshid_backend/README.md b/murshid_backend/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3c8e7ae9131131334df33516a3c192aeb22f7a33
--- /dev/null
+++ b/murshid_backend/README.md
@@ -0,0 +1,156 @@
+# Murshid Backend
+
+REST API ูู
ุดุฑูุน "ู
ุฑุดุฏ โ ู
ู ุงูุชูุจููุงุช ุฅูู ุงูุชูุฌูู: ุฑุจุท ุชูููุงุช MITRE ATT&CK ูู
ุญููู SOC"
+
+## ุงูุชูููุงุช
+
+- **FastAPI** โ REST API
+- **MySQL** + **SQLAlchemy** โ ูุงุนุฏุฉ ุงูุจูุงูุงุช
+- **Alembic** โ ูุฌุฑุฉ ุงูุฌุฏุงูู
+- **Logistic Regression** โ ุงููู
ูุฐุฌ ุงูุฃุณุงุณู ูู ูุฐู ุงูู
ุฑุญูุฉ
+- **SecureBERT+** โ ุชุถู
ููุงุช ูุตูุฉ
+- **Llama 3 8B** โ ุชูุฎูุต ููุงุนุฏ Wazuh
+
+> ุงูู
ูุทู ู
ุณุชุฎุฑุฌ ู
ู `MurshidUIPipeline.ipynb` ุฏูู ุชุนุฏููู.
+
+---
+
+## ูููู ุงูู
ุดุฑูุน
+
+```
+murshid_backend/
+ app/
+ main.py โ ููุทุฉ ุชุดุบูู FastAPI
+ config.py
+ api/routes/
+ health.py โ GET /health
+ rules.py โ POST /rules/analyze + GET /results/{rule_id}
+ queries.py โ GET /queries/{technique_id} + Admin endpoints
+ services/
+ ml_service.py
+ rule_service.py
+ result_service.py
+ template_service.py
+ ml/
+ sanitizer.py โ ุชูุธูู XML
+ summarizer.py โ ุชูุฎูุต Llama
+ embedder.py โ SecureBERT+
+ logistic_model.py โ Logistic Regression inference
+ pipeline.py โ analyze_rule() ุงูุดุงู
ู
+ models/ โ SQLAlchemy ORM (6 ุฌุฏุงูู ู
ู ER Diagram)
+ schemas/ โ Pydantic schemas
+ repositories/ โ DB access layer
+ db/
+ base.py
+ session.py
+ alembic/
+ versions/0001_initial_schema.py
+ requirements.txt
+ .env.example
+```
+
+---
+
+## ุฌุฏุงูู ูุงุนุฏุฉ ุงูุจูุงูุงุช (ู
ุณุชุฎุฑุฌุฉ ู
ู ER Diagram ยง3.2.6)
+
+| ุฌุฏูู | ุงูู
ุตุฏุฑ ูู ุงูุชูุฑูุฑ |
+|------|-------------------|
+| `users` | User entity โ username, email, password_hash, role |
+| `mapping_jobs` | MappingJob entity โ job_id, file_name, status, progress, timestamp |
+| `rules` | Rule entity โ rule_id, embedding_vector, job_id |
+| `techniques` | Technique entity โ technique_id, technique_name, tactic |
+| `rule_technique_mappings` | RuleTechniqueMapping โ rule_id, technique_id, confidence_score |
+| `query_templates` | QueryTemplate โ purpose, wql_query, note, is_active |
+
+---
+
+## ุงูุฅุนุฏุงุฏ ูุงูุชุดุบูู
+
+### 1) ู
ุชุทูุจุงุช
+
+- Python 3.10+
+- MySQL 8+
+- GPU ู
ูุตู ุจู ูู Llama 3 8B
+
+### 2) ุชุซุจูุช
+
+```powershell
+cd d:\GP\murshid_backend
+python -m venv .venv
+.\.venv\Scripts\activate
+pip install -r requirements.txt
+```
+
+### 3) ุฅุนุฏุงุฏ ูุงุนุฏุฉ ุงูุจูุงูุงุช
+
+ุฅูุดุงุก ูุงุนุฏุฉ ุงูุจูุงูุงุช ูู MySQL:
+```sql
+CREATE DATABASE murshid_db CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
+```
+
+### 4) ุฅุนุฏุงุฏ `.env`
+
+```powershell
+copy .env.example .env
+```
+
+ุนุฏููู ุงูููู
:
+```env
+MURSHID_DB_URL=mysql+pymysql://root:YOUR_PASSWORD@localhost:3306/murshid_db
+MURSHID_MODELS_DIR=d:/GP/Needed
+HF_TOKEN=hf_xxxx
+MURSHID_SKIP_LLM=false
+```
+
+### 5) ุชุฃูุฏ ู
ู ูุฌูุฏ ู
ููุงุช ุงููู
ุงุฐุฌ ูู `d:\GP\Needed`
+
+```
+murshid_logreg_pipeline_manual_oof_pcatuned.joblib
+murshid_logreg_thresholds_manual_oof_pcatuned.npy
+murshid_label_columns.json
+```
+
+### 6) ุชุดุบูู Alembic (ูุฌุฑุฉ ุงูุฌุฏุงูู)
+
+```powershell
+alembic upgrade head
+```
+
+### 7) ุชุดุบูู ุงูู API
+
+```powershell
+uvicorn app.main:app --reload --host 127.0.0.1 --port 8000
+```
+
+---
+
+## ุงูู Endpoints
+
+| Method | URL | ุงููุตู |
+|--------|-----|--------|
+| `GET` | `/health` | ูุญุต ุญุงูุฉ ุงููุธุงู
ูุงููู
ุงุฐุฌ |
+| `POST` | `/rules/analyze` | ุชุญููู ูุงุนุฏุฉ Wazuh XML ูุญูุธ ุงููุชุงุฆุฌ |
+| `GET` | `/results/{rule_id}` | ุงุณุชุฑุฌุงุน ุงูุชูููุงุช ุงูู
ุฎุฒูุฉ ูู
ุนุฑู ุงููุงุนุฏุฉ |
+| `GET` | `/queries/{technique_id}` | ุฌูุจ ููุงูุจ WQL ูุชูููุฉ ู
ุนููุฉ |
+| `POST` | `/admin/templates` | ุฅุถุงูุฉ ูุงูุจ WQL ุฌุฏูุฏ (Admin) |
+| `PATCH` | `/admin/templates/{id}` | ุชุนุฏูู ุฃู ุชุนุทูู ูุงูุจ (Admin) |
+
+### ู
ุซุงู โ ุชุญููู ูุงุนุฏุฉ
+
+```bash
+curl -X POST http://127.0.0.1:8000/rules/analyze \
+ -H "Content-Type: application/json" \
+ -d '{"rule_xml": "Registry Key Entry Deleted."}'
+```
+
+### ุงูุชูุซูู ุงูุชูุงุนูู
+
+ุงูุชุญู: **http://127.0.0.1:8000/docs**
+
+---
+
+## ู
ูุงุญุธุงุช
+
+- ุงูู
ูู ุงูุฃุตูู `MurshidUIPipeline.ipynb` **ูู
ููุนุฏููู** โ ุงูู
ูุทู ู
ูุณูุฎ ุฅูู ุทุจูุฉ `app/ml/`.
+- ุงููู
ูุฐุฌ ุงูู
ุนุชู
ุฏ ูู ูุฐู ุงูู
ุฑุญูุฉ: **Logistic Regression** ููุท.
+- ูุชุดุบูู ุจุฏูู GPU ููุงุฎุชุจุงุฑ ููุท: ุถุนู `MURSHID_SKIP_LLM=true` ูู `.env` (ููู `/rules/analyze` ุณุชุนูุฏ 503).
diff --git a/murshid_backend/TECHNICAL_REPORT.md b/murshid_backend/TECHNICAL_REPORT.md
new file mode 100644
index 0000000000000000000000000000000000000000..0bce7ea061d1e3fceb3469629d1f91ee04a754d3
--- /dev/null
+++ b/murshid_backend/TECHNICAL_REPORT.md
@@ -0,0 +1,322 @@
+# ุชูุฑูุฑ ุชููู ู
ูุตูู โ ู
ุดุฑูุน ู
ูุฑุดูุฏ (Murshid)
+## From Alerts to Guidance: MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts
+
+---
+
+## 1. ูุธุฑุฉ ุนุงู
ุฉ
+
+ู
ูุฑุดูุฏ ูุธุงู
ุฐูู ูุญููู ุชูุจููุงุช ููุงุนุฏ Wazuh XML ุฅูู ุชูููุงุช MITRE ATT&CK ู
ูุฑุชูุจุฉ ุจุฏุฑุฌุงุช ุซูุฉุ ููููุชุฌ ุงุณุชุนูุงู
ุงุช ุชุญููู WQL ุฌุงูุฒุฉ ูู
ุญููู SOC.
+
+```
+ูุงุนุฏุฉ Wazuh XML
+ โ
+ Sanitization (ุญุฐู if_sid, group, mitre)
+ โ
+ LLaMA 3 8B (ุชูุฎูุต ุจุฌู
ูุฉ ูุงุญุฏุฉ)
+ โ
+ SecureBERT+ (768-dim embedding)
+ โ
+ Logistic Regression + PCA (ุชุตููู)
+ โ
+ ุชูููุงุช MITRE ATT&CK + Confidence Scores
+ โ
+ ููุงูุจ WQL ููุชุญููู
+```
+
+---
+
+## 2. ูููู ุงูู
ุดุฑูุน ุงููุงู
ู
+
+```
+d:\GP\
+โโโ MurshidUIPipeline.ipynb โ ุงูุฏูุชุฑ ุงูุฃุตูู (ูุง ููุนุฏููู)
+โโโ Needed\ โ ู
ููุงุช ุงููู
ุงุฐุฌ ุงูู
ุฏุฑูุจุฉ
+โ โโโ murshid_logreg_pipeline_manual_oof_pcatuned.joblib
+โ โโโ murshid_logreg_thresholds_manual_oof_pcatuned.npy
+โ โโโ murshid_svmlinear_per_label_thresholds.joblib
+โ โโโ murshid_label_columns.json (20 ุชูููุฉ)
+โโโ murshid_backend\ โ ุฎุฏู
ุฉ FastAPI
+โ โโโ app\
+โ โ โโโ main.py
+โ โ โโโ config.py
+โ โ โโโ api\routes\
+โ โ โ โโโ health.py GET /health
+โ โ โ โโโ rules.py POST /rules/analyze | GET /results/{rule_id}
+โ โ โ โโโ queries.py GET /queries/{technique_id} | POST,PATCH /admin/templates
+โ โ โ โโโ stats.py GET /api/stats
+โ โ โ โโโ db_viewer.py GET /api/db/{summary|rules|mappings|...}
+โ โ โโโ ml\
+โ โ โ โโโ sanitizer.py ุชูุธูู XML
+โ โ โ โโโ summarizer.py LLaMA inference
+โ โ โ โโโ embedder.py SecureBERT+ embeddings
+โ โ โ โโโ logistic_model.py LogReg inference (PRIMARY)
+โ โ โ โโโ pipeline.py ุชูุณูู ุงูู
ุฑุงุญู (FULL|LOCAL|LITE)
+โ โ โโโ models\ SQLAlchemy ORM
+โ โ โ โโโ user.py
+โ โ โ โโโ mapping_job.py
+โ โ โ โโโ rule.py
+โ โ โ โโโ technique.py
+โ โ โ โโโ rule_technique_mapping.py
+โ โ โ โโโ query_template.py
+โ โ โโโ schemas\ Pydantic schemas
+โ โ โโโ services\ Business logic
+โ โ โโโ repositories\ DB access
+โ โ โโโ db\ SQLAlchemy session
+โ โโโ alembic\ Migrations
+โ โโโ murshid.db SQLite database
+โ โโโ .env
+โ โโโ requirements.txt
+โโโ murshid_frontend\ ูุงุฌูุฉ React
+ โโโ index.html
+```
+
+---
+
+## 3. ุทุจูุฉ ุงูุจุงููุฏ (FastAPI)
+
+### 3.1 ุงูู Endpoints
+
+| Method | URL | ุงููุตู | Actor |
+|--------|-----|--------|-------|
+| `GET` | `/health` | ุญุงูุฉ ุงููุธุงู
+ pipeline mode + ู
ููุงุช ุงููู
ุงุฐุฌ | All |
+| `GET` | `/api/stats` | ุฅุญุตุงุฆูุงุช Dashboard (KPIs + Technique Frequency) | All |
+| `GET` | `/api/db/summary` | ุนุฏุฏ ุงูุตููู ูู ูู ุฌุฏูู | Testing |
+| `GET` | `/api/db/rules` | ุฌู
ูุน ุงูููุงุนุฏ ุงูู
ุฎุฒููุฉ | Testing |
+| `GET` | `/api/db/mappings` | ุฌู
ูุน ู
ุทุงุจูุงุช ุงูููุงุนุฏ-ุงูุชูููุงุช | Testing |
+| `GET` | `/api/db/techniques` | ุฌู
ูุน ุชูููุงุช MITRE ุงูู
ุฎุฒููุฉ | Testing |
+| `GET` | `/api/db/templates` | ุฌู
ูุน ููุงูุจ WQL | Testing |
+| `POST` | `/rules/analyze` | ุชุญููู ูุงุนุฏุฉ XML โ ุชุฎุฒูู ุงููุชุงุฆุฌ | Admin |
+| `GET` | `/results/{rule_id}` | ุงุณุชุฑุฌุงุน ุชูููุงุช ูุงุนุฏุฉ ู
ุญุฏุฏุฉ (Figure 4-11/12) | SOC Analyst |
+| `GET` | `/queries/{technique_id}` | ููุงูุจ WQL ูุชูููุฉ ู
ุญุฏุฏุฉ | SOC Analyst |
+| `POST` | `/admin/templates` | ุฅุถุงูุฉ ูุงูุจ WQL ุฌุฏูุฏ | Admin |
+| `PATCH` | `/admin/templates/{id}` | ุชุนุฏูู/ุชุนุทูู ูุงูุจ | Admin |
+
+### 3.2 ู
ุนู
ุงุฑูุฉ ุงูุทุจูุงุช
+
+```
+HTTP Request
+ โ
+ โผ
+API Layer (FastAPI routes)
+ โ validates input (Pydantic)
+ โผ
+Service Layer
+ โ orchestrates business logic
+ โผ
+ML Layer Repository Layer
+ โ โ
+ โผ โผ
+Pipeline SQLAlchemy ORM
+(sanitizeโembedโclassify) โ
+ โ โผ
+ โโโโโโโโโโโโ SQLite DB
+```
+
+### 3.3 ูุงุนุฏุฉ ุงูุจูุงูุงุช (SQLite + SQLAlchemy)
+
+ู
ุณุชุฎุฑุฌุฉ ุญุฑููุงู ู
ู ER Diagram (ยง3.2.6 ู
ู ุงูุชูุฑูุฑ):
+
+| ุงูุฌุฏูู | ุงูุฃุนู
ุฏุฉ ุงูุฑุฆูุณูุฉ | ุงูู
ุตุฏุฑ ูู ุงูุชูุฑูุฑ |
+|--------|------------------|-------------------|
+| `users` | user_id, username, email, password_hash, role | User entity |
+| `mapping_jobs` | job_id, user_id, file_name, status, progress, timestamp | MappingJob entity |
+| `rules` | rule_id (PK), job_id, embedding_vector | Rule entity |
+| `techniques` | technique_id (PK), technique_name, tactic | Technique entity |
+| `rule_technique_mappings` | mapping_id, rule_id, technique_id, confidence_score | RuleTechniqueMapping |
+| `query_templates` | template_id, technique_id, purpose, wql_query, note, is_active | QueryTemplate |
+
+> Index ุนูู `rule_id` ูู `rule_technique_mappings` (Use Case 6 ยง3.2.7)
+
+---
+
+## 4. ุทุจูุฉ ML
+
+### 4.1 ู
ุฑุงุญู ุงูู Pipeline (ู
ู ุงูุฏูุชุฑ)
+
+#### ุงูู
ุฑุญูุฉ 1: Sanitization
+```python
+# ml/sanitizer.py โ ู
ู cell 10 ูู ุงูุฏูุชุฑ
+REMOVE_TAGS_ANYWHERE = {"mitre", "if_sid", "group", "if_group"}
+# ููุญุฐู: group tags, if_sid, mitre IDs, compliance tags
+# ูุจูู: description, id, category, decoded_as, info
+```
+
+#### ุงูู
ุฑุญูุฉ 2: LLM Summarization (LLaMA 3 8B)
+```python
+# ml/summarizer.py โ ู
ู cell 11 ูู ุงูุฏูุชุฑ
+# Input: sanitized XML
+# Prompt: "Write EXACTLY ONE sentence describing the observable event pattern"
+# Output: JSON {"summary": "Detects ..."}
+# Constraints: 7-18 words, ูุจุฏุฃ ุจู Detects/Monitors/...
+```
+
+#### ุงูู
ุฑุญูุฉ 3: Paragraph Construction
+```python
+# ml/embedder.py โ ู
ู cell 12 ูู ุงูุฏูุชุฑ
+text = f"{summary}. {description}."
+# ู
ุซุงู: "Detects deletion of global group. Windows: Security Enabled Global Group Deleted."
+```
+
+#### ุงูู
ุฑุญูุฉ 4: SecureBERT+ Embedding
+```python
+# ml/embedder.py โ ู
ู cell 15 ูู ุงูุฏูุชุฑ
+# Model: ehsanaghaei/SecureBERT_Plus
+# MAX_LEN: 512 tokens, chunks
+# Pooling: Mean pooling across tokens โ 768-dim vector
+# Normalization: L2
+```
+
+#### ุงูู
ุฑุญูุฉ 5: Logistic Regression Inference
+```python
+# ml/logistic_model.py โ ู
ู cell 18-19 ูู ุงูุฏูุชุฑ
+proba = logreg_model.predict_proba(X_user)
+proba = proba.reshape(-1)
+pred = (proba >= logreg_thr).astype(int)
+conf = proba * 100
+gap = proba - logreg_thr
+# ุชูุฑุฌุน ุฌู
ูุน ุงูู 20 ุชูููุฉ ู
ุฑุชูุจุฉ ุชูุงุฒููุงู
+```
+
+### 4.2 ุฃูุถุงุน ุงูุชุดุบูู
+
+| ุงููุถุน | ุงูุดุฑุท | ุงูุฏูุฉ | ุงูุงุณุชุฎุฏุงู
|
+|-------|--------|-------|-----------|
+| **FULL** | LLaMA + SecureBERT + LogReg | 100% (ู
ุทุงุจู ููุฏูุชุฑ) | Colab/GPU |
+| **LOCAL** | SecureBERT + LogReg (ุจุฏูู LLaMA) | ~95% (ูุตู ุจุฏูู ู
ูุฎุต) | ุงูุฌูุงุฒ ุงูู
ุญูู |
+| **LITE** | LogReg ููุท (ุจุฏูู torch) | ู
ูุฎูุถุฉ (ุนุดูุงุฆู) | ุงุฎุชุจุงุฑ ุงูุจููุฉ ููุท |
+
+---
+
+## 5. ุทุจูุฉ ุงููุฑููุช (React + Tailwind + Chart.js)
+
+### 5.1 ุงูุตูุญุงุช (CDN-based React, ุจุฏูู Build Step)
+
+| ุงูุตูุญุฉ | ID | ุงูู
ุณุชุฎุฏู
| ุงููุตู |
+|--------|-----|----------|--------|
+| Login | โ | All | ุชุณุฌูู ุฏุฎูู + ุงุฎุชูุงุฑ ุฏูุฑ |
+| Dashboard | `dashboard` | All | KPIs + MITRE Technique Frequency Chart |
+| Rule Lookup | `rules` | SOC Analyst | ุจุญุซ ุจู Rule ID โ Figure 4-11 + Figure 4-12 |
+| ูุชุงุฆุฌ DB | `dbviewer` | All | ุงุณุชุนุฑุงุถ ูุงุนุฏุฉ ุงูุจูุงูุงุช ููุงุฎุชุจุงุฑ |
+| Rule Mapping | `admin` | Admin | ุฑูุน XML + ุชุญููู + ุฌุฏูู ุงูุชูุฏู
|
+| WQL Templates | `templates` | Admin | ุฅุฏุงุฑุฉ ููุงูุจ ุงูุงุณุชุนูุงู
ุงุช |
+| Settings | `settings` | All | ู
ูู ุดุฎุตู + Dark Mode + ุฃููุงู |
+
+### 5.2 ุงูู Figures ูู
ุง ูู ุงูุชูุฑูุฑ
+
+| Figure | ุงูุตูุญุฉ | ุงูู
ูููู |
+|--------|--------|---------|
+| Figure 4-10 | Rule Lookup | Search bar + Rule ID input |
+| Figure 4-11 | Rule Lookup | `TechniqueDistributionChart` โ Horizontal bar chart (Top 5, ู
ูููููู H/M/L) |
+| Figure 4-12 | Rule Lookup | Investigation Queries table (Primary + Secondary โฅ50%) |
+| Figure 4-13 | Admin | Rule Mapping Panel (paste XML + Submit) |
+| Figure 4-14 | Admin | Mapping Progress Table (Job ID, Status, Progress) |
+| Figure 4-9 | Dashboard | KPIs + Technique Frequency Bar Chart |
+
+### 5.3 ุฑุจุท ุงููุฑููุช ุจุงูุจุงููุฏ
+
+```javascript
+const BASE = 'http://127.0.0.1:8000';
+// CORS ู
ููุนููู ูู ุงูุจุงููุฏ ูู http://localhost:5173 ู http://127.0.0.1:5173
+// ุงููุฑููุช ููุฎุฏููู
ู
ุจุงุดุฑุฉู ู
ู FastAPI ุนุจุฑ StaticFiles
+```
+
+---
+
+## 6. ู
ุฎุทุท ุชุฏูู ุงูุจูุงูุงุช ุงููุงู
ู
+
+```
+โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+โ SOC Analyst / Admin โ
+โ (murshid_frontend/index.html) โ
+โโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโ
+ โ HTTP/JSON
+ โผ
+โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+โ FastAPI (port 8000) โ
+โ โ
+โ /health โ pipeline status โ
+โ POST /rules/analyze: โ
+โ 1. sanitizer.py โ clean XML โ
+โ 2. summarizer.py โ LLaMA summary โ โ FULL mode only
+โ 3. embedder.py โ 768-dim vector โ
+โ 4. logistic_model โ proba + scores โ
+โ 5. rule_repo โ save to DB โ
+โ โ
+โ GET /results/{id} โ from DB โ
+โ GET /queries/{id} โ WQL templates โ
+โโโโโโโโโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโ
+ โ SQLAlchemy
+ โผ
+โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+โ SQLite (murshid.db) โ
+โ rules | techniques | mappings โ
+โ query_templates | mapping_jobs โ
+โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+```
+
+---
+
+## 7. ุงูุชุดุบูู
+
+### ุงูู
ุชุทูุจุงุช
+- Python 3.12 (ุนุจุฑ uv)
+- ู
ููุงุช ุงููู
ุงุฐุฌ ูู `d:\GP\Needed\`
+- ุงุชุตุงู ุฅูุชุฑูุช (ูู SecureBERT+ ู
ู HuggingFace ุฃูู ู
ุฑุฉ)
+
+### ุชุดุบูู ุงูุฎุงุฏู
+```powershell
+cd d:\GP\murshid_backend
+.venv\Scripts\python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000
+```
+
+### ุงูุฑูุงุจุท
+| ุงูุฑุงุจุท | ุงููุตู |
+|--------|--------|
+| http://127.0.0.1:8000/index.html | ุงููุงุฌูุฉ ุงูุฑุฆูุณูุฉ |
+| http://127.0.0.1:8000/docs | Swagger API Documentation |
+| http://127.0.0.1:8000/health | ูุญุต ุญุงูุฉ ุงููุธุงู
|
+| http://127.0.0.1:8000/api/db/summary | ู
ูุฎุต ูุงุนุฏุฉ ุงูุจูุงูุงุช |
+
+### ุงุฎุชุจุงุฑ ุณุฑูุน
+```powershell
+# 1. ุชุญููู ูุงุนุฏุฉ
+$body = '{"rule_xml":"Registry Key Entry Deleted."}'
+Invoke-RestMethod -Uri "http://127.0.0.1:8000/rules/analyze" -Method POST -ContentType "application/json" -Body $body
+
+# 2. ุงุณุชุฑุฌุงุน ุงููุชุงุฆุฌ
+Invoke-RestMethod "http://127.0.0.1:8000/results/597"
+
+# 3. ุฅุถุงูุฉ ูุงูุจ WQL
+$t = '{"technique_id":"T1112","purpose":"Detect registry modification","wql_query":"agent.name:${HOST} AND rule.description:\"registry\"","note":"Replace ${HOST}"}'
+Invoke-RestMethod -Uri "http://127.0.0.1:8000/admin/templates" -Method POST -ContentType "application/json" -Body $t
+
+# 4. ุฌูุจ ุงูุงุณุชุนูุงู
ุงุช
+Invoke-RestMethod "http://127.0.0.1:8000/queries/T1112"
+```
+
+---
+
+## 8. ุงููุฑู ุจูู FULL mode (Colab) ู LOCAL mode (ุงูุฌูุงุฒ)
+
+| | Colab (FULL) | ุงูุฌูุงุฒ ุงูู
ุญูู (LOCAL) |
+|--|-------------|----------------------|
+| Input text | `"Detects deletion of a security-enabled global group. Windows: Security Enabled Global Group Deleted."` | `"Windows: Security Enabled Global Group Deleted"` |
+| T1484 proba | **0.9476 (94.76%)** | **0.8929 (89.29%)** |
+| ุณุจุจ ุงููุฑู | LLaMA ููุซุฑู ุงููุต ุจุณูุงู ุฏูุงูู | ุงููุตู ููุท ุจุฏูู ุฅุซุฑุงุก |
+| ุงููุฑุงุฑ ุงูุตุญูุญ | T1484 โ
| T1484 โ
|
+
+**ุงูุงุณุชูุชุงุฌ:** ุงููุฑุงุฑ ุงูููุงุฆู ุตุญูุญ ูู ููุง ุงููุถุนูู โ ุงูุงุฎุชูุงู ูู ุฏุฑุฌุฉ ุงูุซูุฉ ููุท.
+
+---
+
+## 9. ุญุงูุงุช ุงูุงุณุชุฎุฏุงู
ุงูู
ูููููุฐุฉ (ู
ู ุงูุชูุฑูุฑ)
+
+| Use Case | ุงููุตู | ู
ูููููุฐ |
+|----------|--------|---------|
+| UC1 | View techniques and scores for a rule | โ
`GET /results/{rule_id}` |
+| UC2 | View WQL investigation queries | โ
`GET /queries/{technique_id}` |
+| UC3 | Copy and fill investigation query | โ
ุฒุฑ Copy ูู ุงููุฑููุช |
+| UC4 | Upload Wazuh rule(s) | โ
Admin Panel |
+| UC5 | Process rule via ML pipeline | โ
`POST /rules/analyze` |
+| UC6 | Store mapped techniques in DB | โ
ุชููุงุฆู ุจุนุฏ analyze |
+| UC7 | Manage WQL templates repository | โ
`POST/PATCH /admin/templates` |
diff --git a/murshid_backend/alembic.ini b/murshid_backend/alembic.ini
new file mode 100644
index 0000000000000000000000000000000000000000..0eda848ee5b8059ac2c9504b0805ce277162ba3e
--- /dev/null
+++ b/murshid_backend/alembic.ini
@@ -0,0 +1,38 @@
+[alembic]
+script_location = alembic
+prepend_sys_path = .
+sqlalchemy.url = sqlite:///murshid.db
+
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARN
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARN
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
diff --git a/murshid_backend/alembic/env.py b/murshid_backend/alembic/env.py
new file mode 100644
index 0000000000000000000000000000000000000000..292be508a12fafc3b9c2a45726e7acf4809a2d73
--- /dev/null
+++ b/murshid_backend/alembic/env.py
@@ -0,0 +1,52 @@
+import sys
+from logging.config import fileConfig
+from pathlib import Path
+
+from sqlalchemy import engine_from_config, pool
+
+from alembic import context
+
+# make app importable
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from app.config import settings
+from app.db.base import Base
+import app.models # noqa: F401 โ registers all models with Base.metadata
+
+config = context.config
+config.set_main_option("sqlalchemy.url", settings.murshid_db_url)
+
+if config.config_file_name is not None:
+ fileConfig(config.config_file_name)
+
+target_metadata = Base.metadata
+
+
def run_migrations_offline() -> None:
    """Render migration SQL as a script without opening a DB connection.

    Alembic "offline" mode emits literal SQL statements instead of
    executing them against a live engine.
    """
    context.configure(
        url=config.get_main_option("sqlalchemy.url"),
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )
    with context.begin_transaction():
        context.run_migrations()
+
+
def run_migrations_online() -> None:
    """Run migrations against a live database connection."""
    # NullPool: migrations are a one-shot operation, so pooling is pointless.
    engine = engine_from_config(
        config.get_section(config.config_ini_section, {}),
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )
    with engine.connect() as conn:
        context.configure(connection=conn, target_metadata=target_metadata)
        with context.begin_transaction():
            context.run_migrations()
+
+
+if context.is_offline_mode():
+ run_migrations_offline()
+else:
+ run_migrations_online()
diff --git a/murshid_backend/alembic/script.py.mako b/murshid_backend/alembic/script.py.mako
new file mode 100644
index 0000000000000000000000000000000000000000..17dcba0ef89f896010374bbb3db808071268aa4c
--- /dev/null
+++ b/murshid_backend/alembic/script.py.mako
@@ -0,0 +1,25 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+revision: str = ${repr(up_revision)}
+down_revision: Union[str, None] = ${repr(down_revision)}
+branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
+depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
+
+
+def upgrade() -> None:
+ ${upgrades if upgrades else "pass"}
+
+
+def downgrade() -> None:
+ ${downgrades if downgrades else "pass"}
diff --git a/murshid_backend/alembic/versions/0001_initial_schema.py b/murshid_backend/alembic/versions/0001_initial_schema.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0bd898f311cf11808d3a093b923ea592614aaf2
--- /dev/null
+++ b/murshid_backend/alembic/versions/0001_initial_schema.py
@@ -0,0 +1,87 @@
+"""initial schema โ all 6 tables from ER Diagram ยง3.2.6
+
+Revision ID: 0001
+Revises:
+Create Date: 2026-04-08
+"""
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+from alembic import op
+
+revision: str = "0001"
+down_revision: Union[str, None] = None
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
def upgrade() -> None:
    """Create all six tables from the ER Diagram (§3.2.6).

    Tables are created parent-first so every foreign-key target already
    exists: users → mapping_jobs → rules / techniques →
    rule_technique_mappings / query_templates.
    """
    # Accounts (User entity).
    op.create_table(
        "users",
        sa.Column("user_id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("username", sa.String(100), unique=True, nullable=False),
        sa.Column("email", sa.String(255), unique=True, nullable=False),
        sa.Column("password_hash", sa.String(255), nullable=False),
        sa.Column("role", sa.String(20), nullable=False, server_default="analyst"),
    )

    # Batch upload jobs (MappingJob entity); owned by a user.
    op.create_table(
        "mapping_jobs",
        sa.Column("job_id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("user_id", sa.Integer(), sa.ForeignKey("users.user_id"), nullable=False),
        sa.Column("file_name", sa.String(255), nullable=False),
        sa.Column("rules_count", sa.Integer(), server_default="0"),
        sa.Column("status", sa.String(20), nullable=False, server_default="pending"),
        sa.Column("progress", sa.Integer(), server_default="0"),
        sa.Column("timestamp", sa.DateTime(), server_default=sa.func.now()),
    )

    # Wazuh rules (Rule entity); embedding stored as serialized text.
    op.create_table(
        "rules",
        sa.Column("rule_id", sa.String(50), primary_key=True),
        sa.Column("job_id", sa.Integer(), sa.ForeignKey("mapping_jobs.job_id"), nullable=True),
        sa.Column("embedding_vector", sa.Text(), nullable=True),
    )

    # MITRE ATT&CK techniques (Technique entity).
    op.create_table(
        "techniques",
        sa.Column("technique_id", sa.String(20), primary_key=True),
        sa.Column("technique_name", sa.String(255), nullable=False),
        sa.Column("tactic", sa.String(100), nullable=True),
    )

    # Join table: rule ↔ technique with a confidence score.
    op.create_table(
        "rule_technique_mappings",
        sa.Column("mapping_id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column("rule_id", sa.String(50), sa.ForeignKey("rules.rule_id"), nullable=False),
        sa.Column(
            "technique_id", sa.String(20), sa.ForeignKey("techniques.technique_id"), nullable=False
        ),
        sa.Column("confidence_score", sa.Float(), nullable=False),
    )
    # Index on rule_id — Use Case 6 §3.2.7 (lookup of all techniques for a rule).
    op.create_index("ix_rule_technique_rule_id", "rule_technique_mappings", ["rule_id"])

    # WQL investigation query templates per technique.
    op.create_table(
        "query_templates",
        sa.Column("template_id", sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column(
            "technique_id", sa.String(20), sa.ForeignKey("techniques.technique_id"), nullable=False
        ),
        sa.Column("purpose", sa.String(255), nullable=True),
        sa.Column("wql_query", sa.Text(), nullable=False),
        sa.Column("note", sa.Text(), nullable=True),
        sa.Column("is_active", sa.Boolean(), nullable=False, server_default="1"),
    )
+
+
def downgrade() -> None:
    """Drop all tables in reverse dependency order (children before parents)."""
    op.drop_table("query_templates")
    op.drop_index("ix_rule_technique_rule_id", table_name="rule_technique_mappings")
    op.drop_table("rule_technique_mappings")
    op.drop_table("techniques")
    op.drop_table("rules")
    op.drop_table("mapping_jobs")
    op.drop_table("users")
diff --git a/murshid_backend/app/__init__.py b/murshid_backend/app/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4873060f5df02cc130c2c7b5a1a40b8021952e0
--- /dev/null
+++ b/murshid_backend/app/__init__.py
@@ -0,0 +1 @@
+"""Murshid backend package."""
diff --git a/murshid_backend/app/api/__init__.py b/murshid_backend/app/api/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9f3b5cfaec50d63556a3347dcd266a77f433af2
--- /dev/null
+++ b/murshid_backend/app/api/__init__.py
@@ -0,0 +1 @@
+"""API layer โ FastAPI routers."""
diff --git a/murshid_backend/app/api/routes/__init__.py b/murshid_backend/app/api/routes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f36ec2fa9b2d5f19160f5c9b6b2319bc6bec36aa
--- /dev/null
+++ b/murshid_backend/app/api/routes/__init__.py
@@ -0,0 +1 @@
+"""Route modules."""
diff --git a/murshid_backend/app/api/routes/db_viewer.py b/murshid_backend/app/api/routes/db_viewer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2c1efe8c6da1bb3f5d86c28c9dd8e511c641507
--- /dev/null
+++ b/murshid_backend/app/api/routes/db_viewer.py
@@ -0,0 +1,122 @@
+"""
+GET /api/db/rules โ all rules in DB
+GET /api/db/mappings โ all rule-technique mappings
+GET /api/db/techniques โ all techniques
+GET /api/db/templates โ all query templates
+GET /api/db/summary โ counts per table
+POST /api/db/import-excel โ import WQL templates from Excel file
+"""
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+from sqlalchemy import func
+from sqlalchemy.orm import Session
+
+from app.db.session import get_db
+from app.models.mapping_job import MappingJob
+from app.models.query_template import QueryTemplate
+from app.models.rule import Rule
+from app.models.rule_technique_mapping import RuleTechniqueMapping
+from app.models.technique import Technique
+
+router = APIRouter(prefix="/api/db", tags=["db-viewer"])
+
+
+@router.get("/summary")
+def db_summary(db: Session = Depends(get_db)):
+ return {
+ "rules": db.query(func.count(Rule.rule_id)).scalar(),
+ "techniques": db.query(func.count(Technique.technique_id)).scalar(),
+ "rule_mappings": db.query(func.count(RuleTechniqueMapping.mapping_id)).scalar(),
+ "query_templates": db.query(func.count(QueryTemplate.template_id)).scalar(),
+ "mapping_jobs": db.query(func.count(MappingJob.job_id)).scalar(),
+ }
+
+
+@router.get("/rules")
+def all_rules(db: Session = Depends(get_db)):
+ rows = db.query(Rule).order_by(Rule.rule_id).all()
+ return [
+ {
+ "rule_id": r.rule_id,
+ "job_id": r.job_id,
+ "has_embedding": r.embedding_vector is not None,
+ }
+ for r in rows
+ ]
+
+
+@router.get("/mappings")
+def all_mappings(db: Session = Depends(get_db)):
+ rows = (
+ db.query(RuleTechniqueMapping)
+ .order_by(
+ RuleTechniqueMapping.rule_id,
+ RuleTechniqueMapping.confidence_score.desc(),
+ )
+ .all()
+ )
+ return [
+ {
+ "mapping_id": m.mapping_id,
+ "rule_id": m.rule_id,
+ "technique_id": m.technique_id,
+ "confidence_score": round(m.confidence_score, 4),
+ "confidence_pct": round(m.confidence_score * 100, 2),
+ }
+ for m in rows
+ ]
+
+
+@router.get("/techniques")
+def all_techniques(db: Session = Depends(get_db)):
+ rows = db.query(Technique).order_by(Technique.technique_id).all()
+ return [
+ {
+ "technique_id": t.technique_id,
+ "technique_name": t.technique_name,
+ "tactic": t.tactic,
+ }
+ for t in rows
+ ]
+
+
+@router.get("/templates")
+def all_templates(db: Session = Depends(get_db)):
+ rows = db.query(QueryTemplate).order_by(QueryTemplate.technique_id, QueryTemplate.template_id).all()
+ return [
+ {
+ "template_id": t.template_id,
+ "technique_id": t.technique_id,
+ "purpose": t.purpose,
+ "wql_query": t.wql_query,
+ "note": t.note,
+ "is_active": t.is_active,
+ }
+ for t in rows
+ ]
+
+
+@router.post("/import-excel")
+def import_excel_templates(
+ replace: bool = Query(False, description="Update existing templates if True"),
+ db: Session = Depends(get_db),
+):
+ """
+ Import WQL query templates from the Excel file:
+ murshid_query_template_structure_clean_shared.xlsx
+
+ The file is read from MURSHID_MODELS_DIR or the GP root folder.
+ Pass ?replace=true to overwrite existing templates.
+ """
+ try:
+ from scripts.import_excel_templates import run
+ result = run(db, replace=replace)
+ except FileNotFoundError as e:
+ raise HTTPException(status_code=404, detail=str(e))
+ except Exception as e:
+ raise HTTPException(status_code=500, detail=str(e))
+
+ if "error" in result:
+ raise HTTPException(status_code=404, detail=result["error"])
+
+ return result
diff --git a/murshid_backend/app/api/routes/health.py b/murshid_backend/app/api/routes/health.py
new file mode 100644
index 0000000000000000000000000000000000000000..79ce5be0efbfec6b3d34e1d0ac08f61dbeb83cf6
--- /dev/null
+++ b/murshid_backend/app/api/routes/health.py
@@ -0,0 +1,73 @@
+"""GET /health โ system readiness check with clear pipeline mode info."""
+
+from pathlib import Path
+
+from fastapi import APIRouter
+
+from app.config import settings
+from app.ml.pipeline import _store, is_ready
+
+router = APIRouter(tags=["health"])
+
# Probe for torch once at import time; OSError covers broken native installs.
try:
    import torch
except (ImportError, OSError) as _e:
    _TORCH = False
    _CUDA = False
    _TORCH_ERR = str(_e)
else:
    _TORCH = True
    _CUDA = torch.cuda.is_available()
    _TORCH_ERR = None
+
+
def _check_model_files() -> dict:
    """Map each required model artifact name to whether its file exists on disk."""
    base = Path(settings.murshid_models_dir).resolve()
    required = {
        "logreg_joblib": settings.logreg_joblib,
        "logreg_thresholds": settings.logreg_thresholds_npy,
        "label_columns": settings.label_columns_json,
    }
    return {name: (base / rel).is_file() for name, rel in required.items()}
+
+
+@router.get("/health")
+def health():
+ model_files = _check_model_files()
+ all_files_ok = all(model_files.values())
+
+ if _store.llama_model is not None:
+ mode = "full"
+ mode_desc = "LLaMA + SecureBERT+ + LogReg"
+ elif _store.embedder is not None and _store.logreg is not None:
+ mode = "local"
+ mode_desc = "SecureBERT+ + LogReg (no LLaMA โ using description as text)"
+ elif _store.logreg is not None:
+ mode = "lite"
+ mode_desc = "LogReg only (no embedder โ random vectors, testing only)"
+ else:
+ mode = "not_ready"
+ mode_desc = "No ML models loaded"
+
+ return {
+ "status": "ok",
+ "pipeline_ready": is_ready(),
+ "pipeline_mode": mode,
+ "pipeline_description": mode_desc,
+ "analyze_available": _store.logreg is not None,
+ "components": {
+ "llama_loaded": _store.llama_model is not None,
+ "embedder_loaded": _store.embedder is not None,
+ "logreg_loaded": _store.logreg is not None,
+ "torch_installed": _TORCH,
+ "cuda_available": _CUDA,
+ "torch_error": _TORCH_ERR,
+ },
+ "model_files": model_files,
+ "all_model_files_present": all_files_ok,
+ "models_dir": str(settings.murshid_models_dir.resolve()),
+ "skip_llm_env": settings.murshid_skip_llm,
+ "next_step": (
+ "POST /rules/analyze is ready!" if _store.logreg is not None
+ else "Copy .joblib and .npy files to MURSHID_MODELS_DIR and restart."
+ ),
+ }
diff --git a/murshid_backend/app/api/routes/queries.py b/murshid_backend/app/api/routes/queries.py
new file mode 100644
index 0000000000000000000000000000000000000000..949ab5bb46ec6c01fb126c18de64eebe0303371d
--- /dev/null
+++ b/murshid_backend/app/api/routes/queries.py
@@ -0,0 +1,78 @@
+"""
+GET /queries/{technique_id} โ SOC Analyst: fetch WQL templates.
+POST /admin/templates โ Admin: add new template.
+PATCH /admin/templates/{template_id} โ Admin: update / disable template.
+
+Based on:
+ Use Case 2 (View Investigation WQL Queries) โ ยง3.2.7
+ Use Case 7 (Manage static query templates) โ ยง3.2.7
+"""
+
+from fastapi import APIRouter, Depends, HTTPException
+from sqlalchemy.orm import Session
+
+from app.db.session import get_db
+from app.schemas.query import QueryTemplateIn, QueryTemplateOut, QueryTemplateUpdate
+from app.services.template_service import TemplateService
+
+router = APIRouter(tags=["queries"])
+
+
def _get_template_service(db: Session = Depends(get_db)) -> TemplateService:
    """FastAPI dependency: a TemplateService wired to the request-scoped session."""
    return TemplateService(db=db)
+
+
+# ---------------------------------------------------------------------------
+# GET /queries/{technique_id}
+# ---------------------------------------------------------------------------
+
+
+@router.get("/queries/{technique_id}", response_model=list[QueryTemplateOut])
+def get_queries(
+ technique_id: str,
+ svc: TemplateService = Depends(_get_template_service),
+):
+ """
+ Returns all active WQL templates for the given MITRE technique.
+ Use Case 2 โ ยง3.2.7
+ """
+ templates = svc.get_queries_for_technique(technique_id)
+ if not templates:
+ raise HTTPException(
+ status_code=404,
+ detail=f"No active query templates found for technique '{technique_id}'.",
+ )
+ return [QueryTemplateOut(**t) for t in templates]
+
+
+# ---------------------------------------------------------------------------
+# Admin endpoints
+# ---------------------------------------------------------------------------
+
+
+@router.post("/admin/templates", response_model=QueryTemplateOut, status_code=201)
+def add_template(
+ body: QueryTemplateIn,
+ svc: TemplateService = Depends(_get_template_service),
+):
+ """Admin: add a new WQL template. Use Case 7 โ ยง3.2.7"""
+ result = svc.add_template(
+ technique_id=body.technique_id,
+ purpose=body.purpose,
+ wql_query=body.wql_query,
+ note=body.note,
+ )
+ return QueryTemplateOut(**result)
+
+
+@router.patch("/admin/templates/{template_id}", response_model=QueryTemplateOut)
+def update_template(
+ template_id: int,
+ body: QueryTemplateUpdate,
+ svc: TemplateService = Depends(_get_template_service),
+):
+ """Admin: update or disable a WQL template. Use Case 7 โ ยง3.2.7"""
+ result = svc.update_template(template_id, body.model_dump(exclude_none=True))
+ if result is None:
+ raise HTTPException(status_code=404, detail=f"Template {template_id} not found.")
+ return QueryTemplateOut(**result)
diff --git a/murshid_backend/app/api/routes/rules.py b/murshid_backend/app/api/routes/rules.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff8b9b937761bdb0cbbb1838a257e77ce7474c89
--- /dev/null
+++ b/murshid_backend/app/api/routes/rules.py
@@ -0,0 +1,100 @@
+"""
+POST /rules/analyze โ Admin: analyze a rule, persist results.
+GET /results/{rule_id} โ SOC Analyst: retrieve stored mappings.
+
+Based on:
+ Use Case 4+5+6 (Upload, Process, Store) โ ยง3.2.7
+ Use Case 1 (View techniques and scores) โ ยง3.2.7
+"""
+
+from fastapi import APIRouter, Depends, HTTPException
+from sqlalchemy.orm import Session
+
+from app.db.session import get_db
+from app.ml.pipeline import is_ready
+from app.schemas.result import MappingResult, ResultsResponse
+from app.schemas.rule import AnalyzeRequest, AnalyzeResponse, TechniqueResult
+from app.services.ml_service import MLService
+from app.services.result_service import ResultService
+from app.services.rule_service import RuleService
+
+router = APIRouter(tags=["rules"])
+
+
def _get_rule_service(db: Session = Depends(get_db)) -> RuleService:
    """FastAPI dependency: RuleService with a fresh MLService per request."""
    return RuleService(db=db, ml=MLService())
+
+
def _get_result_service(db: Session = Depends(get_db)) -> ResultService:
    """FastAPI dependency: ResultService bound to the request DB session."""
    return ResultService(db=db)
+
+
+# ---------------------------------------------------------------------------
+# POST /rules/analyze
+# ---------------------------------------------------------------------------
+
+
+@router.post("/rules/analyze", response_model=AnalyzeResponse, status_code=201)
+def analyze_rule(
+ body: AnalyzeRequest,
+ svc: RuleService = Depends(_get_rule_service),
+):
+ """
+ Runs the full ML pipeline on the submitted Wazuh rule XML and stores
+ the results in the database.
+ """
+ if not is_ready():
+ raise HTTPException(status_code=503, detail="ML pipeline not ready.")
+
+ try:
+ result = svc.analyze_and_persist(body.rule_xml)
+ except ValueError as exc:
+ raise HTTPException(status_code=422, detail=str(exc)) from exc
+ except RuntimeError as exc:
+ raise HTTPException(status_code=503, detail=str(exc)) from exc
+ except Exception as exc:
+ raise HTTPException(status_code=500, detail=str(exc)) from exc
+
+ all_results = [TechniqueResult(**r) for r in result["results"]]
+ detected = [r for r in all_results if r.predicted]
+
+ return AnalyzeResponse(
+ rule_id=result["rule_id"],
+ sanitized_xml=result["sanitized_xml"],
+ summary=result["summary"],
+ text_for_embedding=result["text_for_embedding"],
+ embedding_dim=result["embedding_dim"],
+ pipeline_mode=result.get("pipeline_mode", "full"),
+ detected=detected,
+ all_results=all_results,
+ )
+
+
+# ---------------------------------------------------------------------------
+# GET /results/{rule_id}
+# ---------------------------------------------------------------------------
+
+
+@router.get("/results/{rule_id}", response_model=ResultsResponse)
+def get_results(
+ rule_id: str,
+ svc: ResultService = Depends(_get_result_service),
+):
+ """
+ Returns all stored MITRE ATT&CK techniques for a rule ID, sorted by confidence.
+ Use Case 1 โ ยง3.2.7
+ - mappings: ALL techniques sorted by confidence desc (for Figure 4-11 Top 5 chart)
+ - detected: primary + secondary (โฅ0.5) only (for Figure 4-12 WQL queries)
+ """
+ data = svc.get_results_for_rule(rule_id)
+ if data is None:
+ raise HTTPException(
+ status_code=404,
+ detail=f"No mapping results found for rule_id '{rule_id}'. "
+ "Run POST /rules/analyze first.",
+ )
+ return ResultsResponse(
+ rule_id=rule_id,
+ mappings=[MappingResult(**m) for m in data["mappings"]],
+ detected=[MappingResult(**m) for m in data["detected"]],
+ )
diff --git a/murshid_backend/app/api/routes/stats.py b/murshid_backend/app/api/routes/stats.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9a68f477e1cfac68d95f0ca7294a728fa745698
--- /dev/null
+++ b/murshid_backend/app/api/routes/stats.py
@@ -0,0 +1,43 @@
+"""GET /api/stats โ dashboard KPIs."""
+
+from fastapi import APIRouter, Depends
+from sqlalchemy import func
+from sqlalchemy.orm import Session
+
+from app.db.session import get_db
+from app.models.rule import Rule
+from app.models.rule_technique_mapping import RuleTechniqueMapping
+from app.models.query_template import QueryTemplate
+from app.models.technique import Technique
+
+router = APIRouter(prefix="/api", tags=["stats"])
+
+
+@router.get("/stats")
+def get_stats(db: Session = Depends(get_db)):
+ total_rules = db.query(func.count(Rule.rule_id)).scalar() or 0
+ total_mappings = db.query(func.count(RuleTechniqueMapping.mapping_id)).scalar() or 0
+ total_queries = db.query(func.count(QueryTemplate.template_id)).filter(QueryTemplate.is_active.is_(True)).scalar() or 0
+ total_techniques = db.query(func.count(Technique.technique_id)).scalar() or 0
+
+ technique_freq = (
+ db.query(
+ RuleTechniqueMapping.technique_id,
+ func.count(RuleTechniqueMapping.mapping_id).label("count"),
+ )
+ .group_by(RuleTechniqueMapping.technique_id)
+ .order_by(func.count(RuleTechniqueMapping.mapping_id).desc())
+ .limit(10)
+ .all()
+ )
+
+ return {
+ "total_rules_mapped": total_rules,
+ "total_techniques": total_techniques,
+ "total_mappings": total_mappings,
+ "total_queries": total_queries,
+ "technique_frequency": [
+ {"technique_id": t.technique_id, "count": t.count}
+ for t in technique_freq
+ ],
+ }
diff --git a/murshid_backend/app/config.py b/murshid_backend/app/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1410fe100ad19fb332b99acdb74b5c9ac6c8061
--- /dev/null
+++ b/murshid_backend/app/config.py
@@ -0,0 +1,29 @@
+from pathlib import Path
+
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+_GP_ROOT = Path(__file__).resolve().parent.parent.parent
+
+
+class Settings(BaseSettings):
+    """Application settings; each field maps to an environment variable
+    (upper-cased field name, e.g. MURSHID_DB_URL) or a `.env` entry."""
+
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        env_file_encoding="utf-8",
+        extra="ignore",  # silently drop unknown env keys
+    )
+
+    murshid_db_url: str = "mysql+pymysql://root:password@localhost:3306/murshid_db"
+    murshid_models_dir: Path = _GP_ROOT / "Needed"  # directory holding .joblib/.npy/.json artifacts
+    hf_token: str | None = None  # Hugging Face token for gated models (LLaMA)
+    murshid_skip_llm: bool = False  # True => skip LLaMA and run the LOCAL pipeline mode
+    secret_key: str = "change_me"  # NOTE(review): insecure default - must be overridden in production
+
+    llama_model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct"
+    embed_model_id: str = "ehsanaghaei/SecureBERT_Plus"
+
+    # Filenames (relative to murshid_models_dir) of the trained LogReg artifacts.
+    logreg_joblib: str = "murshid_logreg_pipeline_manual_oof_pcatuned.joblib"
+    logreg_thresholds_npy: str = "murshid_logreg_thresholds_manual_oof_pcatuned.npy"
+    label_columns_json: str = "murshid_label_columns.json"
+
+
+settings = Settings()
diff --git a/murshid_backend/app/db/__init__.py b/murshid_backend/app/db/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e10b1a368332298776a47106feabeadf5f013cc
--- /dev/null
+++ b/murshid_backend/app/db/__init__.py
@@ -0,0 +1 @@
+"""Database layer."""
diff --git a/murshid_backend/app/db/base.py b/murshid_backend/app/db/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa2b68a5d245bbdde7fbea6b86c9650a584167d6
--- /dev/null
+++ b/murshid_backend/app/db/base.py
@@ -0,0 +1,5 @@
+from sqlalchemy.orm import DeclarativeBase
+
+
+class Base(DeclarativeBase):
+    """Project-wide SQLAlchemy 2.0 declarative base; all ORM models inherit it."""
+    pass
diff --git a/murshid_backend/app/db/session.py b/murshid_backend/app/db/session.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e4d24961db1aa125a645572b8244708ae3ae779
--- /dev/null
+++ b/murshid_backend/app/db/session.py
@@ -0,0 +1,25 @@
+from collections.abc import Generator
+
+from sqlalchemy import create_engine
+from sqlalchemy.orm import Session, sessionmaker
+
+from app.config import settings
+
+# SQLite needs check_same_thread=False (FastAPI may use the session from a
+# worker thread) and has no server connection to recycle; for MySQL we enable
+# pre-ping and hourly recycling to survive server-side idle disconnects.
+_is_sqlite = settings.murshid_db_url.startswith("sqlite")
+
+engine = create_engine(
+    settings.murshid_db_url,
+    connect_args={"check_same_thread": False} if _is_sqlite else {},
+    pool_pre_ping=not _is_sqlite,
+    pool_recycle=3600 if not _is_sqlite else -1,  # -1 disables recycling
+)
+
+SessionLocal = sessionmaker(bind=engine, autocommit=False, autoflush=False)
+
+
+def get_db() -> Generator[Session, None, None]:
+ db = SessionLocal()
+ try:
+ yield db
+ finally:
+ db.close()
diff --git a/murshid_backend/app/main.py b/murshid_backend/app/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..ece73d45e892818c0afa726e8f1e93597b1a097b
--- /dev/null
+++ b/murshid_backend/app/main.py
@@ -0,0 +1,60 @@
+"""
+Murshid Backend โ FastAPI entrypoint.
+
+Architecture:
+ API Layer โ app/api/routes/
+ Service Layerโ app/services/
+ ML Layer โ app/ml/
+ Repository โ app/repositories/
+ Database โ app/db/ (SQLAlchemy + Alembic, MySQL)
+"""
+
+from __future__ import annotations
+
+from contextlib import asynccontextmanager
+
+from pathlib import Path
+
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+
+from app.api.routes import db_viewer, health, queries, rules, stats
+from app.ml.pipeline import load_models, unload_models
+
+_FRONTEND_DIR = Path(__file__).resolve().parent.parent.parent / "murshid_frontend"
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Load the ML models once at startup; release them on shutdown."""
+    load_models()
+    yield
+    unload_models()
+
+
+app = FastAPI(
+    title="Murshid API",
+    description=(
+        "MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts. "
+        "Transforms Wazuh IDS rules into actionable threat intelligence."
+    ),
+    version="1.0.0",
+    lifespan=lifespan,  # loads/unloads ML models around the app's lifetime
+)
+
+# NOTE(review): wildcard origins combined with allow_credentials=True is
+# rejected by browsers per the CORS spec - restrict origins for production.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# API routers are registered before the static mount so they take precedence.
+app.include_router(health.router)
+app.include_router(stats.router)
+app.include_router(db_viewer.router)
+app.include_router(rules.router)
+app.include_router(queries.router)
+
+# Serve the frontend (if present) at "/" for any path no router matched.
+if _FRONTEND_DIR.is_dir():
+    app.mount("/", StaticFiles(directory=str(_FRONTEND_DIR), html=True), name="frontend")
diff --git a/murshid_backend/app/ml/__init__.py b/murshid_backend/app/ml/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e732fa909b6b716cd3232cbc29aeb52f6b51a7e
--- /dev/null
+++ b/murshid_backend/app/ml/__init__.py
@@ -0,0 +1 @@
+"""ML layer โ logic extracted from MurshidUIPipeline.ipynb without modifying the original."""
diff --git a/murshid_backend/app/ml/embedder.py b/murshid_backend/app/ml/embedder.py
new file mode 100644
index 0000000000000000000000000000000000000000..593f54c21dca923841e8acfad01588177ae60fc9
--- /dev/null
+++ b/murshid_backend/app/ml/embedder.py
@@ -0,0 +1,116 @@
+"""
+SecureBERT+ embedder โ extracted from MurshidUIPipeline.ipynb (cell 15).
+Produces a 768-dim float32 embedding for a text paragraph.
+Also provides build_text_for_embedding (cell 12).
+Original file is NOT modified.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+from lxml import etree
+
+try:
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+ _TORCH_OK = True
+except (ImportError, OSError):
+ _TORCH_OK = False
+
+from app.config import settings
+
+
+def _norm_spaces(s: str) -> str:
+ return " ".join((s or "").split()).strip()
+
+
+def _strip_end_punct(s: str) -> str:
+ return (s or "").rstrip(". ").strip()
+
+
+def build_text_for_embedding(clean_rule: str, summary: str) -> str:
+ """Combine LLM summary with rule description โ cell 12 of notebook."""
+ rule_elem = etree.fromstring(clean_rule.strip())
+ raw_desc = rule_elem.findtext("description") or ""
+ description = _norm_spaces(raw_desc)
+ summary = _norm_spaces(summary)
+ description = _norm_spaces(description)
+
+ if not summary and not description:
+ return ""
+ if summary and not description:
+ return summary
+ if description and not summary:
+ return description
+
+ s0 = _strip_end_punct(summary).lower()
+ d0 = _strip_end_punct(description).lower()
+
+ if s0 == d0:
+ return _strip_end_punct(summary) + "."
+ return f"{_strip_end_punct(summary)}. {_strip_end_punct(description)}."
+
+
+class SecureBERTEmbedder:
+ """Mean-pooling embedder using ehsanaghaei/SecureBERT_Plus โ cell 15."""
+
+ MAX_LEN = 512
+ BATCH_CHUNKS = 8
+
+ def __init__(self, model_id: str | None = None, device: str | None = None):
+ if not _TORCH_OK:
+ raise RuntimeError("torch/transformers not available โ SecureBERTEmbedder cannot be initialised.")
+ mid = model_id or settings.embed_model_id
+ self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.benchmark = False
+ self.tokenizer = AutoTokenizer.from_pretrained(mid, use_fast=True)
+ self.model = AutoModel.from_pretrained(mid).to(self.device)
+ self.model.eval()
+ self.cls_id = self.tokenizer.cls_token_id
+ self.sep_id = self.tokenizer.sep_token_id
+ self.pad_id = (
+ self.tokenizer.pad_token_id
+ if self.tokenizer.pad_token_id is not None
+ else self.sep_id
+ )
+
+ def _chunk_text(self, text: str) -> list[list[int]]:
+ token_ids = self.tokenizer.encode(text, add_special_tokens=False)
+ chunk_size = self.MAX_LEN - 2
+ chunks = []
+ for i in range(0, len(token_ids), chunk_size):
+ piece = token_ids[i : i + chunk_size]
+ chunks.append([self.cls_id] + piece + [self.sep_id])
+ return chunks
+
+ def embed_text(self, text: str) -> np.ndarray:
+ chunks = self._chunk_text(text)
+ all_embs: list[np.ndarray] = []
+
+ for i in range(0, len(chunks), self.BATCH_CHUNKS):
+ batch = chunks[i : i + self.BATCH_CHUNKS]
+ max_len = max(len(x) for x in batch)
+ input_ids, masks = [], []
+ for x in batch:
+ pad = max_len - len(x)
+ input_ids.append(x + [self.pad_id] * pad)
+ masks.append([1] * len(x) + [0] * pad)
+
+ ids_t = torch.tensor(input_ids).to(self.device)
+ mask_t = torch.tensor(masks).to(self.device)
+
+ with torch.no_grad():
+ out = self.model(input_ids=ids_t, attention_mask=mask_t)
+ tok_emb = out.last_hidden_state
+ mask_exp = mask_t.unsqueeze(-1).expand(tok_emb.size()).float()
+ summed = torch.sum(tok_emb * mask_exp, dim=1)
+ denom = torch.clamp(mask_exp.sum(dim=1), min=1e-9)
+ mean_pooled = summed / denom
+
+ all_embs.append(mean_pooled.cpu().numpy())
+
+ all_embs_np = np.vstack(all_embs)
+ para_emb = all_embs_np.mean(axis=0)
+ para_emb /= np.linalg.norm(para_emb) + 1e-12
+ return para_emb.astype(np.float32)
diff --git a/murshid_backend/app/ml/logistic_model.py b/murshid_backend/app/ml/logistic_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc366242bf8f7cb7ce3021365fcba7117376b9ef
--- /dev/null
+++ b/murshid_backend/app/ml/logistic_model.py
@@ -0,0 +1,111 @@
+"""
+Logistic Regression - PRIMARY model per user decision.
+
+Inference logic extracted VERBATIM from MurshidUIPipeline.ipynb (cell 18-19):
+
+ logreg_model = joblib.load(f"{BASE_PATH}/murshid_logreg_pipeline_manual_oof_pcatuned.joblib")
+ logreg_thr = np.load(f"{BASE_PATH}/murshid_logreg_thresholds_manual_oof_pcatuned.npy")
+
+ proba = logreg_model.predict_proba(X_user)
+
+ if isinstance(proba, list):
+ proba = np.column_stack([p[:, 1] for p in proba])
+ elif proba.ndim == 3:
+ proba = proba[:, :, 1]
+
+ proba = proba.reshape(-1)
+
+ pred_logreg = (proba >= logreg_thr).astype(int)
+ conf_logreg = proba * 100
+ gap_logreg = proba - logreg_thr
+
+Original notebook file is NOT modified.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import joblib
+import numpy as np
+
+from app.config import settings
+
+
+class LogisticRegressionModel:
+    """
+    Wraps the trained Logistic Regression pipeline + per-label thresholds.
+    File structure (from notebook cell 18):
+        logreg_model -> sklearn Pipeline (PCA-tuned + OneVsRestClassifier(LogReg))
+        logreg_thr   -> np.ndarray shape (n_techniques,) per-label thresholds
+    """
+
+    def __init__(self, models_dir: Path | None = None) -> None:
+        # Resolve artifact paths under the configured models directory.
+        base = Path(models_dir or settings.murshid_models_dir).resolve()
+
+        logreg_path = base / settings.logreg_joblib
+        thr_path = base / settings.logreg_thresholds_npy
+        labels_path = base / settings.label_columns_json
+
+        # Fail fast with a clear message when any artifact is missing.
+        for p in (logreg_path, thr_path, labels_path):
+            if not p.is_file():
+                raise FileNotFoundError(f"Missing model file: {p}")
+
+        # --- notebook cell 18: load model + thresholds ---
+        self._model = joblib.load(logreg_path)          # logreg_model
+        self._thr = np.load(thr_path)                   # logreg_thr
+
+        with open(labels_path, encoding="utf-8") as f:
+            self.technique_names: list[str] = json.load(f)
+
+        # Exactly one threshold per technique label, else artifacts are mismatched.
+        n = len(self.technique_names)
+        if self._thr.shape[0] != n:
+            raise ValueError(
+                f"LogReg thresholds length {self._thr.shape[0]} != {n} labels"
+            )
+
+    # ------------------------------------------------------------------
+
+    def predict(self, embedding_1d: np.ndarray) -> list[dict]:
+        """
+        Run LogReg inference exactly as in notebook cell 19.
+
+        Returns list of dicts sorted by (predicted, confidence_percent) desc:
+            technique_id, predicted, confidence_percent, proba, threshold, gap
+        """
+        X_user = embedding_1d.reshape(1, -1)
+
+        # --- verbatim from notebook cell 19 ---
+        proba = self._model.predict_proba(X_user)
+
+        # predict_proba output shape varies by estimator wrapper; normalize to
+        # a flat vector of positive-class probabilities.
+        if isinstance(proba, list):
+            proba = np.column_stack([p[:, 1] for p in proba])
+        elif proba.ndim == 3:
+            proba = proba[:, :, 1]
+
+        proba = proba.reshape(-1)
+
+        pred_logreg = (proba >= self._thr).astype(int)  # per-label tuned thresholds
+        conf_logreg = proba * 100
+        gap_logreg = proba - self._thr
+        # --- end verbatim ---
+
+        results = [
+            {
+                "technique_id": self.technique_names[i],
+                "predicted": bool(pred_logreg[i]),
+                "confidence_percent": round(float(conf_logreg[i]), 2),
+                "proba": round(float(proba[i]), 4),
+                "threshold": round(float(self._thr[i]), 4),
+                "gap": round(float(gap_logreg[i]), 4),
+            }
+            for i in range(len(self.technique_names))
+        ]
+
+        # sort: predicted first, then by confidence desc (notebook sort logic)
+        return sorted(
+            results,
+            key=lambda r: (r["predicted"], r["confidence_percent"]),
+            reverse=True,
+        )
diff --git a/murshid_backend/app/ml/pipeline.py b/murshid_backend/app/ml/pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..20d4a99961633e98a851581040febd56025b3747
--- /dev/null
+++ b/murshid_backend/app/ml/pipeline.py
@@ -0,0 +1,225 @@
+"""
+Full inference pipeline โ combines sanitizer โ summarizer โ embedder โ logistic_model.
+Exposes analyze_rule(rule_xml) -> dict as the single callable for the service layer.
+
+Modes:
+ FULL : LLaMA available + SecureBERT+ + LogReg (GPU/Colab required)
+ LOCAL : MURSHID_SKIP_LLM=true + SecureBERT+ + LogReg
+ โ skips LLaMA; uses field as the paragraph text.
+ This allows POST /rules/analyze to work locally without a GPU.
+ LITE : torch not installed โ uses a trivial bag-of-words fake embedding (testing only)
+"""
+
+from __future__ import annotations
+
+import xml.etree.ElementTree as ET
+from dataclasses import dataclass
+from typing import Any
+
+import numpy as np
+
+from app.config import settings
+from app.ml.logistic_model import LogisticRegressionModel
+from app.ml.sanitizer import sanitize_rule_from_string
+
+try:
+ import torch
+ from huggingface_hub import login as hf_login
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ from app.ml.embedder import SecureBERTEmbedder, build_text_for_embedding
+ from app.ml.summarizer import summarize_one_rule
+ _TORCH_AVAILABLE = True
+ _TORCH_ERROR: str | None = None
+except (ImportError, OSError) as _e:
+ _TORCH_AVAILABLE = False
+ _TORCH_ERROR = str(_e)
+
+
+# ---------------------------------------------------------------------------
+# Singleton container (loaded once at startup)
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class _ModelStore:
+    # Process-wide singleton holding the loaded models (populated by load_models()).
+    # Annotations are lazy (`from __future__ import annotations`), so naming
+    # SecureBERTEmbedder here is safe even when the torch import failed.
+    llama_model: Any | None = None      # quantized LLaMA for summarization (FULL mode)
+    llama_tokenizer: Any | None = None
+    llama_device: str = "cpu"
+    embedder: SecureBERTEmbedder | None = None      # SecureBERT+ mean-pooling embedder
+    logreg: LogisticRegressionModel | None = None   # multi-label classifier (required)
+    ready: bool = False                 # True once load_models() has completed
+
+
+_store = _ModelStore()
+
+
+def load_models() -> None:
+    """
+    Load all models into _store.
+    Call once at FastAPI startup (lifespan).
+
+    Every component is best-effort: failures are downgraded to console
+    warnings and the corresponding _store field stays None, so the API can
+    still run in a degraded (local/lite) mode. _store.ready is always set.
+    """
+    # Authenticate to Hugging Face only when a token is configured
+    # (required for gated models such as LLaMA-3).
+    if _TORCH_AVAILABLE and settings.hf_token:
+        hf_login(token=settings.hf_token, add_to_git_credential=False)
+
+    if not settings.murshid_skip_llm:
+        if not _TORCH_AVAILABLE:
+            print("[Murshid] WARNING: torch not installed โ skipping LLM load.")
+        else:
+            # 4-bit NF4 quantization so the 8B model fits on a single GPU.
+            bnb_cfg = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.float16,
+            )
+            tok = AutoTokenizer.from_pretrained(settings.llama_model_id, use_fast=True)
+            if tok.pad_token is None:
+                tok.pad_token = tok.eos_token  # LLaMA ships without a pad token
+            m = AutoModelForCausalLM.from_pretrained(
+                settings.llama_model_id,
+                quantization_config=bnb_cfg,
+                device_map="auto",
+                low_cpu_mem_usage=True,
+                # NOTE(review): "dtype" is the newer spelling of "torch_dtype" -
+                # confirm the pinned transformers version supports it.
+                dtype=torch.float16,
+            )
+            m.config.pad_token_id = tok.pad_token_id
+            m.eval()
+            _store.llama_tokenizer = tok
+            _store.llama_model = m
+            _store.llama_device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    if _TORCH_AVAILABLE:
+        try:
+            _store.embedder = SecureBERTEmbedder()
+        except Exception as exc:
+            print(f"[Murshid] WARNING: SecureBERT+ not loaded โ {exc}")
+            _store.embedder = None
+    else:
+        print("[Murshid] WARNING: torch not installed โ embedder skipped.")
+        _store.embedder = None
+
+    # The LogReg classifier is mandatory for analyze_rule(); a missing file is
+    # tolerated here but reported as a RuntimeError at request time.
+    try:
+        _store.logreg = LogisticRegressionModel()
+    except FileNotFoundError as exc:
+        print(f"[Murshid] WARNING: LogReg model files missing โ {exc}")
+        _store.logreg = None
+    except Exception as exc:
+        print(f"[Murshid] WARNING: LogReg not loaded โ {exc}")
+        _store.logreg = None
+
+    _store.ready = True
+
+
+def unload_models() -> None:
+ _store.llama_model = None
+ _store.llama_tokenizer = None
+ _store.embedder = None
+ _store.logreg = None
+ _store.ready = False
+
+
+def is_ready() -> bool:
+    """Return True once load_models() has completed (even in degraded mode)."""
+    return _store.ready
+
+
+# ---------------------------------------------------------------------------
+# Public function
+# ---------------------------------------------------------------------------
+
+
+def _extract_description(clean_xml: str) -> str:
+ """Extract text from sanitized rule XML."""
+ try:
+ elem = ET.fromstring(clean_xml.strip())
+ desc = elem.findtext("description") or ""
+ return " ".join(desc.split()).strip()
+ except ET.ParseError:
+ return ""
+
+
+def analyze_rule(rule_xml: str) -> dict:
+ """
+ Full pipeline: XML โ sanitize โ summarize โ embed โ LogReg โ ranked results.
+
+ Operates in three modes depending on environment:
+
+ FULL mode (MURSHID_SKIP_LLM=false, GPU available):
+ LLaMA generates a natural-language summary โ SecureBERT+ embeds it โ LogReg predicts.
+
+ LOCAL mode (MURSHID_SKIP_LLM=true, torch installed):
+ Skips LLaMA. Uses the rule's field directly as the text.
+ SecureBERT+ still embeds it properly โ LogReg predicts.
+ โ ๏ธ Accuracy slightly lower than FULL mode (no LLaMA enrichment).
+
+ LITE mode (torch not installed):
+ Uses a random unit-vector as a placeholder embedding.
+ Results are meaningless โ for structural testing only.
+
+ Returns:
+ {
+ "sanitized_xml": str,
+ "summary": str, # LLaMA output OR description OR "(lite mode)"
+ "text_for_embedding": str,
+ "embedding_dim": int,
+ "pipeline_mode": str, # "full" | "local" | "lite"
+ "results": [...], # all techniques sorted by confidence desc
+ "detected": [...], # predicted == True only
+ }
+ """
+ if not _store.ready:
+ raise RuntimeError("Models not loaded. Call load_models() first.")
+
+ if "" not in rule_xml:
+ raise ValueError("Incomplete XML: must contain and .")
+
+ if _store.logreg is None:
+ raise RuntimeError(
+ "LogReg model not loaded. "
+ "Copy the .joblib and .npy files to MURSHID_MODELS_DIR and restart."
+ )
+
+ clean_xml = sanitize_rule_from_string(rule_xml)
+
+ # โโ Choose mode โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ if _store.llama_model is not None and _store.llama_tokenizer is not None:
+ # FULL mode: LLaMA summary
+ mode = "full"
+ summary = summarize_one_rule(
+ clean_xml,
+ _store.llama_model,
+ _store.llama_tokenizer,
+ _store.llama_device,
+ )
+ text = build_text_for_embedding(clean_xml, summary)
+ embedding: np.ndarray = _store.embedder.embed_text(text)
+
+ elif _store.embedder is not None:
+ # LOCAL mode: no LLaMA, use as text
+ mode = "local"
+ desc = _extract_description(clean_xml)
+ summary = desc or "No description available."
+ text = desc or clean_xml[:300]
+ embedding = _store.embedder.embed_text(text)
+
+ else:
+ # LITE mode: torch not available, random unit-vector (structural test only)
+ mode = "lite"
+ desc = _extract_description(clean_xml)
+ summary = f"(lite mode โ no embedder) {desc}"
+ text = desc or clean_xml[:300]
+ dim = 768
+ raw = np.random.default_rng(abs(hash(text)) % (2**32)).random(dim).astype(np.float32)
+ embedding = raw / (np.linalg.norm(raw) + 1e-12)
+
+ # โโ Classify โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ all_results = _store.logreg.predict(embedding)
+ detected = [r for r in all_results if r["predicted"]]
+
+ return {
+ "sanitized_xml": clean_xml,
+ "summary": summary,
+ "text_for_embedding": text,
+ "embedding_dim": int(embedding.shape[0]),
+ "pipeline_mode": mode,
+ "results": all_results,
+ "detected": detected,
+ }
diff --git a/murshid_backend/app/ml/sanitizer.py b/murshid_backend/app/ml/sanitizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a039fc45c2f90b171037db3046796d26a483c5d7
--- /dev/null
+++ b/murshid_backend/app/ml/sanitizer.py
@@ -0,0 +1,32 @@
+"""
+Rule sanitizer โ extracted from MurshidUIPipeline.ipynb (cell 10).
+Removes: mitre, if_sid, group, if_group tags from Wazuh XML rule.
+Original file is NOT modified.
+"""
+
+from __future__ import annotations
+
+import copy
+import xml.etree.ElementTree as ET
+
+REMOVE_TAGS_ANYWHERE: set[str] = {"mitre", "if_sid", "group", "if_group"}
+
+
+def _remove_tag_anywhere(root_elem: ET.Element, tag: str) -> None:
+ for parent in list(root_elem.iter()):
+ for child in list(parent):
+ if child.tag == tag:
+ parent.remove(child)
+
+
+def sanitize_rule(rule_elem: ET.Element) -> ET.Element:
+ r = copy.deepcopy(rule_elem)
+ for tag in REMOVE_TAGS_ANYWHERE:
+ _remove_tag_anywhere(r, tag)
+ return r
+
+
+def sanitize_rule_from_string(rule_xml: str) -> str:
+ rule_elem = ET.fromstring(rule_xml.strip())
+ sanitized = sanitize_rule(rule_elem)
+ return ET.tostring(sanitized, encoding="unicode")
diff --git a/murshid_backend/app/ml/summarizer.py b/murshid_backend/app/ml/summarizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5171fa955d02b36a07c01dbc37a37d8ee9b49f9
--- /dev/null
+++ b/murshid_backend/app/ml/summarizer.py
@@ -0,0 +1,262 @@
+"""
+LLM summarizer โ extracted from MurshidUIPipeline.ipynb (cells 11-12).
+Converts sanitized Wazuh XML rule to a one-sentence behavior summary.
+Original file is NOT modified.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import unicodedata
+
+import torch
+
+# --------------------------------------------------------------------------
+# Constants (identical to notebook)
+# --------------------------------------------------------------------------
+MAX_INPUT_TOKENS = 2048  # prompt truncation limit passed to the tokenizer
+MAX_NEW_TOKENS = 160     # generation budget for the one-sentence JSON answer
+DO_SAMPLE = False        # deterministic decoding (beam search, no sampling)
+NUM_BEAMS = 4            # beam search width
+MAX_RETRIES = 3          # generation attempts before _rescue_finalize fallback
+
+SYSTEM_INSTR = (
+ "You are a cybersecurity expert.\n"
+ "You will be provided with a Wazuh rule in XML format.\n"
+ "Write EXACTLY ONE sentence describing the observable event pattern the rule matches.\n\n"
+ "HARD CONSTRAINTS:\n"
+ '1) Output must be minified JSON only: {"summary":"..."}\n'
+ "2) ONE sentence only.\n"
+ "3) Start with one of: Detects, Monitors, Identifies, Flags, Reports, Tracks, Captures.\n"
+ "4) Use ONLY facts present in the XML. Describe the observable system event only.\n"
+ "5) Do NOT infer attacker intent, attack type, or technique.\n"
+ "6) Do NOT mention MITRE, ATT&CK, or attack technique names unless explicitly present in the XML.\n"
+ "7) Do NOT use speculative language: likely, potentially, possible, possibly, may indicate, or could indicate.\n"
+ "8) Length: 7 to 18 words.\n"
+ "9) SHOULD include a clear event type when possible.\n"
+ "10) Mention at least ONE concrete indicator if available (event_id, process name, file path,\n"
+ " registry key, service, protocol/port, URL pattern, command, username, IP).\n"
+ "If only a single indicator exists, still produce a complete behavior-focused sentence.\n"
+)
+
+REPAIR_HINT = (
+ "Your previous output was rejected.\n"
+ "Fix it to satisfy ALL constraints:\n"
+ '- Output MUST be minified JSON only: {"summary":"..."}\n'
+ "- One sentence only.\n"
+ "- Keep it behavior-focused.\n"
+ "- Include at least ONE concrete indicator if present in the XML.\n"
+ "- Do NOT add any extra text outside JSON.\n"
+)
+
+VERB_OK = ("Detects", "Monitors", "Identifies", "Flags", "Reports", "Tracks", "Captures")
+JSON_OBJ_RE = re.compile(r"\{.*?\}", re.DOTALL)
+BAD_INTRO_RE = re.compile(
+ r"^\s*(this\s+(wazuh\s+)?rule|the\s+rule|this\s+alert)\b", re.IGNORECASE
+)
+BAD_INTENT_RE = re.compile(r"\b(likely|potentially|possible|maybe)\b", re.IGNORECASE)
+GENERIC_RE = re.compile(
+ r"\b(detects activity|detects suspicious activity|detects potentially suspicious activity|"
+ r"monitors activity|reports activity|detects an event pattern defined by the rule indicators)\b",
+ re.IGNORECASE,
+)
+
+
+# --------------------------------------------------------------------------
+# Helpers (identical to notebook)
+# --------------------------------------------------------------------------
+
+def _build_prompt(rule_xml: str, tokenizer, extra_hint: str = "") -> str:
+ sys = SYSTEM_INSTR + (("\n" + extra_hint) if extra_hint else "")
+ user = f"Wazuh rule XML:\n{rule_xml}\n\nReturn JSON only:"
+ messages = [{"role": "system", "content": sys}, {"role": "user", "content": user}]
+ return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+
+def _looks_broken_encoding(s: str) -> bool:
+    # Heuristic mojibake detector (UTF-8 bytes mis-decoded via a legacy codepage).
+    # NOTE(review): the marker literals themselves appear corrupted in this file
+    # (they were probably "A-tilde"/"a-circumflex" sequences originally) -
+    # verify against the source notebook before trusting this filter.
+    return any(m in s for m in ("ร", "ร", "ร", "รข", "รข")) if s else False
+
+
+def _try_extract_json_summary(text: str) -> str | None:
+    """Extract the "summary" string from model output; None when unparseable.
+
+    Fallback order: whole-string json.loads -> first {...} blob via json.loads
+    -> regex scrape of the "summary" value inside that blob.
+    """
+    t = (text or "").strip()
+    if not t:
+        return None
+    if t.startswith("{") and '"summary"' in t:
+        try:
+            obj = json.loads(t)
+            if isinstance(obj, dict) and isinstance(obj.get("summary"), str):
+                return obj["summary"].strip()
+        except Exception:
+            pass  # fall through to the embedded-object search below
+    m = JSON_OBJ_RE.search(t)
+    if m and '"summary"' in m.group(0):
+        blob = m.group(0)
+        try:
+            obj = json.loads(blob)
+            if isinstance(obj, dict) and isinstance(obj.get("summary"), str):
+                return obj["summary"].strip()
+        except Exception:
+            # Malformed JSON: scrape the value with a regex as a last resort.
+            m2 = re.search(r'"summary"\s*:\s*"([^"]+)"', blob)
+            if m2:
+                return m2.group(1).strip()
+    return None
+
+
+def _normalize_one_sentence(s: str) -> str:
+    """Canonicalize a candidate summary: single spaces, NFKC, an approved
+    leading verb, and exactly one sentence ending in punctuation.
+    Returns "" when the candidate cannot be salvaged."""
+    s = re.sub(r"\s+", " ", (s or "").strip()).strip()
+    s = unicodedata.normalize("NFKC", s)
+    if not s:
+        return ""
+    # Strip boilerplate intros like "This rule ..." / "The rule ...".
+    if BAD_INTRO_RE.match(s):
+        s = BAD_INTRO_RE.sub("", s).lstrip(":,- ").strip()
+        if not s:
+            return ""
+    if not any(s.startswith(v) for v in VERB_OK):
+        # NB: the conditional spans the whole concatenation, so a 0/1-char
+        # string collapses to "".
+        s = "Detects " + (s[0].lower() + s[1:]) if len(s) > 1 else ""
+        if not s:
+            return ""
+    # Keep only the first sentence; append a period when none was found.
+    m = re.search(r"[.!?](?:\s|$)", s)
+    s = s[: m.end()].strip() if m else s + "."
+    s = re.sub(r"^(Detects\s+)+", "Detects ", s).strip()  # collapse "Detects Detects"
+    return re.sub(r"\s+", " ", s).strip()
+
+
+def _looks_truncated(s: str) -> bool:
+ return not s or s.strip().endswith(("(", ":", " -", ","))
+
+
+def _has_behavior_signal(s: str) -> bool:
+ kws = ["create","delete","execute","spawn","launch","login","logon","authentication",
+ "connect","request","query","modify","registry","process","command","file",
+ "service","ip","url","dns","http","vpn","account"]
+ return any(k in s.lower() for k in kws)
+
+
+def _has_indicator_signal(s: str) -> bool:
+ kws = [".exe",".dll",".ps1",".bat",".cmd","powershell","cmd.exe","reg.exe","rundll32",
+ "svchost","registry","temp","system32","event_id","http","dns","ip","url","port","key"]
+ return any(k in s.lower() for k in kws)
+
+
+def _is_bad(s: str) -> bool:
+    """Strict acceptance filter; True means the candidate must be retried."""
+    # Reject boilerplate intros, speculative wording, and generic filler.
+    if not s or BAD_INTRO_RE.match(s) or BAD_INTENT_RE.search(s) or GENERIC_RE.search(s):
+        return True
+    if _looks_broken_encoding(s) or _looks_truncated(s):
+        return True
+    # Enforce the 7-18 word budget and require a concrete behavior keyword.
+    wc = len(s.split())
+    if wc < 7 or wc > 18 or not _has_behavior_signal(s):
+        return True
+    # A leftover JSON wrapper (braces / "summary" key) also disqualifies.
+    return bool((s.startswith("{") and "summary" in s) or ('"summary"' in s and "{" in s))
+
+
+def _is_catastrophic(s: str) -> bool:
+ return not s or _looks_broken_encoding(s) or _looks_truncated(s) or len(s.split()) < 3
+
+
+def _score(s: str) -> int:
+ wc = len(s.split())
+ return (
+ (3 if 7 <= wc <= 18 else 0)
+ + (3 if _has_behavior_signal(s) else 0)
+ + (2 if _has_indicator_signal(s) else 0)
+ + (1 if not GENERIC_RE.search(s) else 0)
+ + (1 if not BAD_INTENT_RE.search(s) else 0)
+ )
+
+
+def _rescue_finalize(s: str) -> str:
+    """Last-resort repair: always return a constraint-shaped sentence
+    (approved verb, 7-18 words, ends with a period)."""
+    s = _normalize_one_sentence(s)
+    if not s:
+        return "Detects rule-matched behavior."
+    # Cut speculative tails and words outright (constraint 7 of SYSTEM_INSTR).
+    s = re.sub(r",\s*(possibly|potentially|maybe|may)\b.*$", "", s, flags=re.IGNORECASE).strip()
+    s = re.sub(r"\b(possibly|potentially|maybe|may)\b", "", s, flags=re.IGNORECASE)
+    s = re.sub(r"\s+", " ", s).strip()
+    if len(s.split()) < 7:
+        # Too short: substitute a canned sentence for a known indicator,
+        # otherwise pad with a generic tail.
+        low = s.lower()
+        for kw, rep in [
+            ("powershell", "Detects powershell.exe process execution."),
+            ("cmd", "Detects cmd.exe process execution."),
+            ("reg", "Detects reg.exe process execution."),
+            ("svchost", "Detects svchost.exe process execution."),
+        ]:
+            if kw in low:
+                s = rep
+                break
+        else:
+            s = s.rstrip(".") + " matching rule indicators."
+    if _looks_truncated(s):
+        s = s.rstrip(".") + " matching rule indicators."
+    if not any(s.startswith(v) for v in VERB_OK):
+        s = "Detects " + s[0].lower() + s[1:] if len(s) > 1 else "Detects rule-matched behavior."
+    # Hard-cap at 18 words, then guarantee a trailing period.
+    words = s.split()
+    if len(words) > 18:
+        s = " ".join(words[:18]).rstrip(".") + "."
+    return re.sub(r"\s+", " ", s if s.endswith(".") else s + ".").strip()
+
+
+# --------------------------------------------------------------------------
+# Public API
+# --------------------------------------------------------------------------
+
+def summarize_one_rule(rule_xml: str, model, tokenizer, device: str | None = None) -> str:
+ """Generate a one-sentence summary for a sanitized Wazuh rule XML string."""
+ if device is None:
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id
+ eos_id = tokenizer.eos_token_id or pad_id
+
+ best: str | None = None
+ best_any: str | None = None
+ last_raw = ""
+ last_cleaned = ""
+
+ for attempt in range(1, MAX_RETRIES + 1):
+ prompt = _build_prompt(
+ rule_xml, tokenizer, extra_hint=REPAIR_HINT if attempt >= 2 else ""
+ )
+ inputs = tokenizer(
+ prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_TOKENS
+ ).to(device)
+
+ with torch.no_grad():
+ outputs = model.generate(
+ **inputs,
+ max_new_tokens=MAX_NEW_TOKENS,
+ do_sample=DO_SAMPLE,
+ num_beams=NUM_BEAMS,
+ pad_token_id=pad_id,
+ eos_token_id=eos_id,
+ repetition_penalty=1.05,
+ no_repeat_ngram_size=3,
+ )
+
+ raw = tokenizer.decode(
+ outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True
+ ).strip()
+ last_raw = raw
+
+ parsed = _try_extract_json_summary(raw)
+ if parsed is None:
+ continue
+
+ cleaned = _normalize_one_sentence(parsed)
+ last_cleaned = cleaned
+
+ if cleaned and not _is_catastrophic(cleaned):
+ if best_any is None or _score(cleaned) > _score(best_any):
+ best_any = cleaned
+
+ if not _is_bad(cleaned):
+ best = cleaned
+ break
+
+ if best is None:
+ if best_any and not _is_catastrophic(best_any):
+ best = best_any
+ else:
+ src = last_cleaned or _try_extract_json_summary(last_raw) or last_raw
+ best = _rescue_finalize(src)
+
+ return best
diff --git a/murshid_backend/app/ml/svm_model.py b/murshid_backend/app/ml/svm_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab39e3643dedbf4305d375bf4d6951bafc50a2cc
--- /dev/null
+++ b/murshid_backend/app/ml/svm_model.py
@@ -0,0 +1,101 @@
+"""
+SVM classifier โ PRIMARY model per the report (ยง3.1.3 + ยง4.1).
+
+Report quote:
+ "the Support Vector Machine (SVM) was adopted as the core classifier"
+ "classification using SVM to predict the associated MITRE ATT&CK techniques"
+
+Inference logic (verbatim from MurshidUIPipeline.ipynb cell 16+19):
+ scores = svm_model.named_steps["clf"].decision_function(
+ svm_model.named_steps["pca"].transform(X_user)
+ ).reshape(-1)
+ pred = (scores >= thr_per_label).astype(int)
+ margins = scores - thr_per_label
+ conf = sigmoid(margins) * 100
+
+Original notebook file is NOT modified.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import joblib
+import numpy as np
+
+from app.config import settings
+
+
+def _sigmoid(x: np.ndarray) -> np.ndarray:
+ """Probability calibration: sigmoid(margin) โ notebook cell 17."""
+ x = np.clip(x, -30, 30)
+ return 1.0 / (1.0 + np.exp(-x))
+
+
class SVMModel:
    """Loads the trained LinearSVC pipeline plus per-label decision thresholds.

    Structure of the persisted .joblib pack (produced by the notebook):
        svm_pack["model"]                -> sklearn Pipeline (PCA + LinearSVC)
        svm_pack["thresholds_per_label"] -> np.ndarray, shape (n_techniques,)
    """

    def __init__(self, models_dir: Path | None = None) -> None:
        """Load the SVM pack and label names from `models_dir` (or settings).

        Raises:
            FileNotFoundError: when either model artifact is missing.
            ValueError: when the threshold vector length != label count.
        """
        root = Path(models_dir or settings.murshid_models_dir).resolve()

        pack_path = root / settings.svm_joblib
        names_path = root / settings.label_columns_json

        missing = [p for p in (pack_path, names_path) if not p.is_file()]
        if missing:
            raise FileNotFoundError(f"Missing model file: {missing[0]}")

        pack = joblib.load(pack_path)
        self._model = pack["model"]  # Pipeline(PCA -> LinearSVC)
        self._thresholds = np.asarray(pack["thresholds_per_label"], dtype=np.float64)

        with open(names_path, encoding="utf-8") as fh:
            self.technique_names: list[str] = json.load(fh)

        expected = len(self.technique_names)
        if self._thresholds.shape[0] != expected:
            raise ValueError(
                f"SVM thresholds length {self._thresholds.shape[0]} != {expected} labels"
            )

    # ------------------------------------------------------------------

    def predict(self, embedding_1d: np.ndarray) -> list[dict]:
        """Run SVM inference exactly as in the notebook (cells 16 + 19).

        Args:
            embedding_1d: A single embedding vector (1-D array).

        Returns:
            One dict per technique, sorted by confidence_percent descending:
            technique_id, predicted, confidence_percent, score, threshold, margin.
        """
        features = embedding_1d.reshape(1, -1)

        # Apply PCA, then the LinearSVC decision function (notebook cell 19).
        reduced = self._model.named_steps["pca"].transform(features)
        scores = self._model.named_steps["clf"].decision_function(reduced).reshape(-1)

        margins = scores - self._thresholds
        flags = scores >= self._thresholds
        confidences = _sigmoid(margins) * 100  # calibrated confidence (%)

        rows = []
        for name, flag, conf, score, thr, margin in zip(
            self.technique_names, flags, confidences, scores, self._thresholds, margins
        ):
            rows.append(
                {
                    "technique_id": name,
                    "predicted": bool(flag),
                    "confidence_percent": round(float(conf), 2),
                    "score": round(float(score), 4),
                    "threshold": round(float(thr), 4),
                    "margin": round(float(margin), 4),
                }
            )

        rows.sort(key=lambda r: r["confidence_percent"], reverse=True)
        return rows
diff --git a/murshid_backend/app/models/__init__.py b/murshid_backend/app/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..341bc901a88c9e72bdaffe20b15d28e8b345e170
--- /dev/null
+++ b/murshid_backend/app/models/__init__.py
@@ -0,0 +1,16 @@
+"""SQLAlchemy ORM models (tables defined exactly per ER Diagram ยง3.2.6 of the report)."""
+from app.models.user import User
+from app.models.mapping_job import MappingJob
+from app.models.rule import Rule
+from app.models.technique import Technique
+from app.models.rule_technique_mapping import RuleTechniqueMapping
+from app.models.query_template import QueryTemplate
+
+__all__ = [
+ "User",
+ "MappingJob",
+ "Rule",
+ "Technique",
+ "RuleTechniqueMapping",
+ "QueryTemplate",
+]
diff --git a/murshid_backend/app/models/mapping_job.py b/murshid_backend/app/models/mapping_job.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7891559075c9d5fc06da2ce4ae59d8121f968df
--- /dev/null
+++ b/murshid_backend/app/models/mapping_job.py
@@ -0,0 +1,40 @@
+"""
+MappingJob entity โ ER Diagram ยง3.2.6
+Attributes: job_ID, file_name, timestamp, rules_count, status, progress
+Linked to User via "uploads" relationship.
+Also visible in Figure 4-14 (Mapping Progress Table).
+"""
+
+import enum
+from datetime import datetime
+
+from sqlalchemy import DateTime, Enum, ForeignKey, Integer, String, func
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from app.db.base import Base
+
+
class JobStatus(str, enum.Enum):
    """Lifecycle states of a MappingJob (str-valued for easy JSON/DB storage)."""

    pending = "pending"  # created, not yet started
    running = "running"  # analysis in progress
    done = "done"        # finished successfully
    failed = "failed"    # aborted with an error
+
+
class MappingJob(Base):
    """MappingJob entity — ER Diagram §3.2.6; one uploaded-file analysis run.

    Linked to User via the "uploads" relationship; shown in the Mapping
    Progress table (Figure 4-14).
    """

    __tablename__ = "mapping_jobs"

    job_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    # Owner of the upload.
    user_id: Mapped[int] = mapped_column(ForeignKey("users.user_id"), nullable=False)
    file_name: Mapped[str] = mapped_column(String(255), nullable=False)
    # Number of rules contained in the uploaded file.
    rules_count: Mapped[int] = mapped_column(Integer, default=0)
    status: Mapped[JobStatus] = mapped_column(
        Enum(JobStatus), nullable=False, default=JobStatus.pending
    )
    # Progress counter (presumably 0-100 percent — confirm against the UI).
    progress: Mapped[int] = mapped_column(Integer, default=0)
    # Creation time, defaulted DB-side via func.now().
    timestamp: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=func.now()
    )

    user: Mapped["User"] = relationship(back_populates="jobs")
    rules: Mapped[list["Rule"]] = relationship(back_populates="job")
diff --git a/murshid_backend/app/models/query_template.py b/murshid_backend/app/models/query_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef79528c949e49ba0de246c655dd3e3d30293e1d
--- /dev/null
+++ b/murshid_backend/app/models/query_template.py
@@ -0,0 +1,27 @@
+"""
+QueryTemplate entity โ ER Diagram ยง3.2.6
+Attributes: Template_ID, Purpose, wql_query, Note
+Linked to Technique. Admin can add/update/disable (Use Case 7, ยง3.2.7).
+"""
+
+from sqlalchemy import Boolean, ForeignKey, String, Text
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from app.db.base import Base
+
+
class QueryTemplate(Base):
    """WQL query template entity — ER Diagram §3.2.6 (admin-managed, Use Case 7)."""

    __tablename__ = "query_templates"

    template_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    technique_id: Mapped[str] = mapped_column(
        String(20), ForeignKey("techniques.technique_id"), nullable=False
    )
    purpose: Mapped[str | None] = mapped_column(String(255), nullable=True)
    # WQL with placeholders: ${HOST}, ${USER}, ${IP}
    wql_query: Mapped[str] = mapped_column(Text, nullable=False)
    note: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Admin can disable without deleting — Use Case 7
    is_active: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False)

    technique: Mapped["Technique"] = relationship(back_populates="query_templates")
diff --git a/murshid_backend/app/models/rule.py b/murshid_backend/app/models/rule.py
new file mode 100644
index 0000000000000000000000000000000000000000..378fe3096b9071a6d16551b453ed4722f3c84675
--- /dev/null
+++ b/murshid_backend/app/models/rule.py
@@ -0,0 +1,27 @@
+"""
+Rule entity โ ER Diagram ยง3.2.6
+Attributes: Rule_ID, embedding_vector, job_ID (FK)
+Rule_ID is the Wazuh rule ID string (e.g. "597").
+"""
+
+from sqlalchemy import ForeignKey, String, Text
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from app.db.base import Base
+
+
class Rule(Base):
    """Rule entity — ER Diagram §3.2.6; keyed by the Wazuh rule ID string."""

    __tablename__ = "rules"

    # The Wazuh rule ID, e.g. "597" (kept as a string to match Wazuh's format).
    rule_id: Mapped[str] = mapped_column(String(50), primary_key=True)
    # Optional link back to the upload job that produced this rule.
    job_id: Mapped[int | None] = mapped_column(
        ForeignKey("mapping_jobs.job_id"), nullable=True
    )
    # 768-dimensional float vector stored as JSON string; kept nullable for
    # rules where only the mapping result is persisted without the vector.
    embedding_vector: Mapped[str | None] = mapped_column(Text, nullable=True)

    job: Mapped["MappingJob | None"] = relationship(back_populates="rules")
    # Deleting a rule also removes its technique mappings (delete-orphan).
    technique_mappings: Mapped[list["RuleTechniqueMapping"]] = relationship(
        back_populates="rule", cascade="all, delete-orphan"
    )
diff --git a/murshid_backend/app/models/rule_technique_mapping.py b/murshid_backend/app/models/rule_technique_mapping.py
new file mode 100644
index 0000000000000000000000000000000000000000..351c9559d26c7d0b8f4dedc53f174a48d6e00614
--- /dev/null
+++ b/murshid_backend/app/models/rule_technique_mapping.py
@@ -0,0 +1,31 @@
+"""
+RuleTechniqueMapping associative entity โ ER Diagram ยง3.2.6
+Attributes: Mapping_ID, Rule_ID (FK), Technique_ID (FK), confidence_score
+Index on rule_id for fast lookup โ mentioned explicitly in Use Case 6 (ยง3.2.7).
+"""
+
+from sqlalchemy import Float, ForeignKey, Index, Integer, String
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from app.db.base import Base
+
+
class RuleTechniqueMapping(Base):
    """Associative entity between Rule and Technique — ER Diagram §3.2.6."""

    __tablename__ = "rule_technique_mappings"

    mapping_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    rule_id: Mapped[str] = mapped_column(
        String(50), ForeignKey("rules.rule_id"), nullable=False
    )
    technique_id: Mapped[str] = mapped_column(
        String(20), ForeignKey("techniques.technique_id"), nullable=False
    )
    # Stored as a 0-1 fraction (callers scale to percent for display).
    confidence_score: Mapped[float] = mapped_column(Float, nullable=False)

    rule: Mapped["Rule"] = relationship(back_populates="technique_mappings")
    technique: Mapped["Technique"] = relationship(back_populates="rule_mappings")

    __table_args__ = (
        # "creates an index on rule_id for efficient lookup" — Use Case 6
        Index("ix_rule_technique_rule_id", "rule_id"),
    )
diff --git a/murshid_backend/app/models/technique.py b/murshid_backend/app/models/technique.py
new file mode 100644
index 0000000000000000000000000000000000000000..42fe7d76290619aa103e7448c4b1f70f7ceed865
--- /dev/null
+++ b/murshid_backend/app/models/technique.py
@@ -0,0 +1,24 @@
+"""
+Technique entity โ ER Diagram ยง3.2.6
+Attributes: Technique_ID, technique_name, tactic
+"""
+
+from sqlalchemy import String
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from app.db.base import Base
+
+
class Technique(Base):
    """MITRE ATT&CK technique entity — ER Diagram §3.2.6."""

    __tablename__ = "techniques"

    # ATT&CK technique ID, e.g. "T1484".
    technique_id: Mapped[str] = mapped_column(String(20), primary_key=True)
    technique_name: Mapped[str] = mapped_column(String(255), nullable=False)
    tactic: Mapped[str | None] = mapped_column(String(100), nullable=True)

    rule_mappings: Mapped[list["RuleTechniqueMapping"]] = relationship(
        back_populates="technique"
    )
    query_templates: Mapped[list["QueryTemplate"]] = relationship(
        back_populates="technique"
    )
diff --git a/murshid_backend/app/models/user.py b/murshid_backend/app/models/user.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ac6e3a53d83d5dffe5e51e48ac46d1122bf5310
--- /dev/null
+++ b/murshid_backend/app/models/user.py
@@ -0,0 +1,30 @@
+"""
+User entity โ ER Diagram ยง3.2.6
+Attributes: User_ID, username, email, password_hash, role
+"""
+
+import enum
+
+from sqlalchemy import Enum, String
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from app.db.base import Base
+
+
class UserRole(str, enum.Enum):
    """Access roles; admins additionally manage query templates (Use Case 7)."""

    admin = "admin"
    analyst = "analyst"
+
+
class User(Base):
    """User entity — ER Diagram §3.2.6 (authentication + role-based access)."""

    __tablename__ = "users"

    user_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    username: Mapped[str] = mapped_column(String(100), unique=True, nullable=False)
    email: Mapped[str] = mapped_column(String(255), unique=True, nullable=False)
    # Only a hash is stored; the hashing scheme lives in the auth layer (not shown here).
    password_hash: Mapped[str] = mapped_column(String(255), nullable=False)
    role: Mapped[UserRole] = mapped_column(
        Enum(UserRole), nullable=False, default=UserRole.analyst
    )

    # Mapping jobs this user has uploaded ("uploads" relationship).
    jobs: Mapped[list["MappingJob"]] = relationship(back_populates="user")
diff --git a/murshid_backend/app/repositories/__init__.py b/murshid_backend/app/repositories/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4674f8c0fa126fc4ef11a6f766e140318af248ea
--- /dev/null
+++ b/murshid_backend/app/repositories/__init__.py
@@ -0,0 +1 @@
+"""Repository layer โ thin DB access wrappers."""
diff --git a/murshid_backend/app/repositories/job_repo.py b/murshid_backend/app/repositories/job_repo.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5acdb40cc12d85c203f44e7e3ab8cd4d9f9805c
--- /dev/null
+++ b/murshid_backend/app/repositories/job_repo.py
@@ -0,0 +1,44 @@
+"""CRUD for MappingJob table."""
+
+from __future__ import annotations
+
+from datetime import datetime, timezone
+
+from sqlalchemy.orm import Session
+
+from app.models.mapping_job import JobStatus, MappingJob
+
+
def create_job(db: Session, *, user_id: int, file_name: str, rules_count: int = 0) -> MappingJob:
    """Insert a new MappingJob row in the initial `pending` state.

    Flushes (without committing) so the autoincrement job_id is populated on
    the returned instance; the caller owns the transaction boundary.
    """
    fields = {
        "user_id": user_id,
        "file_name": file_name,
        "rules_count": rules_count,
        "status": JobStatus.pending,
        "progress": 0,
        "timestamp": datetime.now(tz=timezone.utc),
    }
    new_job = MappingJob(**fields)
    db.add(new_job)
    db.flush()
    return new_job
+
+
def update_job_status(
    db: Session,
    job_id: int,
    *,
    status: JobStatus,
    progress: int | None = None,
) -> MappingJob | None:
    """Set a job's status (and, when given, its progress).

    Returns the updated job, or None when job_id does not exist. Flushes
    without committing.
    """
    target = db.get(MappingJob, job_id)
    if target is None:
        return None

    target.status = status
    if progress is not None:
        target.progress = progress

    db.flush()
    return target
+
+
def get_job(db: Session, job_id: int) -> MappingJob | None:
    """Fetch a MappingJob by primary key (None when it does not exist)."""
    return db.get(MappingJob, job_id)
diff --git a/murshid_backend/app/repositories/rule_repo.py b/murshid_backend/app/repositories/rule_repo.py
new file mode 100644
index 0000000000000000000000000000000000000000..a542660280c881ef8dedbf2f91c2e40fe46558b2
--- /dev/null
+++ b/murshid_backend/app/repositories/rule_repo.py
@@ -0,0 +1,71 @@
+"""CRUD for Rule and RuleTechniqueMapping tables."""
+
+from __future__ import annotations
+
+import json
+
+import numpy as np
+from sqlalchemy.orm import Session
+
+from app.models.rule import Rule
+from app.models.rule_technique_mapping import RuleTechniqueMapping
+
+
def upsert_rule(
    db: Session,
    *,
    rule_id: str,
    job_id: int | None = None,
    embedding: np.ndarray | None = None,
) -> Rule:
    """Create the Rule row if absent, then update job_id/embedding when given.

    A None job_id or embedding leaves the stored value untouched; the
    embedding is serialized as a JSON list string. Flushes without committing.
    """
    record = db.get(Rule, rule_id)
    if record is None:
        record = Rule(rule_id=rule_id)
        db.add(record)

    if job_id is not None:
        record.job_id = job_id
    if embedding is not None:
        record.embedding_vector = json.dumps(embedding.tolist())

    db.flush()
    return record
+
+
def save_technique_mappings(
    db: Session,
    *,
    rule_id: str,
    results: list[dict],
) -> list[RuleTechniqueMapping]:
    """Replace all persisted technique mappings for `rule_id`.

    Existing rows are deleted first so repeated runs stay idempotent, and
    every technique (not only detected ones) is stored, sorted by confidence
    descending, so the UI can render a Top-5 view (Figure 4-11).
    """
    # Wipe previous rows for this rule (idempotent re-runs).
    db.query(RuleTechniqueMapping).filter(
        RuleTechniqueMapping.rule_id == rule_id
    ).delete(synchronize_session=False)

    ordered = sorted(results, key=lambda r: r["confidence_percent"], reverse=True)

    saved: list[RuleTechniqueMapping] = []
    for entry in ordered:
        mapping = RuleTechniqueMapping(
            rule_id=rule_id,
            technique_id=entry["technique_id"],
            # Stored as a 0-1 fraction; the pipeline reports a percentage.
            confidence_score=entry["confidence_percent"] / 100.0,
        )
        db.add(mapping)
        saved.append(mapping)

    db.flush()
    return saved
+
+
def get_mappings_for_rule(
    db: Session, rule_id: str
) -> list[RuleTechniqueMapping]:
    """Return all stored mappings for a rule, highest confidence first."""
    return (
        db.query(RuleTechniqueMapping)
        .filter(RuleTechniqueMapping.rule_id == rule_id)
        .order_by(RuleTechniqueMapping.confidence_score.desc())
        .all()
    )
diff --git a/murshid_backend/app/repositories/template_repo.py b/murshid_backend/app/repositories/template_repo.py
new file mode 100644
index 0000000000000000000000000000000000000000..13c58d427945b93d57cbb495176fc9b69227b871
--- /dev/null
+++ b/murshid_backend/app/repositories/template_repo.py
@@ -0,0 +1,94 @@
+"""CRUD for Technique and QueryTemplate tables."""
+
+from __future__ import annotations
+
+from sqlalchemy.orm import Session
+
+from app.models.query_template import QueryTemplate
+from app.models.technique import Technique
+
+
+# --------------------------------------------------------------------------
+# Techniques
+# --------------------------------------------------------------------------
+
+
def get_or_create_technique(
    db: Session, *, technique_id: str, technique_name: str = "", tactic: str | None = None
) -> Technique:
    """Return the Technique row for `technique_id`, inserting it when absent.

    An empty technique_name falls back to the technique_id itself so the
    NOT NULL name column is always satisfied. Flushes only when inserting.
    """
    found = db.get(Technique, technique_id)
    if found is not None:
        return found

    created = Technique(
        technique_id=technique_id,
        technique_name=technique_name or technique_id,
        tactic=tactic,
    )
    db.add(created)
    db.flush()
    return created
+
+
def get_technique(db: Session, technique_id: str) -> Technique | None:
    """Fetch a Technique by primary key (None when it does not exist)."""
    return db.get(Technique, technique_id)
+
+
+# --------------------------------------------------------------------------
+# Query templates
+# --------------------------------------------------------------------------
+
+
def get_templates_for_technique(
    db: Session, technique_id: str
) -> list[QueryTemplate]:
    """Return the active (non-disabled) templates for a technique.

    No explicit ordering is applied; row order is whatever the DB returns.
    """
    return (
        db.query(QueryTemplate)
        .filter(
            QueryTemplate.technique_id == technique_id,
            QueryTemplate.is_active.is_(True),
        )
        .all()
    )
+
+
def create_template(
    db: Session,
    *,
    technique_id: str,
    purpose: str | None,
    wql_query: str,
    note: str | None,
) -> QueryTemplate:
    """Insert a new, active QueryTemplate and flush to populate its PK."""
    payload = dict(
        technique_id=technique_id,
        purpose=purpose,
        wql_query=wql_query,
        note=note,
        is_active=True,
    )
    created = QueryTemplate(**payload)
    db.add(created)
    db.flush()
    return created
+
+
def update_template(
    db: Session,
    template_id: int,
    *,
    purpose: str | None = None,
    wql_query: str | None = None,
    note: str | None = None,
    is_active: bool | None = None,
) -> QueryTemplate | None:
    """Partially update a template; None-valued arguments are left unchanged.

    Returns the updated row, or None when template_id does not exist.
    NOTE: because None means "skip", this helper cannot clear a field to NULL.
    """
    target = db.get(QueryTemplate, template_id)
    if target is None:
        return None

    changes = {
        "purpose": purpose,
        "wql_query": wql_query,
        "note": note,
        "is_active": is_active,
    }
    for attr, value in changes.items():
        if value is not None:
            setattr(target, attr, value)

    db.flush()
    return target
diff --git a/murshid_backend/app/schemas/__init__.py b/murshid_backend/app/schemas/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa2a4f43ee5c7534f5249f2b0d979d0c279e386f
--- /dev/null
+++ b/murshid_backend/app/schemas/__init__.py
@@ -0,0 +1 @@
+"""Pydantic schemas for API request/response validation."""
diff --git a/murshid_backend/app/schemas/query.py b/murshid_backend/app/schemas/query.py
new file mode 100644
index 0000000000000000000000000000000000000000..a310b5a4b6fcdeecfd276e9f23a0ff06158ce089
--- /dev/null
+++ b/murshid_backend/app/schemas/query.py
@@ -0,0 +1,23 @@
+from pydantic import BaseModel
+
+
class QueryTemplateOut(BaseModel):
    """Public representation of a stored WQL query template."""

    template_id: int
    technique_id: str
    purpose: str | None
    wql_query: str  # WQL text with ${HOST}/${USER}/${IP} placeholders
    note: str | None
+
+
class QueryTemplateIn(BaseModel):
    """Payload for creating a new WQL template (admin, Use Case 7)."""

    technique_id: str
    purpose: str | None = None
    wql_query: str
    note: str | None = None
+
+
class QueryTemplateUpdate(BaseModel):
    """Partial-update payload; omitted/None fields are left unchanged."""

    purpose: str | None = None
    wql_query: str | None = None
    note: str | None = None
    is_active: bool | None = None  # False disables without deleting
diff --git a/murshid_backend/app/schemas/result.py b/murshid_backend/app/schemas/result.py
new file mode 100644
index 0000000000000000000000000000000000000000..b00cf13a2f4eb321c170e213959f4ad7fe6884b0
--- /dev/null
+++ b/murshid_backend/app/schemas/result.py
@@ -0,0 +1,17 @@
+from pydantic import BaseModel
+
+
class MappingResult(BaseModel):
    """One technique's mapping entry for a rule."""

    technique_id: str
    confidence_score: float    # 0-1 fraction, as stored in the DB
    confidence_percent: float  # same value scaled to 0-100
    # primary = highest confidence; secondary = second if >=0.5; others = below threshold
    rank: int                  # 1-based position in the confidence ordering
    is_primary: bool = False
    is_secondary: bool = False
+
+
class ResultsResponse(BaseModel):
    """Stored mapping results for a single rule."""

    rule_id: str
    mappings: list[MappingResult]  # all techniques sorted by confidence desc
    detected: list[MappingResult]  # primary + secondary (confidence >= 0.5)
diff --git a/murshid_backend/app/schemas/rule.py b/murshid_backend/app/schemas/rule.py
new file mode 100644
index 0000000000000000000000000000000000000000..6455f018c39472c2464fb7cceb8b20bd92b7ee2b
--- /dev/null
+++ b/murshid_backend/app/schemas/rule.py
@@ -0,0 +1,29 @@
+from pydantic import BaseModel, Field
+
+
class AnalyzeRequest(BaseModel):
    """Request body for the single-rule analysis endpoint."""

    # NOTE(review): the description text looks truncated — it likely originally
    # mentioned the <rule>...</rule> wrapper tags; confirm against the API docs.
    rule_xml: str = Field(
        ...,
        min_length=10,
        description="Full Wazuh rule XML including ...",
    )
+
+
class TechniqueResult(BaseModel):
    """Per-technique prediction returned by the classifier layer."""

    technique_id: str
    predicted: bool            # True when the score cleared the per-label threshold
    confidence_percent: float  # calibrated confidence, 0-100
    proba: float
    threshold: float
    gap: float
    # NOTE(review): the SVM layer emits keys named score/margin rather than
    # proba/gap — confirm the API layer renames them before validation.
+
+
class AnalyzeResponse(BaseModel):
    """Full analysis result for one rule."""

    rule_id: str
    sanitized_xml: str
    summary: str                 # one-sentence summary of the rule
    text_for_embedding: str
    embedding_dim: int
    pipeline_mode: str = "full"  # "full" | "local" | "lite"
    detected: list[TechniqueResult]
    all_results: list[TechniqueResult]
diff --git a/murshid_backend/app/services/__init__.py b/murshid_backend/app/services/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..48f9301cd497582af5d4433209a36c9059c52f48
--- /dev/null
+++ b/murshid_backend/app/services/__init__.py
@@ -0,0 +1 @@
+"""Service layer โ business logic between API and ML/repositories."""
diff --git a/murshid_backend/app/services/ml_service.py b/murshid_backend/app/services/ml_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe3ce81294617cf87ac439536440aff80e8c1720
--- /dev/null
+++ b/murshid_backend/app/services/ml_service.py
@@ -0,0 +1,16 @@
+"""
+MLService โ thin wrapper that calls the ML pipeline and exposes
+analyze_rule() for use by other services.
+"""
+
+from __future__ import annotations
+
+from app.ml.pipeline import analyze_rule as _pipeline_analyze
+from app.ml.pipeline import is_ready
+
+
class MLService:
    """Thin facade over the ML pipeline used by the other services."""

    def analyze(self, rule_xml: str) -> dict:
        """Run the full analysis pipeline on a raw Wazuh rule XML string.

        Raises:
            RuntimeError: when the pipeline models have not finished loading.
        """
        if not is_ready():
            raise RuntimeError("ML pipeline is not ready. Models still loading.")
        return _pipeline_analyze(rule_xml)
diff --git a/murshid_backend/app/services/result_service.py b/murshid_backend/app/services/result_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..063b8c9c36658f0d910d28623015df431a5c4068
--- /dev/null
+++ b/murshid_backend/app/services/result_service.py
@@ -0,0 +1,50 @@
+"""
+ResultService โ fetches stored technique mappings for a given rule_id.
+Use Case 1: "View the techniques and their scores associated with an alert" (ยง3.2.7).
+"""
+
+from __future__ import annotations
+
+from sqlalchemy.orm import Session
+
+from app.repositories.rule_repo import get_mappings_for_rule
+
+
class ResultService:
    """Fetches stored technique mappings for a rule (Use Case 1, §3.2.7)."""

    def __init__(self, db: Session) -> None:
        self._db = db

    # §3.2.3.2: a second-ranked technique counts as "secondary" only when
    # its confidence is at least 0.5.
    SECONDARY_THRESHOLD = 0.50

    def get_results_for_rule(self, rule_id: str) -> dict | None:
        """
        Returns:
            None when the rule has no stored mappings; otherwise a dict with:
            mappings: all techniques sorted by confidence desc (for Figure 4-11 Top 5)
            detected: primary + secondary techniques only (for Figure 4-12 WQL queries)
        """
        mappings = get_mappings_for_rule(self._db, rule_id)
        if not mappings:
            return None

        all_mappings = []
        detected = []

        # The repository orders rows by confidence descending, so index 0
        # is the primary technique.
        for i, m in enumerate(mappings):
            conf_pct = round(m.confidence_score * 100, 2)
            is_primary = (i == 0)
            is_secondary = (i == 1 and m.confidence_score >= self.SECONDARY_THRESHOLD)

            row = {
                "technique_id": m.technique_id,
                "confidence_score": round(m.confidence_score, 4),
                "confidence_percent": conf_pct,
                "rank": i + 1,
                "is_primary": is_primary,
                "is_secondary": is_secondary,
            }
            all_mappings.append(row)

            if is_primary or is_secondary:
                detected.append(row)

        return {"mappings": all_mappings, "detected": detected}
diff --git a/murshid_backend/app/services/rule_service.py b/murshid_backend/app/services/rule_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1691c8e9935fb7082748fd5cb7474997461866b
--- /dev/null
+++ b/murshid_backend/app/services/rule_service.py
@@ -0,0 +1,71 @@
+"""
+RuleService โ orchestrates:
+ 1. ML analysis
+ 2. Persisting Rule + RuleTechniqueMapping rows
+ 3. Ensuring Technique rows exist
+"""
+
+from __future__ import annotations
+
+import xml.etree.ElementTree as ET
+
+from sqlalchemy.orm import Session
+
+from app.repositories import rule_repo, template_repo
+from app.services.ml_service import MLService
+
+
class RuleService:
    """Orchestrates one rule analysis end to end.

    1. Runs the ML pipeline (via MLService).
    2. Ensures a Technique row exists for each detected technique.
    3. Persists the Rule row plus all RuleTechniqueMapping rows.
    """

    def __init__(self, db: Session, ml: MLService | None = None) -> None:
        self._db = db
        self._ml = ml or MLService()

    # ------------------------------------------------------------------

    def analyze_and_persist(
        self,
        rule_xml: str,
        *,
        job_id: int | None = None,
    ) -> dict:
        """Run the ML pipeline and persist the outcome in one transaction.

        Args:
            rule_xml: Raw (unsanitized) Wazuh rule XML.
            job_id: Optional MappingJob to attach the rule to.

        Returns:
            The analysis result dict from the ML pipeline, with "rule_id" added.
        """
        result = self._ml.analyze(rule_xml)

        # Extract rule_id from the raw XML (not the sanitised version).
        rule_id = self._extract_rule_id(rule_xml)

        # Ensure each predicted technique has a row in `techniques`.
        for r in result["detected"]:
            template_repo.get_or_create_technique(
                self._db,
                technique_id=r["technique_id"],
                technique_name=r["technique_id"],
            )

        # Upsert rule row.
        rule_repo.upsert_rule(self._db, rule_id=rule_id, job_id=job_id)

        # Persist all technique mappings (repo sorts them by confidence).
        rule_repo.save_technique_mappings(
            self._db, rule_id=rule_id, results=result["results"]
        )

        self._db.commit()

        # Attach rule_id to the result for convenience.
        result["rule_id"] = rule_id
        return result

    # ------------------------------------------------------------------

    @staticmethod
    def _extract_rule_id(rule_xml: str) -> str:
        """Extract the Wazuh rule `id` attribute from raw rule XML.

        Handles both a bare <rule id="..."> root and a <rule> nested inside a
        wrapper element such as <group> (the usual layout of Wazuh rule
        files). Returns "unknown" for unparseable XML or a missing id.
        """
        try:
            root = ET.fromstring(rule_xml.strip())
        except ET.ParseError:
            return "unknown"

        # Candidates: the root itself when it is a <rule>, otherwise every
        # <rule> element nested anywhere below the wrapper.
        candidates = [root] if root.tag == "rule" else root.findall(".//rule")
        for elem in candidates:
            rid = (elem.get("id") or "").strip()
            if rid:
                return rid

        # Fallback: preserve the original behaviour of reading the root's id.
        rid = (root.get("id") or "").strip()
        return rid if rid else "unknown"
diff --git a/murshid_backend/app/services/template_service.py b/murshid_backend/app/services/template_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb4f360aa80073afef8fcc780e67f38b83a577fb
--- /dev/null
+++ b/murshid_backend/app/services/template_service.py
@@ -0,0 +1,87 @@
+"""
+TemplateService โ fetches and manages WQL query templates.
+Use Case 2: "View Investigation WQL Queries" (ยง3.2.7).
+Use Case 7: "Manage static query templates" (ยง3.2.7).
+"""
+
+from __future__ import annotations
+
+from sqlalchemy.orm import Session
+
+from app.repositories.template_repo import (
+ create_template,
+ get_templates_for_technique,
+ update_template,
+)
+
+CONFIDENCE_THRESHOLD_SECONDARY = 0.5 # from ยง3.2.3.2 "secondary if score >= 0.5"
+
+
class TemplateService:
    """Reads and manages WQL query templates (Use Cases 2 and 7, §3.2.7)."""

    def __init__(self, db: Session) -> None:
        self._db = db

    @staticmethod
    def _as_dict(tpl) -> dict:
        """Serialize a QueryTemplate row into the admin response shape."""
        return {
            "template_id": tpl.template_id,
            "technique_id": tpl.technique_id,
            "purpose": tpl.purpose,
            "wql_query": tpl.wql_query,
            "note": tpl.note,
            "is_active": tpl.is_active,
        }

    def get_queries_for_technique(self, technique_id: str) -> list[dict]:
        """Return all active WQL templates for the given technique (Use Case 2)."""
        rows: list[dict] = []
        for tpl in get_templates_for_technique(self._db, technique_id):
            rows.append(
                {
                    "template_id": tpl.template_id,
                    "technique_id": tpl.technique_id,
                    "purpose": tpl.purpose,
                    "wql_query": tpl.wql_query,
                    "note": tpl.note,
                }
            )
        return rows

    def add_template(
        self,
        *,
        technique_id: str,
        purpose: str | None,
        wql_query: str,
        note: str | None,
    ) -> dict:
        """Admin: create a new WQL template and commit (Use Case 7)."""
        created = create_template(
            self._db,
            technique_id=technique_id,
            purpose=purpose,
            wql_query=wql_query,
            note=note,
        )
        self._db.commit()
        return self._as_dict(created)

    def update_template(self, template_id: int, data: dict) -> dict | None:
        """Admin: partially update or disable a template (Use Case 7).

        Keys absent from `data` (or set to None) are left unchanged; returns
        None when the template does not exist.
        """
        updated = update_template(
            self._db,
            template_id,
            purpose=data.get("purpose"),
            wql_query=data.get("wql_query"),
            note=data.get("note"),
            is_active=data.get("is_active"),
        )
        if updated is None:
            return None
        self._db.commit()
        return self._as_dict(updated)
diff --git a/murshid_backend/requirements.txt b/murshid_backend/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..88cd0453663e72f26b12f4b2d49212a22b78bb1e
--- /dev/null
+++ b/murshid_backend/requirements.txt
@@ -0,0 +1,22 @@
+fastapi>=0.115.0
+uvicorn[standard]>=0.32.0
+pydantic>=2.9.0
+pydantic-settings>=2.6.0
+python-dotenv>=1.0.0
+
+# database
+sqlalchemy>=2.0.0
+alembic>=1.13.0
+pymysql>=1.1.0
+cryptography>=43.0.0
+
+# ML / numerics
+numpy>=1.26.0
+joblib>=1.4.0
+torch>=2.0.0
+transformers>=4.44.0
+accelerate>=0.34.0
+bitsandbytes>=0.46.1
+sentencepiece>=0.2.0
+lxml>=5.0.0
+huggingface_hub>=0.25.0
diff --git a/murshid_backend/requirements_light.txt b/murshid_backend/requirements_light.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1f9426bd41267892ab7029fc9596a29695beb793
--- /dev/null
+++ b/murshid_backend/requirements_light.txt
@@ -0,0 +1,13 @@
# Light profile — for initial smoke-testing without the GPU/LLM dependencies
+fastapi>=0.115.0
+uvicorn[standard]>=0.32.0
+pydantic>=2.9.0
+pydantic-settings>=2.6.0
+python-dotenv>=1.0.0
+sqlalchemy>=2.0.0
+alembic>=1.13.0
+pymysql>=1.1.0
+cryptography>=43.0.0
+numpy>=1.26.0
+joblib>=1.4.0
+lxml>=5.0.0
diff --git a/murshid_backend/scripts/__init__.py b/murshid_backend/scripts/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f23051728e4f570f0fe2d03d0689510cf9e343b8
--- /dev/null
+++ b/murshid_backend/scripts/__init__.py
@@ -0,0 +1 @@
+"""Scripts for one-time data operations."""
diff --git a/murshid_backend/scripts/import_excel_templates.py b/murshid_backend/scripts/import_excel_templates.py
new file mode 100644
index 0000000000000000000000000000000000000000..d191c8794866c3e4406c8f238ab77f1944925d4a
--- /dev/null
+++ b/murshid_backend/scripts/import_excel_templates.py
@@ -0,0 +1,130 @@
"""
Import WQL query templates from the Excel workbook into the database.

One-time script; run from the backend directory:
    cd murshid_backend
    .venv\\Scripts\\python.exe scripts\\import_excel_templates.py

(Original docstring was Arabic and mojibake-damaged; translated to English.)
"""

import sys
import re
from pathlib import Path

# Make the backend package importable so `app.*` resolves when run as a script.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

import openpyxl
from sqlalchemy.orm import Session

from app.config import settings
from app.db.session import SessionLocal
from app.models.query_template import QueryTemplate
from app.models.technique import Technique

# Primary location: the configured models directory.
EXCEL_PATH = Path(settings.murshid_models_dir) / "murshid_query_template_structure_clean_shared.xlsx"

# Fallback: same directory as project root
if not EXCEL_PATH.is_file():
    EXCEL_PATH = Path(__file__).resolve().parent.parent.parent / "murshid_query_template_structure_clean_shared.xlsx"
+
+def normalise_query(q: str | None) -> str:
+ """Collapse whitespace/newlines in WQL query."""
+ if not q:
+ return ""
+ return re.sub(r"\s+", " ", q.strip())
+
+
def run(db: Session, replace: bool = False) -> dict:
    """Import techniques and WQL query templates from the Excel workbook.

    Expected column layout (header row 1, data from row 2):
        0 technique_id, 1 technique_name, 2 template id string (e.g. "T1484-1"),
        3 purpose, 4 wql_query, 5 note

    Args:
        db: Open SQLAlchemy session; committed once at the end.
        replace: When True, overwrite an existing (technique_id, purpose) row
            instead of skipping it.

    Returns:
        A summary dict of counts, or {"error": ...} when the file is missing.
    """
    if not EXCEL_PATH.is_file():
        return {"error": f"Excel file not found: {EXCEL_PATH}"}

    wb = openpyxl.load_workbook(EXCEL_PATH)
    ws = wb.active

    rows = list(ws.iter_rows(min_row=2, values_only=True))

    inserted_techniques = 0
    inserted_templates = 0
    skipped = 0
    # NOTE(review): `errors` is returned but never populated in this version.
    errors = []

    # idx mirrors the Excel row number (currently unused; kept for future error reporting).
    for idx, row in enumerate(rows, start=2):
        technique_id = str(row[0] or "").strip()
        technique_name = str(row[1] or "").strip()
        template_id_str = str(row[2] or "").strip()  # e.g. "T1484-1" (currently unused)
        purpose = str(row[3] or "").strip() or None
        wql_query = normalise_query(str(row[4] or ""))
        note = str(row[5] or "").strip() or None

        # Rows without a technique id or a query are not importable.
        if not technique_id or not wql_query:
            skipped += 1
            continue

        # 1. Upsert Technique
        tech = db.get(Technique, technique_id)
        if tech is None:
            tech = Technique(
                technique_id=technique_id,
                technique_name=technique_name or technique_id,
                tactic=None,
            )
            db.add(tech)
            db.flush()
            inserted_techniques += 1
        elif technique_name and not tech.technique_name:
            tech.technique_name = technique_name

        # 2. Insert QueryTemplate (skip duplicate template_id_str unless replace=True)
        # Check uniqueness by (technique_id + purpose) to avoid duplicates on re-run
        existing = (
            db.query(QueryTemplate)
            .filter(
                QueryTemplate.technique_id == technique_id,
                QueryTemplate.purpose == purpose,
            )
            .first()
        )

        if existing:
            if replace:
                existing.wql_query = wql_query
                existing.note = note
                existing.is_active = True
                # NOTE(review): replacements are counted as "inserted".
                inserted_templates += 1
            else:
                skipped += 1
            continue

        tpl = QueryTemplate(
            technique_id=technique_id,
            purpose=purpose,
            wql_query=wql_query,
            note=note,
            is_active=True,
        )
        db.add(tpl)
        inserted_templates += 1

    db.commit()

    return {
        "excel_path": str(EXCEL_PATH),
        "rows_processed": len(rows),
        "techniques_inserted": inserted_techniques,
        "templates_inserted": inserted_templates,
        "skipped": skipped,
        "errors": errors,
    }
+
+
if __name__ == "__main__":
    # Optional CLI flag: --replace overwrites existing (technique_id, purpose) rows.
    replace = "--replace" in sys.argv

    db: Session = SessionLocal()
    try:
        result = run(db, replace=replace)
        print("\n=== Import Result ===")
        for k, v in result.items():
            print(f" {k}: {v}")
    finally:
        # Always release the session, even when the import fails.
        db.close()
diff --git a/murshid_frontend/index.html b/murshid_frontend/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..cfdb45b57203c63384664282f44076e31d84469f
--- /dev/null
+++ b/murshid_frontend/index.html
@@ -0,0 +1,1347 @@
+
+
+
+
+
+ Murshid | ู
ูุฑุดูุฏ
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/start.sh b/start.sh
new file mode 100644
index 0000000000000000000000000000000000000000..41f7777220fae2ed0467233676c6003367ebb372
--- /dev/null
+++ b/start.sh
@@ -0,0 +1,28 @@
#!/bin/bash
# Container entrypoint: migrate the DB, seed WQL templates, then launch the API.
# NOTE(review): the echo strings below contain mis-encoded emoji (mojibake) —
# restore proper UTF-8 in the real file; kept byte-identical here.
set -e

cd /app/murshid_backend

# Run Alembic migrations
echo "๐ Running database migrations..."
python -m alembic upgrade head
echo "โ
 Database ready"

# Import Excel templates (if not already imported)
# Failure here is tolerated (|| echo): seeding is best-effort, not fatal.
echo "๐ Importing WQL templates from Excel..."
python -c "
from app.db.session import SessionLocal
from scripts.import_excel_templates import run as import_excel
db = SessionLocal()
try:
    result = import_excel(db, replace=False)
    print('Templates:', result)
finally:
    db.close()
" || echo "โ ๏ธ Template import skipped (non-critical)"

# Port 7860 is the Hugging Face Spaces default; override with $PORT.
echo "๐ Starting Murshid API on port ${PORT:-7860}..."
exec python -m uvicorn app.main:app \
    --host 0.0.0.0 \
    --port "${PORT:-7860}" \
    --log-level info