diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..ae6cd4f88468716a796742c05e7d196422013b20 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,18 @@ +__pycache__ +*.pyc +*.pyo +.venv +venv +**/.env +.env.local +*.db +*.log +.git +.gitignore +*.zip +MurshidBackend_Colab.ipynb +MurshidBackend_Colab_Report.md +interface_pictures/ +murshid_backend/.venv +murshid_backend/__pycache__ +murshid_backend/TECHNICAL_REPORT.md diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..7fbd2c18033f804f7f89e0fab8e904767abfb0ca --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +*.joblib filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.xlsx filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..9a17f72da2c6ac77675830444782013dde54f79e --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +__pycache__/ +*.pyc +*.pyo +.venv/ +venv/ +*.db +*.log +**/.env +.env.local +murshid_backend_for_drive.zip +interface_pictures/ diff --git a/DEPLOY_GUIDE.md b/DEPLOY_GUIDE.md new file mode 100644 index 0000000000000000000000000000000000000000..43852eb112797ba2966ce91250449d6ad09ec459 --- /dev/null +++ b/DEPLOY_GUIDE.md @@ -0,0 +1,103 @@ +# ๐Ÿš€ ุฏู„ูŠู„ ุงู„ู†ุดุฑ ุนู„ู‰ Hugging Face Spaces + +## ุงู„ู…ุชุทู„ุจุงุช +- ุญุณุงุจ ุนู„ู‰ [Hugging Face](https://huggingface.co/) (ู…ุฌุงู†ูŠ) +- [Git](https://git-scm.com/) ู…ุซุจู‘ุช ุนู„ู‰ ุฌู‡ุงุฒูƒ + +--- + +## ุงู„ุฎุทูˆุงุช + +### 1. ุฅู†ุดุงุก Space ุฌุฏูŠุฏ + +1. ุงุฐู‡ุจ ุฅู„ู‰: https://huggingface.co/new-space +2. **Space name**: `murshid` +3. **SDK**: ุงุฎุชุฑ **Docker** +4. **Visibility**: Public (ู…ุฌุงู†ูŠ) ุฃูˆ Private +5. ุงุถุบุท **Create Space** + +### 2. 
ุฑูุน ุงู„ู…ุดุฑูˆุน + +```powershell +cd d:\murishd + +# ุชู‡ูŠุฆุฉ Git (ุฅุฐุง ู„ู… ูŠูƒู† ู…ูˆุฌูˆุฏุงู‹) +git init + +# ุฅุถุงูุฉ ุงู„ู€ remote (ุบูŠู‘ุฑ YOUR_USERNAME ุจุงุณู… ุญุณุงุจูƒ) +git remote add space https://huggingface.co/spaces/YOUR_USERNAME/murshid + +# ุฅุถุงูุฉ ุงู„ู…ู„ูุงุช ูˆุงู„ุฑูุน +git add . +git commit -m "Initial deployment" +git push space main +``` + +> โš ๏ธ ุฅุฐุง ุทู„ุจ ูƒู„ู…ุฉ ู…ุฑูˆุฑุŒ ุงุณุชุฎุฏู… **Access Token** ู…ู†: +> https://huggingface.co/settings/tokens + +### 3. ุฅุนุฏุงุฏ ุงู„ู…ุชุบูŠุฑุงุช ุงู„ุจูŠุฆูŠุฉ (Secrets) + +ุงุฐู‡ุจ ุฅู„ู‰ ุฅุนุฏุงุฏุงุช ุงู„ู€ Space: `Settings โ†’ Variables and secrets` + +ุฃุถู ู‡ุฐู‡ ุงู„ู…ุชุบูŠุฑุงุช: + +| ุงู„ุงุณู… | ุงู„ู‚ูŠู…ุฉ | ุงู„ู†ูˆุน | +|-------|--------|-------| +| `MURSHID_DB_URL` | `sqlite:////app/data/murshid.db` | Variable | +| `MURSHID_MODELS_DIR` | `/app/Needed` | Variable | +| `MURSHID_SKIP_LLM` | `true` | Variable | +| `SECRET_KEY` | (ุงุฎุชุฑ ูƒู„ู…ุฉ ุณุฑ ุนุดูˆุงุฆูŠุฉ) | **Secret** | +| `HF_TOKEN` | (ุงุฎุชูŠุงุฑูŠ โ€” ู„ูˆ ุชุจุบู‰ Llama) | **Secret** | + +### 4. 
ุงู†ุชุธุฑ ุงู„ุจู†ุงุก + +- HF Spaces ูŠุจู†ูŠ ุงู„ู€ Docker image ุชู„ู‚ุงุฆูŠุงู‹ +- ูŠุฃุฎุฐ **3-5 ุฏู‚ุงุฆู‚** ู„ู„ุจู†ุงุก ุงู„ุฃูˆู„ +- ุจุนุฏ ุงู„ู†ุฌุงุญุŒ ุงู„ุฑุงุจุท ูŠูƒูˆู†: + ``` + https://YOUR_USERNAME-murshid.hf.space + ``` + +--- + +## ุงู„ุฑูˆุงุจุท ุจุนุฏ ุงู„ู†ุดุฑ + +| ุงู„ุฑุงุจุท | ุงู„ูˆุตู | +|--------|-------| +| `https://YOUR_USERNAME-murshid.hf.space` | ุงู„ูˆุงุฌู‡ุฉ ุงู„ุฑุฆูŠุณูŠุฉ | +| `https://YOUR_USERNAME-murshid.hf.space/docs` | ุชูˆุซูŠู‚ Swagger | +| `https://YOUR_USERNAME-murshid.hf.space/health` | ูุญุต ุงู„ุญุงู„ุฉ | + +--- + +## ู…ู„ุงุญุธุงุช + +### ุงู„ูˆุถุน ุงู„ุญุงู„ูŠ (LITE mode) +- ุงู„ู…ุดุฑูˆุน ูŠู†ุดุฑ ุจูˆุถุน **LITE** (ุจุฏูˆู† torch/SecureBERT+) +- ุชุญู„ูŠู„ ุงู„ู‚ูˆุงุนุฏ ูŠุนู…ู„ ู„ูƒู† ุจุฏู‚ุฉ ุฃู‚ู„ (embeddings ุนุดูˆุงุฆูŠุฉ) +- ู…ู†ุงุณุจ ู„ุงุฎุชุจุงุฑ ุงู„ูˆุงุฌู‡ุฉ ูˆุงู„ู€ API + +### ู„ู„ุชุฑู‚ูŠุฉ ุฅู„ู‰ LOCAL mode (SecureBERT+ ุจุฏูˆู† Llama) +ุนุฏู‘ู„ `Dockerfile` ูˆุฃุฒู„ ุงู„ุชุนู„ูŠู‚ ู…ู† ุณุทุฑ torch: +```dockerfile +RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu transformers sentencepiece +``` +> โš ๏ธ ู‡ุฐุง ูŠุฒูŠุฏ ุญุฌู… ุงู„ุตูˆุฑุฉ ~800MB ูˆูŠุญุชุงุฌ ุฐุงูƒุฑุฉ ุฃูƒุซุฑ + +### ู„ู„ุชุฑู‚ูŠุฉ ุฅู„ู‰ FULL mode (ู…ุน Llama 3) +- ุบูŠู‘ุฑ ุงู„ู€ Space ุฅู„ู‰ **GPU (T4)** ู…ู† ุงู„ุฅุนุฏุงุฏุงุช ($0.60/ุณุงุนุฉ) +- ุนุฏู‘ู„ `MURSHID_SKIP_LLM=false` +- ุฃุถู `HF_TOKEN` ููŠ ุงู„ู€ Secrets +- ุงุณุชุฎุฏู… `requirements.txt` ุงู„ูƒุงู…ู„ ุจุฏู„ `requirements_light.txt` + +--- + +## ุงุณุชูƒุดุงู ุงู„ุฃุฎุทุงุก + +| ุงู„ู…ุดูƒู„ุฉ | ุงู„ุญู„ | +|---------|------| +| Build ูุดู„ | ุชุญู‚ู‚ ู…ู† ุงู„ู€ Logs ููŠ ุชุจูˆูŠุจ ุงู„ู€ Space | +| 502 Bad Gateway | ุงู†ุชุธุฑ ุฏู‚ูŠู‚ุฉ โ€” ุงู„ุฎุงุฏู… ูŠุจุฏุฃ | +| DB ุฎุทุฃ | ุชุญู‚ู‚ ู…ู† `MURSHID_DB_URL` ููŠ ุงู„ู…ุชุบูŠุฑุงุช | +| Frontend ู„ุง ูŠุชุตู„ | ุงู„ู€ BASE URL ุฃุตุจุญ ุชู„ู‚ุงุฆูŠ (`window.location.origin`) | diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 
0000000000000000000000000000000000000000..6fab11db62f4f3e595d30e761e30284c3cdf321c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,49 @@ +FROM python:3.11-slim + +# System deps +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential libxml2-dev libxslt1-dev \ + && rm -rf /var/lib/apt/lists/* + +# Create non-root user (HF Spaces requirement) +RUN useradd -m -u 1000 appuser + +WORKDIR /app + +# Copy requirements first for layer caching +COPY murshid_backend/requirements_light.txt ./requirements.txt +RUN pip install --no-cache-dir -r requirements.txt \ + && pip install --no-cache-dir openpyxl aiofiles scikit-learn + +# Optional: install torch CPU-only for LOCAL mode (SecureBERT+ embeddings) +# Uncomment the next line if you want LOCAL mode (adds ~800MB to image) +# RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu transformers sentencepiece + +# Copy backend code +COPY murshid_backend/ ./murshid_backend/ + +# Copy model files +COPY Needed/ ./Needed/ + +# Copy frontend +COPY murshid_frontend/ ./murshid_frontend/ + +# Create writable directory for SQLite DB +RUN mkdir -p /app/data && chown -R appuser:appuser /app + +# Setup environment +ENV MURSHID_DB_URL=sqlite:////app/data/murshid.db +ENV MURSHID_MODELS_DIR=/app/Needed +ENV MURSHID_SKIP_LLM=true +ENV SECRET_KEY=murshid_hf_space_2026 +ENV PORT=7860 + +# Run DB migrations + import templates + start server +COPY start.sh ./start.sh +RUN chmod +x start.sh + +USER appuser + +EXPOSE 7860 + +CMD ["./start.sh"] diff --git a/MurshidBackend_Colab.ipynb b/MurshidBackend_Colab.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..447346989a45fec55b47f11fe7405490972d70a1 --- /dev/null +++ b/MurshidBackend_Colab.ipynb @@ -0,0 +1,967 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ๐Ÿ›ก๏ธ Murshid Backend โ€” Full Mode on Colab\n", + "\n", + "**ู…ูุฑุดูุฏ | From Alerts to Guidance: MITRE ATT&CK-Aligned Techniques 
Mapping for SOC Analysts**\n", + "\n", + "---\n", + "\n", + "## ๐Ÿ“ ุงู„ู…ู„ูุงุช ุงู„ู…ุทู„ูˆุจุฉ ุนู„ู‰ Google Drive\n", + "\n", + "```\n", + "MyDrive/\n", + "โ”œโ”€โ”€ murshid_backend_for_drive.zip โ† ุงุฑูุนูŠู‡ ุซู… ุดุบู‘ู„ูŠ ุงู„ุฎู„ูŠุฉ 2b ู„ุงุณุชุฎุฑุงุฌู‡\n", + "โ”‚ ุฃูˆ\n", + "โ”œโ”€โ”€ murshid_backend/ โ† ุฅุฐุง ุงุณุชุฎุฑุฌุชู‡ ู…ุณุจู‚ุงู‹\n", + "โ”‚ โ”œโ”€โ”€ app/\n", + "โ”‚ โ”œโ”€โ”€ alembic/\n", + "โ”‚ โ”œโ”€โ”€ scripts/\n", + "โ”‚ โ”œโ”€โ”€ alembic.ini\n", + "โ”‚ โ””โ”€โ”€ requirements.txt\n", + "โ”‚\n", + "โ””โ”€โ”€ Needed/\n", + " โ”œโ”€โ”€ murshid_logreg_pipeline_manual_oof_pcatuned.joblib\n", + " โ”œโ”€โ”€ murshid_logreg_thresholds_manual_oof_pcatuned.npy\n", + " โ”œโ”€โ”€ murshid_label_columns.json\n", + " โ””โ”€โ”€ murshid_query_template_structure_clean_shared.xlsx\n", + "```\n", + "\n", + "## ุชุนู„ูŠู…ุงุช ุงู„ุชุดุบูŠู„\n", + "\n", + "### ุงู„ู…ุชุทู„ุจุงุช ู‚ุจู„ ุงู„ุชุดุบูŠู„\n", + "1. โœ… **GPU ู…ููุนูŽู‘ู„:** `Runtime โ†’ Change runtime type โ†’ T4 GPU`\n", + "2. โœ… **Google Drive ู…ูุชูŽู‘ุตู„** (ูŠุญุชูˆูŠ ู…ุฌู„ุฏ `Needed` ุจู…ู„ูุงุช ุงู„ู†ู…ุงุฐุฌ)\n", + "3. 
โœ… **ู…ุฌู„ุฏ `murshid_backend`** ุนู„ู‰ Drive ุฃูˆ ุฑูุนู‡ ูŠุฏูˆูŠุงู‹\n", + "\n", + "### ุงู„ู…ู„ูุงุช ุงู„ู…ุทู„ูˆุจุฉ ููŠ Google Drive\n", + "```\n", + "MyDrive/\n", + "โ”œโ”€โ”€ Needed/\n", + "โ”‚ โ”œโ”€โ”€ murshid_logreg_pipeline_manual_oof_pcatuned.joblib\n", + "โ”‚ โ”œโ”€โ”€ murshid_logreg_thresholds_manual_oof_pcatuned.npy\n", + "โ”‚ โ”œโ”€โ”€ murshid_label_columns.json\n", + "โ”‚ โ””โ”€โ”€ murshid_query_template_structure_clean_shared.xlsx\n", + "โ””โ”€โ”€ murshid_backend/ โ† ู…ุฌู„ุฏ ุงู„ุจุงูƒู†ุฏ ูƒุงู…ู„ุงู‹\n", + "```\n", + "\n", + "### ุชุฑุชูŠุจ ุงู„ุชุดุบูŠู„\n", + "**ุดุบู‘ู„ูŠ ุงู„ุฎู„ุงูŠุง ุจุงู„ุชุฑุชูŠุจ ู…ู† ุงู„ุฃุนู„ู‰ ู„ู„ุฃุณูู„ โ€” ู„ุง ุชุชุฎุทู‘ูŠ ุฃูŠ ุฎู„ูŠุฉ**\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ุงู„ุฎู„ูŠุฉ 1: ุงู„ุชุญู‚ู‚ ู…ู† GPU\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "print('CUDA available:', torch.cuda.is_available())\n", + "if torch.cuda.is_available():\n", + " print('GPU:', torch.cuda.get_device_name(0))\n", + " print('Memory:', round(torch.cuda.get_device_properties(0).total_memory / 1e9, 1), 'GB')\n", + "else:\n", + " print('โš ๏ธ ู„ุง ูŠูˆุฌุฏ GPU โ€” ุบูŠู‘ุฑูŠ Runtime ุฅู„ู‰ T4 ู…ู† ุงู„ู‚ุงุฆู…ุฉ ุฃุนู„ุงู‡')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ุงู„ุฎู„ูŠุฉ 2: ุชุญู…ูŠู„ Google Drive\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ุงู„ุฎู„ูŠุฉ 3: ุชุฌู‡ูŠุฒ ุงู„ุจุงูƒู†ุฏ ููŠ /content\n", + "\n", + "> ุชู‚ูˆู… ู‡ุฐู‡ ุงู„ุฎู„ูŠุฉ ุชู„ู‚ุงุฆูŠุงู‹ ุจู€:\n", + "> 1. ุงุณุชุฎุฑุงุฌ ZIP ู…ู† Drive (ุฅุฐุง ูƒุงู† ZIP ู…ูˆุฌูˆุฏุงู‹ ูˆู„ู… ูŠูุณุชุฎุฑุฌ ุจุนุฏ)\n", + "> 2. ู†ุณุฎ ู…ุฌู„ุฏ `murshid_backend` ุฅู„ู‰ `/content` (ุฃุณุฑุน ู„ู„ู‚ุฑุงุกุฉ)\n", + "> 3. 
ุถุจุท Python path\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('(ู‡ุฐู‡ ุงู„ุฎู„ูŠุฉ ูุงุฑุบุฉ โ€” ุงู„ูƒูˆุฏ ุงู†ุชู‚ู„ ุฅู„ู‰ ุงู„ุฎู„ูŠุฉ 3 ุฃุฏู†ุงู‡)')\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from google.colab import drive\n", + "import os\n", + "\n", + "drive.mount('/content/drive')\n", + "\n", + "# โœ๏ธ ุนุฏู‘ู„ูŠ ู‡ุฐุง ุงู„ู…ุณุงุฑ ุฅุฐุง ูƒุงู† ู…ุฌู„ุฏูƒ ู…ุฎุชู„ูุงู‹\n", + "NEEDED_PATH = '/content/drive/MyDrive/Needed'\n", + "BACKEND_PATH = '/content/drive/MyDrive/murshid_backend'\n", + "ZIP_PATH = '/content/drive/MyDrive/murshid_backend_for_drive.zip'\n", + "\n", + "print('=' * 55)\n", + "print('๐Ÿ“‚ Checking Google Drive files...')\n", + "print('=' * 55)\n", + "\n", + "# โ”€โ”€ ุงู„ุชุญู‚ู‚ ู…ู† ู…ู„ูุงุช Needed โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "print('\\n๐Ÿ“ Needed/ (model files):')\n", + "required_files = {\n", + " 'murshid_logreg_pipeline_manual_oof_pcatuned.joblib': 'LogReg model',\n", + " 'murshid_logreg_thresholds_manual_oof_pcatuned.npy': 'LogReg thresholds',\n", + " 'murshid_label_columns.json': 'Technique names',\n", + "}\n", + "\n", + "models_ok = True\n", + "for fname, desc in required_files.items():\n", + " path = f'{NEEDED_PATH}/{fname}'\n", + " exists = os.path.isfile(path)\n", + " size = f'{os.path.getsize(path)/1024:.0f} KB' if exists else ''\n", + " status = 'โœ…' if exists else 'โŒ'\n", + " print(f' {status} {fname} {size}')\n", + " if not exists:\n", + " models_ok = False\n", + "\n", + "excel_path = f'{NEEDED_PATH}/murshid_query_template_structure_clean_shared.xlsx'\n", + "excel_ok = os.path.isfile(excel_path)\n", + "print(f' {\"โœ…\" if excel_ok else \"โš ๏ธ \"} murshid_query_template_structure_clean_shared.xlsx (optional)')\n", + "\n", + "# โ”€โ”€ ุงู„ุชุญู‚ู‚ ู…ู† ุงู„ุจุงูƒู†ุฏ 
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "print('\\n๐Ÿ“ murshid_backend/ (backend code):')\n", + "backend_ok = os.path.isdir(BACKEND_PATH)\n", + "zip_ok = os.path.isfile(ZIP_PATH)\n", + "\n", + "if backend_ok:\n", + " fcount = sum(len(f) for _, _, f in os.walk(BACKEND_PATH))\n", + " print(f' โœ… murshid_backend/ ({fcount} files)')\n", + "elif zip_ok:\n", + " zsize = f'{os.path.getsize(ZIP_PATH)/1024:.0f} KB'\n", + " print(f' ๐Ÿ“ฆ murshid_backend_for_drive.zip ({zsize}) โ€” ุณูŠูุณุชุฎุฑุฌ ุชู„ู‚ุงุฆูŠุงู‹ ููŠ ุงู„ุฎู„ูŠุฉ 3')\n", + "else:\n", + " print(f' โŒ murshid_backend/ ุบูŠุฑ ู…ูˆุฌูˆุฏ')\n", + " print(f' โŒ murshid_backend_for_drive.zip ุบูŠุฑ ู…ูˆุฌูˆุฏ')\n", + " print(f'\\n โš ๏ธ ุงุฑูุนูŠ murshid_backend_for_drive.zip ุฅู„ู‰:')\n", + " print(f' Google Drive โ†’ My Drive')\n", + "\n", + "# โ”€โ”€ ู…ู„ุฎุต โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "print('\\n' + '=' * 55)\n", + "if models_ok and (backend_ok or zip_ok):\n", + " print('โœ… ูƒู„ ุดูŠุก ุฌุงู‡ุฒ โ€” ุชุงุจุนูŠ ุชุดุบูŠู„ ุงู„ุฎู„ุงูŠุง')\n", + "elif not models_ok:\n", + " print('โŒ ู…ู„ูุงุช ุงู„ู†ู…ุงุฐุฌ ู…ูู‚ูˆุฏุฉ ู…ู† Needed/ โ€” ูŠุฌุจ ุฑูุนู‡ุง ุฃูˆู„ุงู‹')\n", + "else:\n", + " print('โŒ ู…ู„ูุงุช ุงู„ุจุงูƒู†ุฏ ู…ูู‚ูˆุฏุฉ โ€” ุงุฑูุนูŠ ZIP ุฃูˆู„ุงู‹')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ุงู„ุฎู„ูŠุฉ 3: ู†ุณุฎ ุงู„ุจุงูƒู†ุฏ ุฅู„ู‰ /content\n", + "\n", + "> ู†ุณุฎ ุงู„ู…ู„ูุงุช ู…ู† Drive ุฅู„ู‰ `/content` ู„ุชุณุฑูŠุน ุงู„ู‚ุฑุงุกุฉ\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shutil, os, zipfile, sys\n", + "\n", + "DRIVE_BASE = '/content/drive/MyDrive'\n", + "ZIP_PATH = f'{DRIVE_BASE}/murshid_backend_for_drive.zip'\n", + 
"BACKEND_DRIVE= f'{DRIVE_BASE}/murshid_backend'\n", + "BACKEND_LOCAL= '/content/murshid_backend'\n", + "\n", + "# โ”€โ”€ ุงู„ุฎุทูˆุฉ 1: ุงุณุชุฎุฑุงุฌ ZIP ู…ู† Drive ุฅุฐุง ู„ุฒู… โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "if not os.path.isdir(BACKEND_DRIVE):\n", + " if os.path.isfile(ZIP_PATH):\n", + " print(f'๐Ÿ“ฆ ZIP found โ€” extracting to Drive...')\n", + " with zipfile.ZipFile(ZIP_PATH, 'r') as z:\n", + " z.extractall(DRIVE_BASE)\n", + " print(f'โœ… Extracted to {BACKEND_DRIVE}')\n", + " else:\n", + " print('โŒ ERROR: ู…ุฌู„ุฏ murshid_backend ุบูŠุฑ ู…ูˆุฌูˆุฏ ุนู„ู‰ Drive')\n", + " print(f' ุงู„ู…ุทู„ูˆุจ: {BACKEND_DRIVE}')\n", + " print(f' ุฃูˆ ุฑูุน: {ZIP_PATH}')\n", + " raise FileNotFoundError(f'Backend not found. Upload murshid_backend_for_drive.zip to Google Drive MyDrive.')\n", + "else:\n", + " print(f'โœ… murshid_backend found on Drive: {BACKEND_DRIVE}')\n", + "\n", + "# โ”€โ”€ ุงู„ุฎุทูˆุฉ 2: ู†ุณุฎ ุฅู„ู‰ /content (ุฃุณุฑุน ุจูƒุซูŠุฑ ู…ู† Drive ุฃุซู†ุงุก ุงู„ุชุดุบูŠู„) โ”€\n", + "if os.path.exists(BACKEND_LOCAL):\n", + " shutil.rmtree(BACKEND_LOCAL)\n", + "\n", + "shutil.copytree(\n", + " BACKEND_DRIVE,\n", + " BACKEND_LOCAL,\n", + " ignore=shutil.ignore_patterns('__pycache__', '*.pyc', '.venv', '*.db', '*.log')\n", + ")\n", + "\n", + "# โ”€โ”€ ุงู„ุฎุทูˆุฉ 3: ุฅุถุงูุฉ ู„ู„ู€ Python path โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "if BACKEND_LOCAL not in sys.path:\n", + " sys.path.insert(0, BACKEND_LOCAL)\n", + "\n", + "os.chdir(BACKEND_LOCAL)\n", + "\n", + "# โ”€โ”€ ุชุญู‚ู‚ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "file_count = sum(len(files) for _, _, files in os.walk(BACKEND_LOCAL))\n", + "print(f'โœ… Backend ready at {BACKEND_LOCAL} ({file_count} files)')\n", + "print(f'โœ… Working dir: {os.getcwd()}')\n", + "\n", + "# ุนุฑุถ ุงู„ู‡ูŠูƒู„\n", + 
"print('\\nStructure:')\n", + "for item in sorted(os.listdir(BACKEND_LOCAL)):\n", + " full = os.path.join(BACKEND_LOCAL, item)\n", + " if os.path.isdir(full):\n", + " sub_count = len(os.listdir(full))\n", + " print(f' ๐Ÿ“ {item}/ ({sub_count} items)')\n", + " else:\n", + " size = os.path.getsize(full)\n", + " print(f' ๐Ÿ“„ {item} ({size:,} bytes)')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ุงู„ุฎู„ูŠุฉ 4: ุชุซุจูŠุช ุงู„ู…ุชุทู„ุจุงุช\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('๐Ÿ“ฆ Installing requirements...')\n", + "\n", + "# โ”€โ”€ ุงู„ุญุฒู… ุงู„ุฃุณุงุณูŠุฉ ู„ู„ุจุงูƒู†ุฏ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "!pip install -q \\\n", + " fastapi==0.115.0 \\\n", + " \"uvicorn[standard]==0.32.0\" \\\n", + " pydantic==2.9.0 \\\n", + " pydantic-settings==2.6.0 \\\n", + " python-dotenv==1.0.0 \\\n", + " sqlalchemy==2.0.0 \\\n", + " alembic==1.13.0 \\\n", + " aiofiles \\\n", + " scikit-learn==1.6.1 \\\n", + " joblib \\\n", + " lxml \\\n", + " openpyxl \\\n", + " nest-asyncio \\\n", + " pyngrok\n", + "\n", + "# โ”€โ”€ bitsandbytes: ู…ุทู„ูˆุจ ู„ุชุญู…ูŠู„ LLaMA ุจู€ 4-bit ุนู„ู‰ GPU โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "print('๐Ÿ“ฆ Installing bitsandbytes (required for LLaMA 4-bit)...')\n", + "!pip install -q -U \"bitsandbytes>=0.46.1\"\n", + "\n", + "# โ”€โ”€ accelerate: ู…ุทู„ูˆุจ ู„ู€ device_map=\"auto\" โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "!pip install -q -U accelerate\n", + "\n", + "# โ”€โ”€ ุชุญู‚ู‚ ู…ู† ุงู„ุชุซุจูŠุช โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "import importlib\n", + "for pkg in ['bitsandbytes', 'accelerate', 'fastapi', 'sklearn']:\n", + " try:\n", + " mod = importlib.import_module(pkg if pkg != 'sklearn' else 
'sklearn')\n", + " ver = getattr(mod, '__version__', '?')\n", + " print(f' โœ… {pkg}=={ver}')\n", + " except ImportError:\n", + " print(f' โŒ {pkg} โ€” ูุดู„ ุงู„ุชุซุจูŠุช')\n", + "\n", + "print('\\nโœ… All requirements installed')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ุงู„ุฎู„ูŠุฉ 5: ุฅุนุฏุงุฏ ู…ู„ู .env\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# โœ๏ธ ุถุนูŠ HF Token ู‡ู†ุง ุฅุฐุง ู„ู… ุชูุถูŠููŠู‡ ุนุจุฑ Colab Secrets\n", + "HF_TOKEN = os.environ.get('HF_TOKEN', 'ุงุฏุฎู„ ุงู„ุชูˆูƒู†')\n", + "\n", + "env_content = f\"\"\"# Auto-generated .env for Colab FULL mode\n", + "MURSHID_DB_URL=sqlite:////content/murshid.db\n", + "MURSHID_MODELS_DIR={NEEDED_PATH}\n", + "HF_TOKEN={HF_TOKEN}\n", + "MURSHID_SKIP_LLM=false\n", + "SECRET_KEY=murshid_colab_2026\n", + "LLAMA_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct\n", + "EMBED_MODEL_ID=ehsanaghaei/SecureBERT_Plus\n", + "LOGREG_JOBLIB=murshid_logreg_pipeline_manual_oof_pcatuned.joblib\n", + "LOGREG_THRESHOLDS_NPY=murshid_logreg_thresholds_manual_oof_pcatuned.npy\n", + "LABEL_COLUMNS_JSON=murshid_label_columns.json\n", + "\"\"\"\n", + "\n", + "env_path = '/content/murshid_backend/.env'\n", + "with open(env_path, 'w') as f:\n", + " f.write(env_content)\n", + "\n", + "print('โœ… .env created at', env_path)\n", + "print('\\nContents:')\n", + "with open(env_path) as f:\n", + " for line in f:\n", + " if 'TOKEN' in line or 'SECRET' in line:\n", + " key = line.split('=')[0]\n", + " print(f' {key}=****')\n", + " else:\n", + " print(' ', line.rstrip())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ุงู„ุฎู„ูŠุฉ 6: ุชู‡ุฌูŠุฑ ู‚ุงุนุฏุฉ ุงู„ุจูŠุงู†ุงุช (Alembic)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess, os\n", + "\n", + 
"os.chdir('/content/murshid_backend')\n", + "\n", + "result = subprocess.run(\n", + " ['python', '-m', 'alembic', 'upgrade', 'head'],\n", + " capture_output=True, text=True\n", + ")\n", + "\n", + "print(result.stdout)\n", + "if result.stderr:\n", + " print(result.stderr)\n", + "\n", + "import os\n", + "db_exists = os.path.isfile('/content/murshid.db')\n", + "print('โœ… Database ready:', '/content/murshid.db' if db_exists else 'โŒ ู„ู… ูŠูู†ุดุฃ')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ุงู„ุฎู„ูŠุฉ 7: ุงุณุชูŠุฑุงุฏ ู‚ูˆุงู„ุจ WQL ู…ู† Excel\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.insert(0, '/content/murshid_backend')\n", + "os.chdir('/content/murshid_backend')\n", + "\n", + "excel_path = f'{NEEDED_PATH}/murshid_query_template_structure_clean_shared.xlsx'\n", + "\n", + "if os.path.isfile(excel_path):\n", + " from app.db.session import SessionLocal\n", + " from scripts.import_excel_templates import run as import_excel\n", + "\n", + " db = SessionLocal()\n", + " try:\n", + " result = import_excel(db, replace=False)\n", + " print('โœ… Excel import result:')\n", + " for k, v in result.items():\n", + " print(f' {k}: {v}')\n", + " finally:\n", + " db.close()\n", + "else:\n", + " print(f'โš ๏ธ Excel file not found at: {excel_path}')\n", + " print(' ูŠู…ูƒู†ูƒ ุงู„ู…ุชุงุจุนุฉ โ€” ุงู„ู‚ูˆุงู„ุจ ุณุชูุถุงู ู„ุงุญู‚ุงู‹ ูŠุฏูˆูŠุงู‹')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ุงู„ุฎู„ูŠุฉ 8: ุชุดุบูŠู„ FastAPI + ngrok\n", + "\n", + "> โณ ู‡ุฐู‡ ุงู„ุฎู„ูŠุฉ ุชุฃุฎุฐ **5-10 ุฏู‚ุงุฆู‚** ู„ุชุญู…ูŠู„ LLaMA (4.5GB) ูˆ SecureBERT+\n", + "\n", + "> ๐Ÿ”‘ **ุงู„ุฑุงุจุท ุงู„ุนุงู… ุณูŠุธู‡ุฑ ููŠ ุงู„ู†ู‡ุงูŠุฉ** โ€” ุงู†ุณุฎูŠู‡ ู„ู„ูุฑูˆู†ุช\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import 
subprocess, time, os, sys, urllib.request\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "os.chdir('/content/murshid_backend')\n", + "\n", + "# โ”€โ”€โ”€ ุงู„ุชุญู‚ู‚ ู…ู† bitsandbytes ู‚ุจู„ ุชุดุบูŠู„ ุงู„ุฎุงุฏู… โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "try:\n", + " import bitsandbytes as bnb\n", + " print(f'โœ… bitsandbytes {bnb.__version__}')\n", + "except ImportError:\n", + " print('โŒ bitsandbytes ุบูŠุฑ ู…ุซุจู‘ุช โ€” ุดุบู‘ู„ูŠ ุงู„ุฎู„ูŠุฉ 4 ุฃูˆู„ุงู‹')\n", + " raise\n", + "\n", + "# โ”€โ”€โ”€ ุชุดุบูŠู„ uvicorn โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "log_path = '/content/murshid_server.log'\n", + "log_file = open(log_path, 'w')\n", + "\n", + "server_proc = subprocess.Popen(\n", + " [\n", + " 'python', '-m', 'uvicorn', 'app.main:app',\n", + " '--host', '0.0.0.0',\n", + " '--port', '8000',\n", + " '--log-level', 'info'\n", + " ],\n", + " cwd='/content/murshid_backend',\n", + " stdout=log_file,\n", + " stderr=subprocess.STDOUT\n", + ")\n", + "\n", + "print('โณ Loading LLaMA 3 8B + SecureBERT+...')\n", + "print(' ุฌุงุฑูŠ ุงู„ุชุญู…ูŠู„ โ€” ุงู†ุชุธุฑูŠ ุญุชู‰ ุชุธู‡ุฑ ุงู„ุฑุณุงู„ุฉ ุงู„ู†ู‡ุงุฆูŠุฉ')\n", + "\n", + "# โ”€โ”€โ”€ ุงู†ุชุธุงุฑ ุฐูƒูŠ ู…ุน ุนุฑุถ ุงู„ู„ูˆุฌ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "started = False\n", + "last_log_size = 0\n", + "\n", + "for i in range(180): # 15 ุฏู‚ูŠู‚ุฉ ูƒุญุฏ ุฃู‚ุตู‰\n", + " time.sleep(5)\n", + "\n", + " # ุชุญู‚ู‚ ุฅุฐุง ุจุฏุฃ ุงู„ุฎุงุฏู…\n", + " try:\n", + " resp = urllib.request.urlopen('http://localhost:8000/health', timeout=3)\n", + " if resp.status == 200:\n", + " started = True\n", + " break\n", + " except Exception:\n", + " pass\n", + "\n", + " # ุนุฑุถ ุงู„ู„ูˆุฌ ุงู„ุฌุฏูŠุฏ ูƒู„ 30 ุซุงู†ูŠุฉ\n", + " if i % 6 == 0:\n", + " elapsed = (i + 1) * 5\n", + " log_file.flush()\n", + " try:\n", + " with 
open(log_path) as f:\n", + " log_content = f.read()\n", + " new_content = log_content[last_log_size:]\n", + " last_log_size = len(log_content)\n", + "\n", + " # ุชุญู‚ู‚ ู…ู† ุฎุทุฃ ู…ุจูƒุฑ\n", + " if 'ERROR' in new_content or 'ImportError' in new_content:\n", + " print(f'\\nโŒ ุฎุทุฃ ููŠ ุงู„ุฎุงุฏู… ุนู†ุฏ {elapsed}s:')\n", + " # ุนุฑุถ ุขุฎุฑ 1000 ุญุฑู ู…ู† ุงู„ู„ูˆุฌ\n", + " print(log_content[-1500:])\n", + " server_proc.terminate()\n", + " log_file.close()\n", + " raise RuntimeError('Server failed to start. See log above.')\n", + "\n", + " # ุนุฑุถ ู…ุง ุชู… ุชุญู…ูŠู„ู‡\n", + " if 'Loaded' in new_content or 'loaded' in new_content or 'Application' in new_content:\n", + " for line in new_content.strip().split('\\n'):\n", + " if any(k in line for k in ['INFO', 'Loaded', 'loaded', 'Application', 'WARNING']):\n", + " print(f' {line.strip()}')\n", + " else:\n", + " mins = elapsed // 60\n", + " secs = elapsed % 60\n", + " print(f' โณ {mins}m {secs}s โ€” ูŠุฌุฑูŠ ุชุญู…ูŠู„ ุงู„ู†ู…ุงุฐุฌ...')\n", + " except RuntimeError:\n", + " raise\n", + " except Exception:\n", + " print(f' โณ {elapsed}s elapsed...')\n", + "\n", + "log_file.flush()\n", + "log_file.close()\n", + "\n", + "if not started:\n", + " print('\\nโŒ Server did not start after 15 minutes.')\n", + " print('โ”€โ”€โ”€ ุขุฎุฑ ุณุทูˆุฑ ุงู„ู„ูˆุฌ โ”€โ”€โ”€')\n", + " with open(log_path) as f:\n", + " print(f.read()[-3000:])\n", + "else:\n", + " print('\\nโœ… Server started successfully!')\n", + "\n", + " # โ”€โ”€โ”€ Cloudflare Tunnel (ู…ุฌุงู†ูŠ โ€” ุจุฏูˆู† ุญุณุงุจ) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + " import subprocess, re, threading, time\n", + "\n", + " # ุชุซุจูŠุช cloudflared\n", + " subprocess.run(\n", + " ['wget', '-q', 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64',\n", + " '-O', '/usr/local/bin/cloudflared'],\n", + " check=True\n", + " )\n", + " subprocess.run(['chmod', '+x', '/usr/local/bin/cloudflared'], check=True)\n", 
+ " print('โœ… cloudflared installed')\n", + "\n", + " # ุชุดุบูŠู„ ุงู„ู†ูู‚\n", + " cf_log = open('/content/cloudflared.log', 'w')\n", + " cf_proc = subprocess.Popen(\n", + " ['cloudflared', 'tunnel', '--url', 'http://localhost:8000'],\n", + " stdout=cf_log, stderr=subprocess.STDOUT\n", + " )\n", + "\n", + " # ุงู†ุชุธุงุฑ ุธู‡ูˆุฑ ุงู„ุฑุงุจุท ููŠ ุงู„ู„ูˆุฌ\n", + " public_url = None\n", + " for _ in range(30):\n", + " time.sleep(2)\n", + " cf_log.flush()\n", + " try:\n", + " with open('/content/cloudflared.log') as f:\n", + " content = f.read()\n", + " match = re.search(r'https://[a-z0-9\\-]+\\.trycloudflare\\.com', content)\n", + " if match:\n", + " public_url = match.group(0)\n", + " break\n", + " except Exception:\n", + " pass\n", + "\n", + " if public_url:\n", + " print('\\n' + '='*60)\n", + " print('๐ŸŒ PUBLIC URL (ุงู„ุฑุงุจุท ุงู„ุนุงู… โ€” Cloudflare):')\n", + " print(f' {public_url}')\n", + " print('='*60)\n", + " print(f'๐Ÿ“– Swagger: {public_url}/docs')\n", + " print(f'๐Ÿ’š Health: {public_url}/health')\n", + " print(f'๐Ÿ—„๏ธ DB Summary: {public_url}/api/db/summary')\n", + " print('='*60)\n", + " print('\\n๐Ÿ“‹ ุงู†ุณุฎูŠ ู‡ุฐุง ุงู„ุณุทุฑ ูˆุงู„ุตู‚ูŠู‡ ููŠ ุงู„ูุฑูˆู†ุช (index.html):')\n", + " print(f\" const BASE = '{public_url}';\")\n", + " else:\n", + " print('โš ๏ธ Cloudflare tunnel URL not found, check /content/cloudflared.log')\n", + " with open('/content/cloudflared.log') as f:\n", + " print(f.read()[-1000:])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# โ”€โ”€โ”€ ุชุดุบูŠู„ Cloudflare Tunnel ุจุดูƒู„ ู…ู†ูุตู„ (ุฅุฐุง ูุดู„ ู…ุน ุงู„ุฎู„ูŠุฉ 8) โ”€\n", + "# ุดุบู‘ู„ูŠ ู‡ุฐู‡ ุงู„ุฎู„ูŠุฉ ูู‚ุท ุฅุฐุง ูƒุงู† ุงู„ุฎุงุฏู… ูŠุนู…ู„ ู„ูƒู† ุงู„ู€ tunnel ูุดู„\n", + "\n", + "import subprocess, re, time, os\n", + "\n", + "# ุชุซุจูŠุช cloudflared ุฅุฐุง ู„ู… ูŠูุซุจูŽู‘ุช\n", + "if not os.path.isfile('/usr/local/bin/cloudflared'):\n", + " subprocess.run(\n", + " 
['wget', '-q',\n", + " 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64',\n", + " '-O', '/usr/local/bin/cloudflared'],\n", + " check=True\n", + " )\n", + " subprocess.run(['chmod', '+x', '/usr/local/bin/cloudflared'], check=True)\n", + " print('โœ… cloudflared installed')\n", + "else:\n", + " print('โœ… cloudflared already installed')\n", + "\n", + "# ุชุดุบูŠู„ ุงู„ู†ูู‚\n", + "cf_log_path = '/content/cloudflared.log'\n", + "cf_log = open(cf_log_path, 'w')\n", + "cf_proc = subprocess.Popen(\n", + " ['cloudflared', 'tunnel', '--url', 'http://localhost:8000'],\n", + " stdout=cf_log, stderr=subprocess.STDOUT\n", + ")\n", + "\n", + "print('โณ Opening Cloudflare tunnel...')\n", + "\n", + "public_url = None\n", + "for _ in range(30):\n", + " time.sleep(2)\n", + " cf_log.flush()\n", + " try:\n", + " with open(cf_log_path) as f:\n", + " content = f.read()\n", + " match = re.search(r'https://[a-z0-9\\-]+\\.trycloudflare\\.com', content)\n", + " if match:\n", + " public_url = match.group(0)\n", + " break\n", + " except Exception:\n", + " pass\n", + "\n", + "if public_url:\n", + " print('\\n' + '='*60)\n", + " print(f'๐ŸŒ PUBLIC URL: {public_url}')\n", + " print(f'๐Ÿ“– Swagger: {public_url}/docs')\n", + " print(f'๐Ÿ’š Health: {public_url}/health')\n", + " print('='*60)\n", + " print('\\n๐Ÿ“‹ ุงู„ุตู‚ูŠ ู‡ุฐุง ุงู„ุณุทุฑ ููŠ index.html:')\n", + " print(f\" const BASE = '{public_url}';\")\n", + "else:\n", + " print('โŒ ู„ู… ูŠูุนุซุฑ ุนู„ู‰ URL. 
ุงู„ู„ูˆุฌ:')\n", + " with open(cf_log_path) as f:\n", + " print(f.read())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ุงู„ุฎู„ูŠุฉ 9: ุฑุจุท ุงู„ูุฑูˆู†ุช ุจู€ Cloudflare URL\n", + "\n", + "ุจุนุฏ ุชุดุบูŠู„ ุงู„ุฎู„ูŠุฉ ุงู„ุณุงุจู‚ุฉุŒ ุณุชุธู‡ุฑ ุฑุณุงู„ุฉ ู…ุซู„:\n", + "```\n", + "๐ŸŒ PUBLIC URL: https://xxxx-xxxx.trycloudflare.com\n", + "```\n", + "\n", + "**ุงู„ุฎู„ูŠุฉ ุฃุฏู†ุงู‡ ุชูุญุฏู‘ุซ ุงู„ูุฑูˆู†ุช ุชู„ู‚ุงุฆูŠุงู‹** โ€” ุฃูˆ ูŠู…ูƒู†ูƒ ุงู„ุชุนุฏูŠู„ ูŠุฏูˆูŠุงู‹ ููŠ `index.html`:\n", + "```javascript\n", + "const BASE = 'https://xxxx-xxxx.trycloudflare.com';\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess, re, time, os\n", + "\n", + "# โ”€โ”€ ุงู„ุฎุทูˆุฉ 1: ุชุซุจูŠุช cloudflared โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "if not os.path.isfile('/usr/local/bin/cloudflared'):\n", + " subprocess.run([\n", + " 'wget', '-q',\n", + " 'https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64',\n", + " '-O', '/usr/local/bin/cloudflared'\n", + " ], check=True)\n", + " subprocess.run(['chmod', '+x', '/usr/local/bin/cloudflared'], check=True)\n", + " print('โœ… cloudflared installed')\n", + "else:\n", + " print('โœ… cloudflared ready')\n", + "\n", + "# โ”€โ”€ ุงู„ุฎุทูˆุฉ 2: ุชุดุบูŠู„ ุงู„ู†ูู‚ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "cf_log_path = '/content/cf.log'\n", + "cf_log = open(cf_log_path, 'w')\n", + "subprocess.Popen(\n", + " ['cloudflared', 'tunnel', '--url', 'http://localhost:8000'],\n", + " stdout=cf_log, stderr=subprocess.STDOUT\n", + ")\n", + "\n", + "print('โณ Opening Cloudflare tunnel...')\n", + "\n", + "# โ”€โ”€ ุงู„ุฎุทูˆุฉ 3: ุงู†ุชุธุงุฑ ุงู„ุฑุงุจุท 
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "public_url = None\n", + "for _ in range(30):\n", + " time.sleep(2)\n", + " cf_log.flush()\n", + " with open(cf_log_path) as f:\n", + " content = f.read()\n", + " match = re.search(r'https://[a-z0-9\\-]+\\.trycloudflare\\.com', content)\n", + " if match:\n", + " public_url = match.group(0)\n", + " break\n", + "\n", + "if not public_url:\n", + " print('โŒ Tunnel failed. Log:')\n", + " with open(cf_log_path) as f: print(f.read())\n", + "else:\n", + " # โ”€โ”€ ุงู„ุฎุทูˆุฉ 4: ุชุญุฏูŠุซ index.html ุชู„ู‚ุงุฆูŠุงู‹ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + " frontend_path = '/content/drive/MyDrive/murshid_frontend/index.html'\n", + "\n", + " if os.path.isfile(frontend_path):\n", + " with open(frontend_path, 'r', encoding='utf-8') as f:\n", + " html = f.read()\n", + " html_updated = re.sub(r\"const BASE = '[^']*';\",\n", + " f\"const BASE = '{public_url}';\", html)\n", + " with open(frontend_path, 'w', encoding='utf-8') as f:\n", + " f.write(html_updated)\n", + " print(f'โœ… index.html updated automatically')\n", + " else:\n", + " print(f'โš ๏ธ index.html not found โ€” ุนุฏู‘ู„ูŠู‡ ูŠุฏูˆูŠุงู‹')\n", + "\n", + " print('\\n' + '='*60)\n", + " print(f'๐ŸŒ PUBLIC URL: {public_url}')\n", + " print(f'๐Ÿ“– Swagger: {public_url}/docs')\n", + " print(f'๐Ÿ’š Health: {public_url}/health')\n", + " print(f'๐Ÿ–ฅ๏ธ Frontend: {public_url}/index.html')\n", + " print('='*60)\n", + " print(f\"\\n๐Ÿ“‹ const BASE = '{public_url}';\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ุงู„ุฎู„ูŠุฉ 10: ุงุฎุชุจุงุฑ ุงู„ู€ API\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import urllib.request, json\n", + "\n", + "# โ”€โ”€โ”€ Health Check 
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "with urllib.request.urlopen('http://localhost:8000/health') as r:\n", + " health = json.load(r)\n", + "\n", + "print('=== Health Check ===')\n", + "print(f\" status: {health['status']}\")\n", + "print(f\" pipeline_mode: {health['pipeline_mode']}\")\n", + "print(f\" llama_loaded: {health['components']['llama_loaded']}\")\n", + "print(f\" embedder_loaded: {health['components']['embedder_loaded']}\")\n", + "print(f\" logreg_loaded: {health['components']['logreg_loaded']}\")\n", + "print(f\" cuda_available: {health['components']['cuda_available']}\")\n", + "\n", + "mode = health.get('pipeline_mode', 'unknown')\n", + "if mode == 'full':\n", + " print('\\nโœ… FULL mode โ€” ู†ุชุงุฆุฌ ู…ุทุงุจู‚ุฉ 100% ู„ู„ุฏูุชุฑ')\n", + "elif mode == 'local':\n", + " print('\\nโš ๏ธ LOCAL mode โ€” LLaMA ู„ู… ูŠูุญู…ูŽู‘ู„ุŒ ุชุญู‚ู‚ูŠ ู…ู† MURSHID_SKIP_LLM=false')\n", + "else:\n", + " print('\\nโŒ LITE mode โ€” ุชุญู‚ู‚ูŠ ู…ู† ุชุซุจูŠุช torch ูˆุงู„ู†ู…ุงุฐุฌ')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# โ”€โ”€โ”€ ุชุญู„ูŠู„ ู‚ุงุนุฏุฉ ุงุฎุชุจุงุฑ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "import urllib.request, json\n", + "\n", + "test_rule = '''\n", + " 18201\n", + " ^634$|^4730$\n", + " Windows: Security Enabled Global Group Deleted\n", + " T1484\n", + " group_deleted,win_group_deleted\n", + "'''\n", + "\n", + "payload = json.dumps({'rule_xml': test_rule}).encode()\n", + "req = urllib.request.Request(\n", + " 'http://localhost:8000/rules/analyze',\n", + " data=payload,\n", + " headers={'Content-Type': 'application/json'},\n", + " method='POST'\n", + ")\n", + "\n", + "with urllib.request.urlopen(req) as r:\n", + " result = json.load(r)\n", + "\n", + "print('=== Analyze Result ===')\n", + 
"print(f\" rule_id: {result['rule_id']}\")\n", + "print(f\" pipeline_mode: {result['pipeline_mode']}\")\n", + "print(f\" summary: {result['summary']}\")\n", + "print(f\"\\n TOP 5 Techniques:\")\n", + "print(f\" {'Technique':<15} {'Conf%':>8} {'Proba':>8} {'Thr':>6} {'Gap':>8} {'Pred':>6}\")\n", + "print(f\" {'-'*55}\")\n", + "for r in result['all_results'][:5]:\n", + " pred = 'โœ…' if r['predicted'] else ' '\n", + " print(f\" {pred} {r['technique_id']:<13} {r['confidence_percent']:>7.2f}%\"\n", + " f\" {r['proba']:>8.4f} {r['threshold']:>6.2f} {r['gap']:>+8.4f}\")\n", + "\n", + "print(f\"\\n Detected: {len(result['detected'])} technique(s)\")\n", + "for d in result['detected']:\n", + " print(f\" โœ… {d['technique_id']} โ€” {d['confidence_percent']}%\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# โ”€โ”€โ”€ ู‚ูˆุงู„ุจ WQL ู„ู„ุชู‚ู†ูŠุฉ ุงู„ู…ูƒุชุดูุฉ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "if result['detected']:\n", + " top_technique = result['detected'][0]['technique_id']\n", + "\n", + " with urllib.request.urlopen(f'http://localhost:8000/queries/{top_technique}') as r:\n", + " queries = json.load(r)\n", + "\n", + " print(f'=== WQL Templates for {top_technique} ===')\n", + " for i, q in enumerate(queries, 1):\n", + " print(f\"\\n [{i}] {q.get('purpose', 'N/A')}\")\n", + " print(f\" Query: {q['wql_query'][:120]}...\")\n", + " print(f\" Note: {q.get('note', 'N/A')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ุงู„ุฎู„ูŠุฉ 11: ุชุตุฏูŠุฑ ุงู„ู†ุชุงุฆุฌ (ุงุฎุชูŠุงุฑูŠ)\n", + "\n", + "ู„ุญูุธ ุงู„ู†ุชุงุฆุฌ ุจุตูŠุบุฉ JSON ู„ุงุณุชุฎุฏุงู…ู‡ุง ู„ุงุญู‚ุงู‹ ุนู„ู‰ ุงู„ุฌู‡ุงุฒ ุงู„ู…ุญู„ูŠ\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# โ”€โ”€โ”€ ุชุญู„ูŠู„ ู‚ุงุฆู…ุฉ ู…ู† ุงู„ู‚ูˆุงุนุฏ ูˆุชุตุฏูŠุฑู‡ุง 
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\n", + "import urllib.request, json, os\n", + "\n", + "# โœ๏ธ ุฃุถูŠููŠ Rule IDs ุงู„ุชูŠ ุชุฑูŠุฏูŠู† ุชุญู„ูŠู„ู‡ุง\n", + "# ูŠู…ูƒู†ูƒ ู‚ุฑุงุกุชู‡ุง ู…ู† ู…ู„ู\n", + "test_ids_path = f'{NEEDED_PATH}/test_rule_ids.json'\n", + "\n", + "if os.path.isfile(test_ids_path):\n", + " with open(test_ids_path) as f:\n", + " rule_ids = json.load(f)\n", + " print(f'Loaded {len(rule_ids)} rule IDs from test_rule_ids.json')\n", + "else:\n", + " # ู‚ูˆุงุนุฏ ุชุฌุฑูŠุจูŠุฉ\n", + " rule_ids = ['18205']\n", + " print('Using default test rule')\n", + "\n", + "print(f'Processing {len(rule_ids)} rules...')\n", + "\n", + "export_results = []\n", + "\n", + "for rule_id in rule_ids:\n", + " try:\n", + " with urllib.request.urlopen(f'http://localhost:8000/results/{rule_id}') as r:\n", + " data = json.load(r)\n", + " data['source'] = 'colab_full_mode'\n", + " export_results.append(data)\n", + " detected = len(data.get('detected', []))\n", + " top = data['mappings'][0] if data['mappings'] else {}\n", + " print(f\" โœ… {rule_id}: {top.get('technique_id','?')} ({top.get('confidence_percent','?')}%) โ€” {detected} detected\")\n", + " except Exception as e:\n", + " print(f\" โš ๏ธ {rule_id}: {e}\")\n", + "\n", + "# ุญูุธ ุงู„ู†ุชุงุฆุฌ\n", + "export_path = f'{NEEDED_PATH}/murshid_full_results.json'\n", + "with open(export_path, 'w', encoding='utf-8') as f:\n", + " json.dump(export_results, f, ensure_ascii=False, indent=2)\n", + "\n", + "print(f'\\nโœ… Exported {len(export_results)} results to:')\n", + "print(f' {export_path}')\n", + "print('\\nูŠู…ูƒู†ูƒ ุงู„ุขู† ุงุณุชูŠุฑุงุฏ ู‡ุฐุง ุงู„ู…ู„ู ููŠ ุงู„ุจุงูƒู†ุฏ ุงู„ู…ุญู„ูŠ')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ุงู„ุฎู„ูŠุฉ 12: ุฅูŠู‚ุงู ุงู„ุฎุงุฏู… (ุนู†ุฏ ุงู„ุงู†ุชู‡ุงุก)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ุฅูŠู‚ุงู 
ุงู„ุฎุงุฏู… ูˆุฅุบู„ุงู‚ ngrok\n", + "try:\n", + " from pyngrok import ngrok\n", + " ngrok.kill()\n", + " print('โœ… ngrok tunnel closed')\n", + "except Exception:\n", + " pass\n", + "\n", + "try:\n", + " server_proc.terminate()\n", + " print('โœ… Server stopped')\n", + "except Exception:\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## ู…ู„ุงุญุธุงุช ู…ู‡ู…ุฉ\n", + "\n", + "### ุฅุฐุง ุงู†ู‚ุทุน ุงู„ุงุชุตุงู„ ุจู€ Colab\n", + "- ุงู„ุฎุงุฏู… ูŠุชูˆู‚ู ุชู„ู‚ุงุฆูŠุงู‹\n", + "- ุฃุนูŠุฏูŠ ุชุดุบูŠู„ ุงู„ุฎู„ุงูŠุง ู…ู† ุงู„ุฎู„ูŠุฉ 8\n", + "- ุฑุงุจุท ngrok ุณูŠุชุบูŠู‘ุฑ โ€” ุนุฏู‘ู„ูŠ ุงู„ูุฑูˆู†ุช ุจุงู„ุฑุงุจุท ุงู„ุฌุฏูŠุฏ\n", + "\n", + "### ุฅุฐุง ุธู‡ุฑ ุฎุทุฃ ููŠ LLaMA\n", + "- ุชุฃูƒุฏูŠ ุฃู† ู„ุฏูŠูƒ ุตู„ุงุญูŠุฉ ุงู„ูˆุตูˆู„ ู„ู„ู†ู…ูˆุฐุฌ: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct\n", + "- ุชุฃูƒุฏูŠ ู…ู† ุตุญุฉ HF_TOKEN\n", + "\n", + "### ุงู„ู…ู‚ุงุฑู†ุฉ ู…ุน ุงู„ุฌู‡ุงุฒ ุงู„ู…ุญู„ูŠ\n", + "| | Colab (FULL) | ุงู„ุฌู‡ุงุฒ ุงู„ู…ุญู„ูŠ (LOCAL) |\n", + "|--|-------------|----------------------|\n", + "| LLaMA | โœ… | โŒ |\n", + "| T1484 confidence | **94.76%** | 89.29% |\n", + "| ุงู„ู‚ุฑุงุฑ ุงู„ู†ู‡ุงุฆูŠ | T1484 โœ… | T1484 โœ… |\n", + "\n", + "### ู„ู„ุนุฑุถ ุงู„ุชู‚ุฏูŠู…ูŠ\n", + "1. ุดุบู‘ู„ูŠ ุงู„ุฎู„ุงูŠุง 1-8 ู…ุณุจู‚ุงู‹ (ู‚ุจู„ ุงู„ุนุฑุถ ุจู€ 15 ุฏู‚ูŠู‚ุฉ)\n", + "2. ุงู†ุณุฎูŠ ุฑุงุจุท ngrok\n", + "3. ุนุฏู‘ู„ูŠ ุงู„ูุฑูˆู†ุช\n", + "4. 
ุงูุชุญูŠ `https://xxxx.ngrok-free.app/index.html`\n" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "machine_shape": "hm", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/MurshidBackend_Colab_Report.md b/MurshidBackend_Colab_Report.md new file mode 100644 index 0000000000000000000000000000000000000000..8365bffddb4ec81a48ef79b3a9a415ede46f3de7 --- /dev/null +++ b/MurshidBackend_Colab_Report.md @@ -0,0 +1,545 @@ +# ุชู‚ุฑูŠุฑ ุชู‚ู†ูŠ: ุขู„ูŠุฉ ุนู…ู„ MurshidBackend_Colab.ipynb + +## ู…ุดุฑูˆุน ู…ูุฑุดูุฏ | From Alerts to Guidance +### MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts + +--- + +## 1. ู†ุธุฑุฉ ุนุงู…ุฉ + +`MurshidBackend_Colab.ipynb` ู‡ูˆ ุฏูุชุฑ Jupyter ู…ูุตู…ูŽู‘ู… ู„ุชุดุบูŠู„ ุงู„ุจุงูƒู†ุฏ ุงู„ูƒุงู…ู„ ู„ู…ุดุฑูˆุน ู…ูุฑุดูุฏ ุนู„ู‰ ุจูŠุฆุฉ **Google Colab** ุจุงุณุชุฎุฏุงู… **GPU (Tesla T4)**ุŒ ู…ู…ุง ูŠูุชูŠุญ ุชุดุบูŠู„ ู†ู…ูˆุฐุฌ **LLaMA 3 8B** ุจุชูƒู…ูŠู… 4-bit ู„ุชูˆู„ูŠุฏ ู…ู„ุฎุตุงุช ุฏู„ุงู„ูŠุฉ ุบู†ูŠุฉ ู„ู‚ูˆุงุนุฏ Wazuh XMLุŒ ูˆุฐู„ูƒ ุนู„ู‰ ุนูƒุณ ุงู„ุจูŠุฆุฉ ุงู„ู…ุญู„ูŠุฉ ุงู„ุชูŠ ุชุนู…ู„ ุจุฏูˆู† LLaMA (LOCAL mode). + +### ุงู„ู‡ุฏู ุงู„ุฑุฆูŠุณูŠ +ุชุดุบูŠู„ **FULL mode** ู„ู„ู€ pipeline: +``` +ู‚ุงุนุฏุฉ Wazuh XML + โ†“ + LLaMA 3 8B โ†โ”€โ”€ ู…ู„ุฎุต ุฏู„ุงู„ูŠ ุบู†ูŠ (GPU) + โ†“ + SecureBERT+ โ†โ”€โ”€ 768-dim embedding + โ†“ + Logistic Regression โ†โ”€โ”€ confidence scores ู„ูƒู„ ุชู‚ู†ูŠุฉ + โ†“ + FastAPI + SQLite โ†โ”€โ”€ ุชุฎุฒูŠู† ูˆุฎุฏู…ุฉ ุงู„ู†ุชุงุฆุฌ + โ†“ + Cloudflare Tunnel โ†โ”€โ”€ ุฑุงุจุท ุนุงู… ู„ู„ูุฑูˆู†ุช +``` + +--- + +## 2. 
ุงู„ู…ุชุทู„ุจุงุช ู‚ุจู„ ุงู„ุชุดุบูŠู„ + +### 2.1 ุฅุนุฏุงุฏ Google Colab +| ุงู„ู…ุชุทู„ุจ | ุงู„ุชูุงุตูŠู„ | +|---------|----------| +| **GPU** | Tesla T4 โ€” ูŠููุนูŽู‘ู„ ู…ู†: `Runtime โ†’ Change runtime type โ†’ T4 GPU` | +| **ุงู„ุฐุงูƒุฑุฉ** | High RAM (machine_shape: "hm") | +| **ุงู„ุฅู†ุชุฑู†ุช** | ู…ูุนูŽู‘ู„ ู„ุชู†ุฒูŠู„ ุงู„ู†ู…ุงุฐุฌ ู…ู† Hugging Face | + +### 2.2 ุงู„ู…ู„ูุงุช ุงู„ู…ุทู„ูˆุจุฉ ุนู„ู‰ Google Drive +``` +MyDrive/ +โ”œโ”€โ”€ murshid_backend_for_drive.zip โ† ู…ู„ูุงุช ุงู„ุจุงูƒู†ุฏ ู…ุถุบูˆุทุฉ (44 KB) +โ”‚ ุฃูˆ +โ”œโ”€โ”€ murshid_backend/ โ† ุงู„ู…ุฌู„ุฏ ู…ุณุชุฎุฑุฌ ู…ุณุจู‚ุงู‹ +โ”‚ โ”œโ”€โ”€ app/ +โ”‚ โ”‚ โ”œโ”€โ”€ main.py +โ”‚ โ”‚ โ”œโ”€โ”€ config.py +โ”‚ โ”‚ โ”œโ”€โ”€ api/routes/ +โ”‚ โ”‚ โ”œโ”€โ”€ ml/ +โ”‚ โ”‚ โ”œโ”€โ”€ models/ +โ”‚ โ”‚ โ”œโ”€โ”€ services/ +โ”‚ โ”‚ โ””โ”€โ”€ repositories/ +โ”‚ โ”œโ”€โ”€ alembic/ +โ”‚ โ”œโ”€โ”€ scripts/ +โ”‚ โ”œโ”€โ”€ alembic.ini +โ”‚ โ””โ”€โ”€ requirements.txt +โ”‚ +โ””โ”€โ”€ Needed/ + โ”œโ”€โ”€ murshid_logreg_pipeline_manual_oof_pcatuned.joblib โ† ู†ู…ูˆุฐุฌ LogReg + โ”œโ”€โ”€ murshid_logreg_thresholds_manual_oof_pcatuned.npy โ† ุนุชุจุงุช ุงู„ุชู†ุจุค + โ”œโ”€โ”€ murshid_label_columns.json โ† ุฃุณู…ุงุก ุงู„ุชู‚ู†ูŠุงุช ุงู„ู€ 20 + โ””โ”€โ”€ murshid_query_template_structure_clean_shared.xlsx โ† 60 ู‚ุงู„ุจ WQL +``` + +### 2.3 Hugging Face Token +ู…ุทู„ูˆุจ ู„ู„ูˆุตูˆู„ ุฅู„ู‰ ู†ู…ูˆุฐุฌ `meta-llama/Meta-Llama-3-8B-Instruct`: +- ูŠูุถุงู ููŠ `Colab Secrets` ุจุงุณู… `HF_TOKEN` +- ุฃูˆ ู…ุจุงุดุฑุฉู‹ ููŠ ุฎู„ูŠุฉ 5 ู…ู† ุงู„ุฏูุชุฑ + +--- + +## 3. ุดุฑุญ ุงู„ุฎู„ุงูŠุง ุจุงู„ุชูุตูŠู„ + +### ุงู„ุฎู„ูŠุฉ 1: ุงู„ุชุญู‚ู‚ ู…ู† GPU + +**ุงู„ู‡ุฏู:** ุงู„ุชุฃูƒุฏ ู…ู† ูˆุฌูˆุฏ GPU ู‚ุจู„ ุงู„ุจุฏุก. 
+ +```python +import torch +print('CUDA available:', torch.cuda.is_available()) +print('GPU:', torch.cuda.get_device_name(0)) +print('Memory:', round(torch.cuda.get_device_properties(0).total_memory / 1e9, 1), 'GB') +``` + +**ุงู„ู…ุฎุฑุฌ ุงู„ู…ุชูˆู‚ุน:** +``` +CUDA available: True +GPU: Tesla T4 +Memory: 15.8 GB +``` + +**ู…ุงุฐุง ูŠุญุฏุซ ุฅุฐุง ู„ู… ูŠูƒู† ู‡ู†ุงูƒ GPUุŸ** +- LLaMA ู„ู† ูŠูุญู…ูŽู‘ู„ (ูŠุญุชุงุฌ CUDA) +- ุงู„ุฎุงุฏู… ุณูŠุนู…ู„ ุจู€ LOCAL mode ูู‚ุท (ุจุฏูˆู† ุชู„ุฎูŠุต) + +--- + +### ุงู„ุฎู„ูŠุฉ 2: ุชุญู…ูŠู„ Google Drive ูˆุงู„ุชุญู‚ู‚ ู…ู† ุงู„ู…ู„ูุงุช + +**ุงู„ู‡ุฏู:** ุฑุจุท Colab ุจู€ Google Drive ูˆุงู„ุชุญู‚ู‚ ู…ู† ูˆุฌูˆุฏ ุฌู…ูŠุน ุงู„ู…ู„ูุงุช ุงู„ู…ุทู„ูˆุจุฉ. + +```python +from google.colab import drive +drive.mount('/content/drive') + +NEEDED_PATH = '/content/drive/MyDrive/Needed' +BACKEND_PATH = '/content/drive/MyDrive/murshid_backend' +ZIP_PATH = '/content/drive/MyDrive/murshid_backend_for_drive.zip' +``` + +**ู…ุง ูŠุชุญู‚ู‚ ู…ู†ู‡:** +| ุงู„ู…ู„ู | ุงู„ู†ูˆุน | ุงู„ุญุงู„ุฉ | +|-------|-------|--------| +| `murshid_logreg_pipeline_manual_oof_pcatuned.joblib` | ุฅู„ุฒุงู…ูŠ | โœ… / โŒ | +| `murshid_logreg_thresholds_manual_oof_pcatuned.npy` | ุฅู„ุฒุงู…ูŠ | โœ… / โŒ | +| `murshid_label_columns.json` | ุฅู„ุฒุงู…ูŠ | โœ… / โŒ | +| `murshid_query_template_structure_clean_shared.xlsx` | ุงุฎุชูŠุงุฑูŠ | โœ… / โš ๏ธ | +| `murshid_backend/` ุฃูˆ `.zip` | ุฅู„ุฒุงู…ูŠ | โœ… / โŒ | + +--- + +### ุงู„ุฎู„ูŠุฉ 3: ุชุฌู‡ูŠุฒ ุงู„ุจุงูƒู†ุฏ ููŠ /content + +**ุงู„ู‡ุฏู:** ู†ู‚ู„ ู…ู„ูุงุช ุงู„ุจุงูƒู†ุฏ ู…ู† Drive ุฅู„ู‰ `/content` ู„ุชุณุฑูŠุน ุงู„ู‚ุฑุงุกุฉ (Drive ุฃุจุทุฃ ููŠ I/O). 
+ +**ุงู„ู…ู†ุทู‚ ุงู„ุฐูƒูŠ:** +``` +ู‡ู„ murshid_backend/ ู…ูˆุฌูˆุฏ ุนู„ู‰ DriveุŸ + โ†“ ู†ุนู… โ†’ ุงู†ุณุฎ ู…ุจุงุดุฑุฉู‹ ุฅู„ู‰ /content + โ†“ ู„ุง +ู‡ู„ murshid_backend_for_drive.zip ู…ูˆุฌูˆุฏุŸ + โ†“ ู†ุนู… โ†’ ุงุณุชุฎุฑุฌู‡ ุฅู„ู‰ Drive ุฃูˆู„ุงู‹ ุซู… ุงู†ุณุฎ + โ†“ ู„ุง +โ†’ โŒ ุฎุทุฃ: "ุงุฑูุนูŠ ZIP ุฅู„ู‰ Google Drive" +``` + +**ุงู„ุฎุทูˆุงุช ุงู„ู…ู†ููŽู‘ุฐุฉ:** +1. **ุงุณุชุฎุฑุงุฌ ZIP** (ุฅุฐุง ู„ุฒู…) ุฅู„ู‰ `MyDrive/` +2. **ู†ุณุฎ** `murshid_backend/` ุฅู„ู‰ `/content/murshid_backend/` (ุจุฏูˆู† pycache ูˆู…ู„ูุงุช ู…ุคู‚ุชุฉ) +3. **ุฅุถุงูุฉ** `/content/murshid_backend` ุฅู„ู‰ `sys.path` +4. **ุชุบูŠูŠุฑ** working directory ุฅู„ู‰ `/content/murshid_backend` + +**ู„ู…ุงุฐุง ุงู„ู†ุณุฎ ุฅู„ู‰ /contentุŸ** +- Drive ูŠุนุชู…ุฏ ุนู„ู‰ FUSE mount = ุจุทูŠุก ู„ู„ู‚ุฑุงุกุฉ ุงู„ู…ุชูƒุฑุฑุฉ +- `/content` ุนู„ู‰ SSD ู…ุญู„ูŠ ู„ู„ู€ VM = ุฃุณุฑุน ุจู€ 5-10x + +--- + +### ุงู„ุฎู„ูŠุฉ 4: ุชุซุจูŠุช ุงู„ู…ุชุทู„ุจุงุช + +**ุงู„ู‡ุฏู:** ุชุซุจูŠุช ุฌู…ูŠุน ุงู„ู…ูƒุชุจุงุช ุงู„ู„ุงุฒู…ุฉ ู„ุชุดุบูŠู„ ุงู„ุจุงูƒู†ุฏ. 
+ +**ุงู„ู…ูƒุชุจุงุช ุงู„ู…ุซุจูŽู‘ุชุฉ:** + +| ุงู„ู…ูƒุชุจุฉ | ุงู„ุฅุตุฏุงุฑ | ุงู„ุบุฑุถ | +|---------|---------|--------| +| `fastapi` | 0.115.0 | ุฅุทุงุฑ API | +| `uvicorn` | 0.32.0 | ุฎุงุฏู… ASGI | +| `pydantic` | 2.9.0 | ุชุญู‚ู‚ ู…ู† ุงู„ุจูŠุงู†ุงุช | +| `sqlalchemy` | 2.0.0 | ORM | +| `alembic` | 1.13.0 | ู‡ุฌุฑุฉ DB | +| `scikit-learn` | **1.6.1** | ู†ู…ูˆุฐุฌ LogReg (ูŠุทุงุจู‚ ุจูŠุฆุฉ ุงู„ุชุฏุฑูŠุจ) | +| `bitsandbytes` | โ‰ฅ0.46.1 | ุชูƒู…ูŠู… LLaMA 4-bit | +| `accelerate` | ุขุฎุฑ ู†ุณุฎุฉ | `device_map="auto"` ู„ู„ู€ GPU | +| `openpyxl` | ุขุฎุฑ ู†ุณุฎุฉ | ู‚ุฑุงุกุฉ ู…ู„ู Excel | +| `lxml` | ุขุฎุฑ ู†ุณุฎุฉ | ู…ุนุงู„ุฌุฉ XML | +| `pyngrok` | ุขุฎุฑ ู†ุณุฎุฉ | (ุงุญุชูŠุงุทูŠ โ€” ุบูŠุฑ ู…ุณุชุฎุฏู…) | + +> **ู…ู„ุงุญุธุฉ ู…ู‡ู…ุฉ:** `scikit-learn==1.6.1` ู…ุญุฏูŽู‘ุฏ ุจุฏู‚ุฉ ู„ุฃู† ู…ู„ูุงุช joblib ุฏูุฑูู‘ุจุช ุจู‡ุฐู‡ ุงู„ู†ุณุฎุฉ โ€” ุงุณุชุฎุฏุงู… ู†ุณุฎุฉ ู…ุฎุชู„ูุฉ ูŠูู†ุชุฌ ุชุญุฐูŠุฑุงุช `InconsistentVersionWarning`. + +--- + +### ุงู„ุฎู„ูŠุฉ 5: ุฅุนุฏุงุฏ ู…ู„ู .env + +**ุงู„ู‡ุฏู:** ุฅู†ุดุงุก ู…ู„ู ุงู„ุฅุนุฏุงุฏุงุช ู„ุชุดุบูŠู„ FULL mode. 
+ +**ู…ุญุชูˆู‰ ุงู„ู…ู„ู ุงู„ู…ููˆู„ูŽู‘ุฏ:** +```env +MURSHID_DB_URL=sqlite:////content/murshid.db +MURSHID_MODELS_DIR=/content/drive/MyDrive/Needed +HF_TOKEN=**** +MURSHID_SKIP_LLM=false โ† ู…ูุชุงุญ FULL mode +SECRET_KEY=murshid_colab_2026 +LLAMA_MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct +EMBED_MODEL_ID=ehsanaghaei/SecureBERT_Plus +LOGREG_JOBLIB=murshid_logreg_pipeline_manual_oof_pcatuned.joblib +LOGREG_THRESHOLDS_NPY=murshid_logreg_thresholds_manual_oof_pcatuned.npy +LABEL_COLUMNS_JSON=murshid_label_columns.json +``` + +**ุงู„ูุฑู‚ ุจูŠู† FULL ูˆ LOCAL mode:** +| ุงู„ู…ุชุบูŠุฑ | FULL mode | LOCAL mode | +|---------|-----------|------------| +| `MURSHID_SKIP_LLM` | `false` | `true` | +| LLaMA ูŠูุญู…ูŽู‘ู„ุŸ | โœ… ู†ุนู… | โŒ ู„ุง | +| ุฌูˆุฏุฉ ุงู„ุชู„ุฎูŠุต | ุนุงู„ูŠุฉ | ุงู„ูˆุตู ุงู„ุฎุงู… ูู‚ุท | +| T1484 confidence (ู…ุซุงู„) | **94.76%** | 89.29% | + +--- + +### ุงู„ุฎู„ูŠุฉ 6: ุชู‡ุฌูŠุฑ ู‚ุงุนุฏุฉ ุงู„ุจูŠุงู†ุงุช (Alembic) + +**ุงู„ู‡ุฏู:** ุฅู†ุดุงุก ุฌุฏุงูˆู„ ู‚ุงุนุฏุฉ ุงู„ุจูŠุงู†ุงุช SQLite. + +```bash +python -m alembic upgrade head +``` + +**ุงู„ุฌุฏุงูˆู„ ุงู„ู…ูู†ุดุฃุฉ (ู…ู† migration 0001):** + +| ุงู„ุฌุฏูˆู„ | ุงู„ุบุฑุถ | ู…ุตุฏุฑู‡ ููŠ ุงู„ุชู‚ุฑูŠุฑ | +|--------|--------|-----------------| +| `users` | ู…ุณุชุฎุฏู…ูˆ ุงู„ู†ุธุงู… (admin/analyst) | ER Diagram ยง3.2.6 | +| `mapping_jobs` | ูˆุธุงุฆู ู…ุนุงู„ุฌุฉ ู…ู„ูุงุช ุงู„ู‚ูˆุงุนุฏ | ER Diagram ยง3.2.6 | +| `rules` | ู‚ูˆุงุนุฏ Wazuh ุงู„ู…ูุญู„ูŽู‘ู„ุฉ | ER Diagram ยง3.2.6 | +| `techniques` | ุชู‚ู†ูŠุงุช MITRE ATT&CK | ER Diagram ยง3.2.6 | +| `rule_technique_mappings` | ุฑุจุท ุงู„ู‚ูˆุงุนุฏ ุจุงู„ุชู‚ู†ูŠุงุช + confidence | ER Diagram ยง3.2.6 | +| `query_templates` | ู‚ูˆุงู„ุจ WQL ู„ู„ุชุญู‚ูŠู‚ | ER Diagram ยง3.2.6 | + +> **ู…ู„ุงุญุธุฉ:** ู‚ุงุนุฏุฉ ุงู„ุจูŠุงู†ุงุช ููŠ `/content/murshid.db` โ€” ุชูู†ุดุฃ ู…ู† ุฌุฏูŠุฏ ููŠ ูƒู„ ุฌู„ุณุฉ Colab. 
+ +--- + +### ุงู„ุฎู„ูŠุฉ 7: ุงุณุชูŠุฑุงุฏ ู‚ูˆุงู„ุจ WQL ู…ู† Excel + +**ุงู„ู‡ุฏู:** ุชุญู…ูŠู„ 60 ู‚ุงู„ุจ WQL ู…ู† ู…ู„ู Excel ุฅู„ู‰ ู‚ุงุนุฏุฉ ุงู„ุจูŠุงู†ุงุช. + +**ุงู„ุจูŠุงู†ุงุช ุงู„ู…ุณุชูˆุฑุฏุฉ:** + +| ุงู„ุฅุญุตุงุฆูŠุฉ | ุงู„ู‚ูŠู…ุฉ | +|-----------|--------| +| ุฅุฌู…ุงู„ูŠ ุงู„ุชู‚ู†ูŠุงุช | 20 ุชู‚ู†ูŠุฉ | +| ุฅุฌู…ุงู„ูŠ ุงู„ู‚ูˆุงู„ุจ | 60 ู‚ุงู„ุจ (3 ู„ูƒู„ ุชู‚ู†ูŠุฉ) | +| ุงู„ุชู‚ู†ูŠุงุช ุงู„ู…ุดู…ูˆู„ุฉ | T1047, T1055, T1059.001, T1070.004, T1078, T1083, T1095, T1098, T1105, T1110, T1112, T1114, T1176, T1190, T1484, T1498, T1499, T1529, T1531, T1562.001 | + +**ู…ุซุงู„ ุนู„ู‰ ู‚ุงู„ุจ WQL (T1484):** +``` +Template 1: Host pivot + agent.name:${HOST} AND win.system.eventID:(4728 OR 4729 ...) AND @timestamp:[now-24h TO now] + +Template 2: Actor pivot + win.eventdata.SubjectUserName:${USER} AND win.system.eventID:(...) AND @timestamp:[now-24h TO now] + +Template 3: High-impact target change + win.system.eventID:(...) AND win.eventdata.TargetUserName:("Domain Admins" OR ...) AND @timestamp:[now-24h TO now] +``` + +**ู…ู†ุน ุงู„ุชูƒุฑุงุฑ:** +- ูŠุชุญู‚ู‚ ู…ู† ูˆุฌูˆุฏ (`technique_id` + `purpose`) ู‚ุจู„ ุงู„ุฅุถุงูุฉ +- `replace=False` ุจุดูƒู„ ุงูุชุฑุงุถูŠ (ู„ุง ูŠูุนูŠุฏ ุงู„ูƒุชุงุจุฉ) + +--- + +### ุงู„ุฎู„ูŠุฉ 8: ุชุดุบูŠู„ FastAPI + Cloudflare Tunnel + +**ุงู„ู‡ุฏู:** ุงู„ุฎู„ูŠุฉ ุงู„ุฑุฆูŠุณูŠุฉ โ€” ุชูุดุบู‘ู„ ุงู„ุจุงูƒู†ุฏ ูˆุชูู†ุดุฆ ุฑุงุจุทุงู‹ ุนุงู…ุงู‹. + +#### 8.1 ุงู„ุชุญู‚ู‚ ู…ู† bitsandbytes +```python +import bitsandbytes as bnb +print(f'โœ… bitsandbytes {bnb.__version__}') +``` +> ุฅุฐุง ูุดู„: ูŠููˆู‚ู ุงู„ุชุดุบูŠู„ ููˆุฑุงู‹ ู…ุน ุฑุณุงู„ุฉ ูˆุงุถุญุฉ. 
+ +#### 8.2 ุชุดุบูŠู„ uvicorn +```bash +python -m uvicorn app.main:app --host 0.0.0.0 --port 8000 --log-level info +``` +- `--host 0.0.0.0`: ูŠุณุชู…ุน ุนู„ู‰ ูƒู„ ุงู„ูˆุงุฌู‡ุงุช (ู…ุทู„ูˆุจ ู„ู„ู€ tunnel) +- ุงู„ู„ูˆุฌ ูŠูุญูุธ ููŠ `/content/murshid_server.log` + +#### 8.3 ุชุญู…ูŠู„ ุงู„ู†ู…ุงุฐุฌ (lifespan) +ุนู†ุฏ ุจุฏุก ุงู„ุฎุงุฏู… ุชูู†ููŽู‘ุฐ `load_models()` ุจู‡ุฐุง ุงู„ุชุฑุชูŠุจ: + +``` +1. hf_login(token) โ† 1-2 ุซุงู†ูŠุฉ +2. LLaMA 3 8B-Instruct (4-bit NF4) โ† 5-8 ุฏู‚ุงุฆู‚ (4.5 GB) + - BitsAndBytesConfig: load_in_4bit=True + - bnb_4bit_quant_type="nf4" + - bnb_4bit_compute_dtype=float16 +3. SecureBERT+ (ehsanaghaei) โ† 1-2 ุฏู‚ูŠู‚ุฉ + - AutoModel + AutoTokenizer + - mean pooling 768-dim +4. LogisticRegressionModel โ† < 1 ุซุงู†ูŠุฉ + - joblib.load (Pipeline: PCA + OneVsRestClassifier) + - np.load thresholds +``` + +#### 8.4 ุงู„ุงู†ุชุธุงุฑ ุงู„ุฐูƒูŠ +```python +for i in range(180): # 15 ุฏู‚ูŠู‚ุฉ ูƒุญุฏ ุฃู‚ุตู‰ + time.sleep(5) + # ูุญุต /health ูƒู„ 5 ุซูˆุงู†ู + # ุนุฑุถ ุงู„ู„ูˆุฌ ูƒู„ 30 ุซุงู†ูŠุฉ + # ูƒุดู ู…ุจูƒุฑ ู„ู„ุฃุฎุทุงุก (ERROR, ImportError) +``` + +#### 8.5 Cloudflare Tunnel +```bash +wget cloudflared-linux-amd64 โ†’ /usr/local/bin/cloudflared +cloudflared tunnel --url http://localhost:8000 +``` +- ู„ุง ูŠุญุชุงุฌ ุญุณุงุจุงู‹ ุฃูˆ ุชูˆูƒู†ุงู‹ +- ูŠูู†ุชุฌ ุฑุงุจุทุงู‹ ู…ุซู„: `https://xxxx.trycloudflare.com` +- ุตุงู„ุญ ุทูˆุงู„ ุฌู„ุณุฉ Colab + +--- + +### ุงู„ุฎู„ูŠุฉ 9: ุฑุจุท ุงู„ูุฑูˆู†ุช ุชู„ู‚ุงุฆูŠุงู‹ + +**ุงู„ู‡ุฏู:** ุชุญุฏูŠุซ `index.html` ุจุงู„ุฑุงุจุท ุงู„ุฌุฏูŠุฏ ู…ู† Cloudflare ุชู„ู‚ุงุฆูŠุงู‹. 
+ +```python +# ุงุณุชุฎุฑุงุฌ ุงู„ุฑุงุจุท +match = re.search(r'https://[a-z0-9\-]+\.trycloudflare\.com', content) +public_url = match.group(0) + +# ุชุญุฏูŠุซ index.html ุนู„ู‰ Drive +html = re.sub( + r"const BASE = '[^']*';", + f"const BASE = '{public_url}';", + html +) +``` + +**ุงู„ู†ุชูŠุฌุฉ:** +```javascript +// ู‚ุจู„ +const BASE = 'http://127.0.0.1:8000'; + +// ุจุนุฏ +const BASE = 'https://xxxx.trycloudflare.com'; +``` + +--- + + + +### ุงู„ุฎู„ูŠุฉ 10: ุงุฎุชุจุงุฑ ุงู„ู€ API + +**ุงู„ู‡ุฏู:** ุงู„ุชุญู‚ู‚ ู…ู† ุนู…ู„ ูƒู„ ู…ูƒูˆู†. + +#### 10.1 Health Check +```python +urllib.request.urlopen('http://localhost:8000/health') +``` + +**ุงู„ู…ุฎุฑุฌ ุงู„ู…ุชูˆู‚ุน (FULL mode):** +```json +{ + "pipeline_mode": "full", + "pipeline_description": "LLaMA + SecureBERT+ + LogReg", + "components": { + "llama_loaded": true, + "embedder_loaded": true, + "logreg_loaded": true, + "cuda_available": true + }, + "all_model_files_present": true +} +``` + +#### 10.2 ุชุญู„ูŠู„ ู‚ุงุนุฏุฉ ุงุฎุชุจุงุฑ +```python +rule_xml = '...' +POST http://localhost:8000/rules/analyze +``` + +**ุงู„ู€ pipeline ุฎุทูˆุฉ ุจุฎุทูˆุฉ:** + +``` +XML Input (rule 18205) + โ†“ +sanitize_rule_from_string() + - ุญุฐู: mitre, if_sid, group, if_group + โ†“ +summarize_one_rule() [LLaMA] + - Input: sanitized XML + - Output: "Detects the deletion of a security-enabled global group on a Windows system." + โ†“ +build_text_for_embedding() + - text = summary + ". " + description + - "Detects the deletion of a security-enabled global group on a Windows system. Windows: Security Enabled Global Group Deleted." 
 โ†“ +SecureBERTEmbedder.embed_text() + - Chunks (256 tokens max) + - mean pooling per chunk + - average chunks โ†’ 768-dim vector + - L2 normalize + โ†“ +LogisticRegressionModel.predict() + - predict_proba(X_user) + - pred = (proba >= logreg_thr) + - conf = proba * 100 + - gap = proba - logreg_thr + โ†“ +save_technique_mappings() [DB] + - ุญูุธ 20 ุชู‚ู†ูŠุฉ ู…ุน confidence + โ†“ +JSON Response +``` + +**ุงู„ู…ุฎุฑุฌ ู„ู„ู‚ุงุนุฏุฉ 18205:** +``` +Technique Pred Conf% Proba Thr Gap +T1484 โœ… 94.76 0.9476 0.74 +0.2076 โ† Primary +T1531 โŒ 27.92 0.2792 ... ... +T1070.004 โŒ 21.03 0.2103 ... ... +T1098 โŒ 10.65 0.1065 ... ... +T1112 โŒ 9.27 0.0927 ... ... +``` + +--- +ุงู„ุฎุทูˆุงุช ุงู„ู‚ุงุฏู…ุฉ ู„ู„ูˆุถุน ุงู„ู…ุญู„ูŠ (LOCAL Mode) ุบูŠุฑ ุถุฑูˆุฑูŠุฉ + +### ุงู„ุฎู„ูŠุฉ 11: ุชุตุฏูŠุฑ ุงู„ู†ุชุงุฆุฌ (ุงุฎุชูŠุงุฑูŠ) + +**ุงู„ู‡ุฏู:** ุชุตุฏูŠุฑ ู†ุชุงุฆุฌ ุงู„ู‚ูˆุงุนุฏ ุงู„ู…ูุญู„ูŽู‘ู„ุฉ ุฅู„ู‰ JSON ู„ุงุณุชุฎุฏุงู…ู‡ุง ู„ุงุญู‚ุงู‹ ุนู„ู‰ ุงู„ุฌู‡ุงุฒ ุงู„ู…ุญู„ูŠ. + +```python +export_path = f'{NEEDED_PATH}/murshid_full_results.json' +json.dump(export_results, f, ensure_ascii=False, indent=2) +``` + +**ุงู„ุงุณุชุฎุฏุงู…:** ูŠูู…ูƒูู‘ู† ุงุณุชูŠุฑุงุฏ ู†ุชุงุฆุฌ FULL mode ููŠ ุงู„ุจุงูƒู†ุฏ ุงู„ู…ุญู„ูŠ ุจุฏูˆู† GPU. + +--- + +### ุงู„ุฎู„ูŠุฉ 12: ุฅูŠู‚ุงู ุงู„ุฎุงุฏู… + +```python +cf_proc.terminate() # ุฅุบู„ุงู‚ Cloudflare tunnel +server_proc.terminate() # ุฅูŠู‚ุงู uvicorn +``` + +--- + +## 4. 
ู…ู‚ุงุฑู†ุฉ ุฃูˆุถุงุน ุงู„ุชุดุบูŠู„ + +| | FULL mode (Colab) | LOCAL mode (ุงู„ุฌู‡ุงุฒ) | LITE mode | +|--|-------------------|---------------------|-----------| +| **LLaMA** | โœ… | โŒ | โŒ | +| **SecureBERT+** | โœ… | โœ… | โŒ | +| **LogReg** | โœ… | โœ… | โœ… | +| **GPU** | Tesla T4 | ู„ุง ูŠู„ุฒู… | ู„ุง ูŠู„ุฒู… | +| **Embedding** | ู†ุต ู…ูุซุฑู‰ ุจู€ LLaMA | ูˆุตู ุงู„ู‚ุงุนุฏุฉ ูู‚ุท | ุนุดูˆุงุฆูŠ | +| **T1484 confidence** | **94.76%** | 89.29% | ุบูŠุฑ ู…ูˆุซูˆู‚ | +| **ุงู„ู‚ุฑุงุฑ ุงู„ู†ู‡ุงุฆูŠ** | T1484 โœ… | T1484 โœ… | ุบูŠุฑ ู…ูˆุซูˆู‚ | +| **ูˆู‚ุช ุงู„ุชุญู„ูŠู„/ู‚ุงุนุฏุฉ** | ~30-60 ุซุงู†ูŠุฉ | ~2-5 ุซูˆุงู†ู | < 1 ุซุงู†ูŠุฉ | +| **ุงู„ุงุณุชุฎุฏุงู…** | ุฅู†ุชุงุฌ / ุนุฑุถ | ุชุทูˆูŠุฑ ู…ุญู„ูŠ | ุงุฎุชุจุงุฑ ูู‚ุท | + +--- + +## 5. ู…ุนู…ุงุฑูŠุฉ ุงู„ู†ุธุงู… ุงู„ูƒุงู…ู„ุฉ ุนู„ู‰ Colab + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ Google Colab VM โ”‚ +โ”‚ โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ /content/murshid_backend/ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ FastAPI (uvicorn :8000) โ”‚ โ”‚ +โ”‚ โ”‚ โ”œโ”€โ”€ /health โ”‚ โ”‚ +โ”‚ โ”‚ โ”œโ”€โ”€ POST /rules/analyze โ”‚ โ”‚ +โ”‚ โ”‚ โ”œโ”€โ”€ GET /results/{rule_id} โ”‚ โ”‚ +โ”‚ โ”‚ โ”œโ”€โ”€ GET /queries/{tech_id} โ”‚ โ”‚ +โ”‚ โ”‚ โ””โ”€โ”€ GET /api/db/... 
โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ”‚ โ”‚ โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ ML Models (GPU VRAM) โ”‚ โ”‚ +โ”‚ โ”‚ โ”œโ”€โ”€ LLaMA 3 8B (4-bit) โ”‚ โ”‚ +โ”‚ โ”‚ โ”œโ”€โ”€ SecureBERT+ โ”‚ โ”‚ +โ”‚ โ”‚ โ””โ”€โ”€ LogReg Pipeline โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ”‚ โ”‚ โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ /content/murshid.db โ”‚ โ”‚ +โ”‚ โ”‚ (SQLite โ€” 6 ุฌุฏุงูˆู„) โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ”‚ โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ cloudflared tunnel โ”‚ โ”‚ +โ”‚ โ”‚ localhost:8000 โ†’ HTTPS โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ + โ–ผ + https://xxxx.trycloudflare.com + โ”‚ + โ–ผ + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ ุงู„ู…ุชุตูุญ / ุงู„ูุฑูˆู†ุช โ”‚ + โ”‚ index.html (React) โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +``` + +--- + +## 6. 
ุงู„ุฃุฎุทุงุก ุงู„ุดุงุฆุนุฉ ูˆุญู„ูˆู„ู‡ุง + +| ุงู„ุฎุทุฃ | ุงู„ุณุจุจ | ุงู„ุญู„ | +|-------|-------|------| +| `ImportError: bitsandbytes>=0.46.1` | ู†ุณุฎุฉ ู‚ุฏูŠู…ุฉ | ุดุบู‘ู„ูŠ `!pip install -U bitsandbytes>=0.46.1` | +| `FileNotFoundError: murshid_backend` | ZIP ุบูŠุฑ ู…ุฑููˆุน | ุงุฑูุนูŠ `murshid_backend_for_drive.zip` ุฅู„ู‰ Drive | +| `ERR_NGROK_4018` | ngrok ูŠุญุชุงุฌ ุญุณุงุจุงู‹ | ุงุณุชุฎุฏู…ูŠ Cloudflare Tunnel (ุฎู„ูŠุฉ 9) | +| `Cannot connect to backend` | CORS ู…ุบู„ู‚ | `allow_origins=["*"]` ููŠ `main.py` | +| Server ูŠุณุชุบุฑู‚ > 15 ุฏู‚ูŠู‚ุฉ | ุชู†ุฒูŠู„ LLaMA ุจุทูŠุก | ููŠ ุงู„ุฌู„ุณุฉ ุงู„ุซุงู†ูŠุฉ ุงู„ุชู†ุฒูŠู„ ู…ู† Cache | +| `InconsistentVersionWarning` | sklearn ุฅุตุฏุงุฑ ู…ุฎุชู„ู | ุชุฃูƒุฏูŠ ู…ู† `scikit-learn==1.6.1` | + +--- + +## 7. ุงู„ู€ Endpoints ุงู„ู…ุชุงุญุฉ ุจุนุฏ ุงู„ุชุดุบูŠู„ + +| Method | Endpoint | ุงู„ูˆุตู | +|--------|----------|-------| +| `GET` | `/health` | ุญุงู„ุฉ ุงู„ุฎุงุฏู… ูˆุงู„ู†ู…ุงุฐุฌ | +| `GET` | `/api/stats` | ุฅุญุตุงุฆูŠุงุช Dashboard | +| `GET` | `/api/db/summary` | ุนุฏุฏ ุงู„ุตููˆู ููŠ ุงู„ุฌุฏุงูˆู„ | +| `GET` | `/api/db/rules` | ุฌู…ูŠุน ุงู„ู‚ูˆุงุนุฏ ููŠ DB | +| `GET` | `/api/db/mappings` | ุฌู…ูŠุน ุงู„ู…ุทุงุจู‚ุงุช | +| `GET` | `/api/db/techniques` | ุชู‚ู†ูŠุงุช MITRE ุงู„ู…ุฎุฒู‘ู†ุฉ | +| `GET` | `/api/db/templates` | ู‚ูˆุงู„ุจ WQL | +| `POST` | `/api/db/import-excel` | ุงุณุชูŠุฑุงุฏ Excel | +| `POST` | `/rules/analyze` | ุชุญู„ูŠู„ ู‚ุงุนุฏุฉ XML (FULL pipeline) | +| `GET` | `/results/{rule_id}` | ู†ุชุงุฆุฌ ุชู‚ู†ูŠุฉ ู‚ุงุนุฏุฉ ู…ุญุฏุฏุฉ | +| `GET` | `/queries/{technique_id}` | ุงุณุชุนู„ุงู…ุงุช WQL ู„ุชู‚ู†ูŠุฉ | +| `POST` | `/admin/templates` | ุฅุถุงูุฉ ู‚ุงู„ุจ WQL | +| `PATCH` | `/admin/templates/{id}` | ุชุนุฏูŠู„ ู‚ุงู„ุจ | +| `GET` | `/docs` | Swagger UI ุงู„ุชูุงุนู„ูŠ | + +--- + +## 8. ู…ู„ุงุญุธุงุช ู„ู„ุนุฑุถ ุงู„ุชู‚ุฏูŠู…ูŠ + +1. **ุดุบู‘ู„ูŠ ุงู„ุฎู„ุงูŠุง ู‚ุจู„ ุงู„ุนุฑุถ ุจู€ 15 ุฏู‚ูŠู‚ุฉ** (ูˆู‚ุช ุชุญู…ูŠู„ LLaMA) +2. 
**ุงู†ุณุฎูŠ ุฑุงุจุท Cloudflare** ูˆุชุญู‚ู‚ูŠ ู…ู†ู‡ ููŠ ุงู„ู…ุชุตูุญ +3. **ุงู„ูุฑูˆู†ุช ูŠูุญุฏูŽู‘ุซ ุชู„ู‚ุงุฆูŠุงู‹** ุจุงู„ุฑุงุจุท ุงู„ุฌุฏูŠุฏ ููŠ ุฎู„ูŠุฉ 9 +4. **ูƒู„ ุฌู„ุณุฉ Colab ุฌุฏูŠุฏุฉ = ุฑุงุจุท Cloudflare ุฌุฏูŠุฏ** โ€” ูƒุฑู‘ุฑูŠ ุงู„ุฎุทูˆุงุช +5. **DB ูุงุฑุบุฉ ููŠ ูƒู„ ุฌู„ุณุฉ** โ€” ุญู„ู‘ู„ูŠ ุงู„ู‚ูˆุงุนุฏ ุนุจุฑ Admin Panel ุฃูˆ ุฎู„ูŠุฉ ุงุฎุชุจุงุฑ + +--- + +*ุชุงุฑูŠุฎ ุงู„ุฅู†ุดุงุก: 8 ุฃุจุฑูŠู„ 2026 | ู…ุดุฑูˆุน ู…ูุฑุดูุฏ โ€” CCIS, PNU* diff --git a/Needed/murshid_label_columns.json b/Needed/murshid_label_columns.json new file mode 100644 index 0000000000000000000000000000000000000000..0a3d6e9d00e91408fa9b1d28f1d33a50992cd8a6 --- /dev/null +++ b/Needed/murshid_label_columns.json @@ -0,0 +1,22 @@ +[ + "T1047", + "T1055", + "T1059.001", + "T1070.004", + "T1078", + "T1083", + "T1095", + "T1098", + "T1105", + "T1110", + "T1112", + "T1114", + "T1176", + "T1190", + "T1484", + "T1498", + "T1499", + "T1529", + "T1531", + "T1562.001" +] \ No newline at end of file diff --git a/Needed/murshid_logreg_pipeline_manual_oof_pcatuned.joblib b/Needed/murshid_logreg_pipeline_manual_oof_pcatuned.joblib new file mode 100644 index 0000000000000000000000000000000000000000..5ece55c1a1bd7e50f60b14abf3819d8e9a0a96fd --- /dev/null +++ b/Needed/murshid_logreg_pipeline_manual_oof_pcatuned.joblib @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be629d9f6780456a9435f8be2655e3fa0a848fbe2a4f166813913331b4c43ba4 +size 206584 diff --git a/Needed/murshid_logreg_thresholds_manual_oof_pcatuned.npy b/Needed/murshid_logreg_thresholds_manual_oof_pcatuned.npy new file mode 100644 index 0000000000000000000000000000000000000000..9cf4604dd051177673475df7c2a8223394f9b99e --- /dev/null +++ b/Needed/murshid_logreg_thresholds_manual_oof_pcatuned.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:005a664d7faa22104e4a9e58ace6976628d1d00c1cabcaead1833ff792366c79 +size 208 diff --git 
a/Needed/murshid_query_template_structure_clean_shared.xlsx b/Needed/murshid_query_template_structure_clean_shared.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..92b947fcb2e29b202201cabad674ad44111b9667 --- /dev/null +++ b/Needed/murshid_query_template_structure_clean_shared.xlsx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1491c4dee86bbf29691b3c4254a344e2cb87eabbb77f04f49da09856cb1d145 +size 20938 diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bcd145c353c7da7aefdb630be9ca4fc74b6e25e2 --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +--- +title: Murshid - ู…ูุฑุดูุฏ +emoji: ๐Ÿ›ก๏ธ +colorFrom: blue +colorTo: indigo +sdk: docker +pinned: false +license: mit +--- + +# ๐Ÿ›ก๏ธ Murshid | ู…ูุฑุดูุฏ + +**From Alerts to Guidance: MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts** + +REST API + Dashboard for analyzing Wazuh IDS rules and mapping them to MITRE ATT&CK techniques. 
+ +## Features + +- **Rule Analysis**: Parse Wazuh XML rules and classify MITRE ATT&CK techniques +- **WQL Queries**: Get pre-built Wazuh Query Language templates per technique +- **Dashboard**: Interactive web UI with statistics and DB viewer +- **ML Pipeline**: Logistic Regression with SecureBERT+ embeddings + +## Tech Stack + +- **FastAPI** โ€” REST API +- **SQLite** โ€” Database +- **Logistic Regression** โ€” Primary classification model +- **SecureBERT+** โ€” Text embeddings (optional, requires torch) + +## API Endpoints + +| Method | URL | Description | +|--------|-----|-------------| +| `GET` | `/health` | System health check | +| `POST` | `/rules/analyze` | Analyze a Wazuh XML rule | +| `GET` | `/results/{rule_id}` | Get stored results for a rule | +| `GET` | `/queries/{technique_id}` | Get WQL templates for a technique | +| `GET` | `/docs` | Interactive Swagger documentation | diff --git a/murshid_backend/README.md b/murshid_backend/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3c8e7ae9131131334df33516a3c192aeb22f7a33 --- /dev/null +++ b/murshid_backend/README.md @@ -0,0 +1,156 @@ +# Murshid Backend + +REST API ู„ู…ุดุฑูˆุน "ู…ุฑุดุฏ โ€” ู…ู† ุงู„ุชู†ุจูŠู‡ุงุช ุฅู„ู‰ ุงู„ุชูˆุฌูŠู‡: ุฑุจุท ุชู‚ู†ูŠุงุช MITRE ATT&CK ู„ู…ุญู„ู„ูŠ SOC" + +## ุงู„ุชู‚ู†ูŠุงุช + +- **FastAPI** โ€” REST API +- **MySQL** + **SQLAlchemy** โ€” ู‚ุงุนุฏุฉ ุงู„ุจูŠุงู†ุงุช +- **Alembic** โ€” ู‡ุฌุฑุฉ ุงู„ุฌุฏุงูˆู„ +- **Logistic Regression** โ€” ุงู„ู†ู…ูˆุฐุฌ ุงู„ุฃุณุงุณูŠ ููŠ ู‡ุฐู‡ ุงู„ู…ุฑุญู„ุฉ +- **SecureBERT+** โ€” ุชุถู…ูŠู†ุงุช ู†ุตูŠุฉ +- **Llama 3 8B** โ€” ุชู„ุฎูŠุต ู‚ูˆุงุนุฏ Wazuh + +> ุงู„ู…ู†ุทู‚ ู…ุณุชุฎุฑุฌ ู…ู† `MurshidUIPipeline.ipynb` ุฏูˆู† ุชุนุฏูŠู„ู‡. 
+ +--- + +## ู‡ูŠูƒู„ ุงู„ู…ุดุฑูˆุน + +``` +murshid_backend/ + app/ + main.py โ† ู†ู‚ุทุฉ ุชุดุบูŠู„ FastAPI + config.py + api/routes/ + health.py โ† GET /health + rules.py โ† POST /rules/analyze + GET /results/{rule_id} + queries.py โ† GET /queries/{technique_id} + Admin endpoints + services/ + ml_service.py + rule_service.py + result_service.py + template_service.py + ml/ + sanitizer.py โ† ุชู†ุธูŠู XML + summarizer.py โ† ุชู„ุฎูŠุต Llama + embedder.py โ† SecureBERT+ + logistic_model.py โ† Logistic Regression inference + pipeline.py โ† analyze_rule() ุงู„ุดุงู…ู„ + models/ โ† SQLAlchemy ORM (6 ุฌุฏุงูˆู„ ู…ู† ER Diagram) + schemas/ โ† Pydantic schemas + repositories/ โ† DB access layer + db/ + base.py + session.py + alembic/ + versions/0001_initial_schema.py + requirements.txt + .env.example +``` + +--- + +## ุฌุฏุงูˆู„ ู‚ุงุนุฏุฉ ุงู„ุจูŠุงู†ุงุช (ู…ุณุชุฎุฑุฌุฉ ู…ู† ER Diagram ยง3.2.6) + +| ุฌุฏูˆู„ | ุงู„ู…ุตุฏุฑ ููŠ ุงู„ุชู‚ุฑูŠุฑ | +|------|-------------------| +| `users` | User entity โ€” username, email, password_hash, role | +| `mapping_jobs` | MappingJob entity โ€” job_id, file_name, status, progress, timestamp | +| `rules` | Rule entity โ€” rule_id, embedding_vector, job_id | +| `techniques` | Technique entity โ€” technique_id, technique_name, tactic | +| `rule_technique_mappings` | RuleTechniqueMapping โ€” rule_id, technique_id, confidence_score | +| `query_templates` | QueryTemplate โ€” purpose, wql_query, note, is_active | + +--- + +## ุงู„ุฅุนุฏุงุฏ ูˆุงู„ุชุดุบูŠู„ + +### 1) ู…ุชุทู„ุจุงุช + +- Python 3.10+ +- MySQL 8+ +- GPU ู…ูˆุตู‰ ุจู‡ ู„ู€ Llama 3 8B + +### 2) ุชุซุจูŠุช + +```powershell +cd d:\GP\murshid_backend +python -m venv .venv +.\.venv\Scripts\activate +pip install -r requirements.txt +``` + +### 3) ุฅุนุฏุงุฏ ู‚ุงุนุฏุฉ ุงู„ุจูŠุงู†ุงุช + +ุฅู†ุดุงุก ู‚ุงุนุฏุฉ ุงู„ุจูŠุงู†ุงุช ููŠ MySQL: +```sql +CREATE DATABASE murshid_db CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; +``` + +### 4) ุฅุนุฏุงุฏ `.env` + 
+```powershell
+copy .env.example .env
+```
+
+ุนุฏู‘ู„ูŠ ุงู„ู‚ูŠู…:
+```env
+MURSHID_DB_URL=mysql+pymysql://root:YOUR_PASSWORD@localhost:3306/murshid_db
+MURSHID_MODELS_DIR=d:/GP/Needed
+HF_TOKEN=hf_xxxx
+MURSHID_SKIP_LLM=false
+```
+
+### 5) ุชุฃูƒุฏ ู…ู† ูˆุฌูˆุฏ ู…ู„ูุงุช ุงู„ู†ู…ุงุฐุฌ ููŠ `d:\GP\Needed`
+
+```
+murshid_logreg_pipeline_manual_oof_pcatuned.joblib
+murshid_logreg_thresholds_manual_oof_pcatuned.npy
+murshid_label_columns.json
+```
+
+### 6) ุชุดุบูŠู„ Alembic (ู‡ุฌุฑุฉ ุงู„ุฌุฏุงูˆู„)
+
+```powershell
+alembic upgrade head
+```
+
+### 7) ุชุดุบูŠู„ ุงู„ู€ API
+
+```powershell
+uvicorn app.main:app --reload --host 127.0.0.1 --port 8000
+```
+
+---
+
+## ุงู„ู€ Endpoints
+
+| Method | URL | ุงู„ูˆุตู |
+|--------|-----|--------|
+| `GET` | `/health` | ูุญุต ุญุงู„ุฉ ุงู„ู†ุธุงู… ูˆุงู„ู†ู…ุงุฐุฌ |
+| `POST` | `/rules/analyze` | ุชุญู„ูŠู„ ู‚ุงุนุฏุฉ Wazuh XML ูˆุญูุธ ุงู„ู†ุชุงุฆุฌ |
+| `GET` | `/results/{rule_id}` | ุงุณุชุฑุฌุงุน ุงู„ุชู‚ู†ูŠุงุช ุงู„ู…ุฎุฒู†ุฉ ู„ู…ุนุฑู ุงู„ู‚ุงุนุฏุฉ |
+| `GET` | `/queries/{technique_id}` | ุฌู„ุจ ู‚ูˆุงู„ุจ WQL ู„ุชู‚ู†ูŠุฉ ู…ุนูŠู†ุฉ |
+| `POST` | `/admin/templates` | ุฅุถุงูุฉ ู‚ุงู„ุจ WQL ุฌุฏูŠุฏ (Admin) |
+| `PATCH` | `/admin/templates/{id}` | ุชุนุฏูŠู„ ุฃูˆ ุชุนุทูŠู„ ู‚ุงู„ุจ (Admin) |
+
+### ู…ุซุงู„ โ€” ุชุญู„ูŠู„ ู‚ุงุนุฏุฉ
+
+```bash
+curl -X POST http://127.0.0.1:8000/rules/analyze \
+  -H "Content-Type: application/json" \
+  -d '{"rule_xml": "<rule id=\"597\"><description>Registry Key Entry Deleted.</description></rule>"}'
+```
+
+### ุงู„ุชูˆุซูŠู‚ ุงู„ุชูุงุนู„ูŠ
+
+ุงูุชุญูŠ: **http://127.0.0.1:8000/docs**
+
+---
+
+## ู…ู„ุงุญุธุงุช
+
+- ุงู„ู…ู„ู ุงู„ุฃุตู„ูŠ `MurshidUIPipeline.ipynb` **ู„ู… ูŠูุนุฏูŽู‘ู„** โ€” ุงู„ู…ู†ุทู‚ ู…ู†ุณูˆุฎ ุฅู„ู‰ ุทุจู‚ุฉ `app/ml/`.
+- ุงู„ู†ู…ูˆุฐุฌ ุงู„ู…ุนุชู…ุฏ ููŠ ู‡ุฐู‡ ุงู„ู…ุฑุญู„ุฉ: **Logistic Regression** ูู‚ุท.
+- ู„ุชุดุบูŠู„ ุจุฏูˆู† GPU ู„ู„ุงุฎุชุจุงุฑ ูู‚ุท: ุถุนูŠ `MURSHID_SKIP_LLM=true` ููŠ `.env` (ุนู†ุฏู‡ุง ุชุนู…ู„ `/rules/analyze` ููŠ ูˆุถุน LOCAL/LITE ุจุฏู‚ุฉ ุฃู‚ู„ุŒ ูˆู„ุง ุชุนูŠุฏ 503 ุฅู„ุง ุฅุฐุง ู„ู… ุชูุญู…ูŽู‘ู„ ู…ู„ูุงุช ู†ู…ูˆุฐุฌ Logistic Regression).
diff --git a/murshid_backend/TECHNICAL_REPORT.md b/murshid_backend/TECHNICAL_REPORT.md new file mode 100644 index 0000000000000000000000000000000000000000..0bce7ea061d1e3fceb3469629d1f91ee04a754d3 --- /dev/null +++ b/murshid_backend/TECHNICAL_REPORT.md @@ -0,0 +1,322 @@ +# ุชู‚ุฑูŠุฑ ุชู‚ู†ูŠ ู…ูุตู‘ู„ โ€” ู…ุดุฑูˆุน ู…ูุฑุดูุฏ (Murshid) +## From Alerts to Guidance: MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts + +--- + +## 1. ู†ุธุฑุฉ ุนุงู…ุฉ + +ู…ูุฑุดูุฏ ู†ุธุงู… ุฐูƒูŠ ูŠุญูˆู‘ู„ ุชู†ุจูŠู‡ุงุช ู‚ูˆุงุนุฏ Wazuh XML ุฅู„ู‰ ุชู‚ู†ูŠุงุช MITRE ATT&CK ู…ูุฑุชู‘ุจุฉ ุจุฏุฑุฌุงุช ุซู‚ุฉุŒ ูˆูŠูู†ุชุฌ ุงุณุชุนู„ุงู…ุงุช ุชุญู‚ูŠู‚ WQL ุฌุงู‡ุฒุฉ ู„ู…ุญู„ู„ูŠ SOC. + +``` +ู‚ุงุนุฏุฉ Wazuh XML + โ†“ + Sanitization (ุญุฐู if_sid, group, mitre) + โ†“ + LLaMA 3 8B (ุชู„ุฎูŠุต ุจุฌู…ู„ุฉ ูˆุงุญุฏุฉ) + โ†“ + SecureBERT+ (768-dim embedding) + โ†“ + Logistic Regression + PCA (ุชุตู†ูŠู) + โ†“ + ุชู‚ู†ูŠุงุช MITRE ATT&CK + Confidence Scores + โ†“ + ู‚ูˆุงู„ุจ WQL ู„ู„ุชุญู‚ูŠู‚ +``` + +--- + +## 2. 
ู‡ูŠูƒู„ ุงู„ู…ุดุฑูˆุน ุงู„ูƒุงู…ู„ + +``` +d:\GP\ +โ”œโ”€โ”€ MurshidUIPipeline.ipynb โ† ุงู„ุฏูุชุฑ ุงู„ุฃุตู„ูŠ (ู„ุง ูŠูุนุฏูŽู‘ู„) +โ”œโ”€โ”€ Needed\ โ† ู…ู„ูุงุช ุงู„ู†ู…ุงุฐุฌ ุงู„ู…ุฏุฑู‘ุจุฉ +โ”‚ โ”œโ”€โ”€ murshid_logreg_pipeline_manual_oof_pcatuned.joblib +โ”‚ โ”œโ”€โ”€ murshid_logreg_thresholds_manual_oof_pcatuned.npy +โ”‚ โ”œโ”€โ”€ murshid_svmlinear_per_label_thresholds.joblib +โ”‚ โ””โ”€โ”€ murshid_label_columns.json (20 ุชู‚ู†ูŠุฉ) +โ”œโ”€โ”€ murshid_backend\ โ† ุฎุฏู…ุฉ FastAPI +โ”‚ โ”œโ”€โ”€ app\ +โ”‚ โ”‚ โ”œโ”€โ”€ main.py +โ”‚ โ”‚ โ”œโ”€โ”€ config.py +โ”‚ โ”‚ โ”œโ”€โ”€ api\routes\ +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ health.py GET /health +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ rules.py POST /rules/analyze | GET /results/{rule_id} +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ queries.py GET /queries/{technique_id} | POST,PATCH /admin/templates +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ stats.py GET /api/stats +โ”‚ โ”‚ โ”‚ โ””โ”€โ”€ db_viewer.py GET /api/db/{summary|rules|mappings|...} +โ”‚ โ”‚ โ”œโ”€โ”€ ml\ +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ sanitizer.py ุชู†ุธูŠู XML +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ summarizer.py LLaMA inference +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ embedder.py SecureBERT+ embeddings +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ logistic_model.py LogReg inference (PRIMARY) +โ”‚ โ”‚ โ”‚ โ””โ”€โ”€ pipeline.py ุชู†ุณูŠู‚ ุงู„ู…ุฑุงุญู„ (FULL|LOCAL|LITE) +โ”‚ โ”‚ โ”œโ”€โ”€ models\ SQLAlchemy ORM +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ user.py +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ mapping_job.py +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ rule.py +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ technique.py +โ”‚ โ”‚ โ”‚ โ”œโ”€โ”€ rule_technique_mapping.py +โ”‚ โ”‚ โ”‚ โ””โ”€โ”€ query_template.py +โ”‚ โ”‚ โ”œโ”€โ”€ schemas\ Pydantic schemas +โ”‚ โ”‚ โ”œโ”€โ”€ services\ Business logic +โ”‚ โ”‚ โ”œโ”€โ”€ repositories\ DB access +โ”‚ โ”‚ โ””โ”€โ”€ db\ SQLAlchemy session +โ”‚ โ”œโ”€โ”€ alembic\ Migrations +โ”‚ โ”œโ”€โ”€ murshid.db SQLite database +โ”‚ โ”œโ”€โ”€ .env +โ”‚ โ””โ”€โ”€ requirements.txt +โ””โ”€โ”€ murshid_frontend\ ูˆุงุฌู‡ุฉ React + โ””โ”€โ”€ index.html +``` + +--- + +## 3. 
ุทุจู‚ุฉ ุงู„ุจุงูƒู†ุฏ (FastAPI) + +### 3.1 ุงู„ู€ Endpoints + +| Method | URL | ุงู„ูˆุตู | Actor | +|--------|-----|--------|-------| +| `GET` | `/health` | ุญุงู„ุฉ ุงู„ู†ุธุงู… + pipeline mode + ู…ู„ูุงุช ุงู„ู†ู…ุงุฐุฌ | All | +| `GET` | `/api/stats` | ุฅุญุตุงุฆูŠุงุช Dashboard (KPIs + Technique Frequency) | All | +| `GET` | `/api/db/summary` | ุนุฏุฏ ุงู„ุตููˆู ููŠ ูƒู„ ุฌุฏูˆู„ | Testing | +| `GET` | `/api/db/rules` | ุฌู…ูŠุน ุงู„ู‚ูˆุงุนุฏ ุงู„ู…ุฎุฒู‘ู†ุฉ | Testing | +| `GET` | `/api/db/mappings` | ุฌู…ูŠุน ู…ุทุงุจู‚ุงุช ุงู„ู‚ูˆุงุนุฏ-ุงู„ุชู‚ู†ูŠุงุช | Testing | +| `GET` | `/api/db/techniques` | ุฌู…ูŠุน ุชู‚ู†ูŠุงุช MITRE ุงู„ู…ุฎุฒู‘ู†ุฉ | Testing | +| `GET` | `/api/db/templates` | ุฌู…ูŠุน ู‚ูˆุงู„ุจ WQL | Testing | +| `POST` | `/rules/analyze` | ุชุญู„ูŠู„ ู‚ุงุนุฏุฉ XML โ†’ ุชุฎุฒูŠู† ุงู„ู†ุชุงุฆุฌ | Admin | +| `GET` | `/results/{rule_id}` | ุงุณุชุฑุฌุงุน ุชู‚ู†ูŠุงุช ู‚ุงุนุฏุฉ ู…ุญุฏุฏุฉ (Figure 4-11/12) | SOC Analyst | +| `GET` | `/queries/{technique_id}` | ู‚ูˆุงู„ุจ WQL ู„ุชู‚ู†ูŠุฉ ู…ุญุฏุฏุฉ | SOC Analyst | +| `POST` | `/admin/templates` | ุฅุถุงูุฉ ู‚ุงู„ุจ WQL ุฌุฏูŠุฏ | Admin | +| `PATCH` | `/admin/templates/{id}` | ุชุนุฏูŠู„/ุชุนุทูŠู„ ู‚ุงู„ุจ | Admin | + +### 3.2 ู…ุนู…ุงุฑูŠุฉ ุงู„ุทุจู‚ุงุช + +``` +HTTP Request + โ”‚ + โ–ผ +API Layer (FastAPI routes) + โ”‚ validates input (Pydantic) + โ–ผ +Service Layer + โ”‚ orchestrates business logic + โ–ผ +ML Layer Repository Layer + โ”‚ โ”‚ + โ–ผ โ–ผ +Pipeline SQLAlchemy ORM +(sanitizeโ†’embedโ†’classify) โ”‚ + โ”‚ โ–ผ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ†’ SQLite DB +``` + +### 3.3 ู‚ุงุนุฏุฉ ุงู„ุจูŠุงู†ุงุช (SQLite + SQLAlchemy) + +ู…ุณุชุฎุฑุฌุฉ ุญุฑููŠุงู‹ ู…ู† ER Diagram (ยง3.2.6 ู…ู† ุงู„ุชู‚ุฑูŠุฑ): + +| ุงู„ุฌุฏูˆู„ | ุงู„ุฃุนู…ุฏุฉ ุงู„ุฑุฆูŠุณูŠุฉ | ุงู„ู…ุตุฏุฑ ููŠ ุงู„ุชู‚ุฑูŠุฑ | +|--------|------------------|-------------------| +| `users` | user_id, username, email, password_hash, role | User entity | +| `mapping_jobs` | job_id, user_id, file_name, status, progress, 
timestamp | MappingJob entity | +| `rules` | rule_id (PK), job_id, embedding_vector | Rule entity | +| `techniques` | technique_id (PK), technique_name, tactic | Technique entity | +| `rule_technique_mappings` | mapping_id, rule_id, technique_id, confidence_score | RuleTechniqueMapping | +| `query_templates` | template_id, technique_id, purpose, wql_query, note, is_active | QueryTemplate | + +> Index ุนู„ู‰ `rule_id` ููŠ `rule_technique_mappings` (Use Case 6 ยง3.2.7) + +--- + +## 4. ุทุจู‚ุฉ ML + +### 4.1 ู…ุฑุงุญู„ ุงู„ู€ Pipeline (ู…ู† ุงู„ุฏูุชุฑ) + +#### ุงู„ู…ุฑุญู„ุฉ 1: Sanitization +```python +# ml/sanitizer.py โ€” ู…ู† cell 10 ููŠ ุงู„ุฏูุชุฑ +REMOVE_TAGS_ANYWHERE = {"mitre", "if_sid", "group", "if_group"} +# ูŠูุญุฐู: group tags, if_sid, mitre IDs, compliance tags +# ูŠุจู‚ู‰: description, id, category, decoded_as, info +``` + +#### ุงู„ู…ุฑุญู„ุฉ 2: LLM Summarization (LLaMA 3 8B) +```python +# ml/summarizer.py โ€” ู…ู† cell 11 ููŠ ุงู„ุฏูุชุฑ +# Input: sanitized XML +# Prompt: "Write EXACTLY ONE sentence describing the observable event pattern" +# Output: JSON {"summary": "Detects ..."} +# Constraints: 7-18 words, ูŠุจุฏุฃ ุจู€ Detects/Monitors/... +``` + +#### ุงู„ู…ุฑุญู„ุฉ 3: Paragraph Construction +```python +# ml/embedder.py โ€” ู…ู† cell 12 ููŠ ุงู„ุฏูุชุฑ +text = f"{summary}. {description}." +# ู…ุซุงู„: "Detects deletion of global group. Windows: Security Enabled Global Group Deleted." 
+``` + +#### ุงู„ู…ุฑุญู„ุฉ 4: SecureBERT+ Embedding +```python +# ml/embedder.py โ€” ู…ู† cell 15 ููŠ ุงู„ุฏูุชุฑ +# Model: ehsanaghaei/SecureBERT_Plus +# MAX_LEN: 512 tokens, chunks +# Pooling: Mean pooling across tokens โ†’ 768-dim vector +# Normalization: L2 +``` + +#### ุงู„ู…ุฑุญู„ุฉ 5: Logistic Regression Inference +```python +# ml/logistic_model.py โ€” ู…ู† cell 18-19 ููŠ ุงู„ุฏูุชุฑ +proba = logreg_model.predict_proba(X_user) +proba = proba.reshape(-1) +pred = (proba >= logreg_thr).astype(int) +conf = proba * 100 +gap = proba - logreg_thr +# ุชูุฑุฌุน ุฌู…ูŠุน ุงู„ู€ 20 ุชู‚ู†ูŠุฉ ู…ุฑุชู‘ุจุฉ ุชู†ุงุฒู„ูŠุงู‹ +``` + +### 4.2 ุฃูˆุถุงุน ุงู„ุชุดุบูŠู„ + +| ุงู„ูˆุถุน | ุงู„ุดุฑุท | ุงู„ุฏู‚ุฉ | ุงู„ุงุณุชุฎุฏุงู… | +|-------|--------|-------|-----------| +| **FULL** | LLaMA + SecureBERT + LogReg | 100% (ู…ุทุงุจู‚ ู„ู„ุฏูุชุฑ) | Colab/GPU | +| **LOCAL** | SecureBERT + LogReg (ุจุฏูˆู† LLaMA) | ~95% (ูˆุตู ุจุฏูˆู† ู…ู„ุฎุต) | ุงู„ุฌู‡ุงุฒ ุงู„ู…ุญู„ูŠ | +| **LITE** | LogReg ูู‚ุท (ุจุฏูˆู† torch) | ู…ู†ุฎูุถุฉ (ุนุดูˆุงุฆูŠ) | ุงุฎุชุจุงุฑ ุงู„ุจู†ูŠุฉ ูู‚ุท | + +--- + +## 5. 
ุทุจู‚ุฉ ุงู„ูุฑูˆู†ุช (React + Tailwind + Chart.js) + +### 5.1 ุงู„ุตูุญุงุช (CDN-based React, ุจุฏูˆู† Build Step) + +| ุงู„ุตูุญุฉ | ID | ุงู„ู…ุณุชุฎุฏู… | ุงู„ูˆุตู | +|--------|-----|----------|--------| +| Login | โ€” | All | ุชุณุฌูŠู„ ุฏุฎูˆู„ + ุงุฎุชูŠุงุฑ ุฏูˆุฑ | +| Dashboard | `dashboard` | All | KPIs + MITRE Technique Frequency Chart | +| Rule Lookup | `rules` | SOC Analyst | ุจุญุซ ุจู€ Rule ID โ†’ Figure 4-11 + Figure 4-12 | +| ู†ุชุงุฆุฌ DB | `dbviewer` | All | ุงุณุชุนุฑุงุถ ู‚ุงุนุฏุฉ ุงู„ุจูŠุงู†ุงุช ู„ู„ุงุฎุชุจุงุฑ | +| Rule Mapping | `admin` | Admin | ุฑูุน XML + ุชุญู„ูŠู„ + ุฌุฏูˆู„ ุงู„ุชู‚ุฏู… | +| WQL Templates | `templates` | Admin | ุฅุฏุงุฑุฉ ู‚ูˆุงู„ุจ ุงู„ุงุณุชุนู„ุงู…ุงุช | +| Settings | `settings` | All | ู…ู„ู ุดุฎุตูŠ + Dark Mode + ุฃู„ูˆุงู† | + +### 5.2 ุงู„ู€ Figures ูƒู…ุง ููŠ ุงู„ุชู‚ุฑูŠุฑ + +| Figure | ุงู„ุตูุญุฉ | ุงู„ู…ูƒูˆู‘ู† | +|--------|--------|---------| +| Figure 4-10 | Rule Lookup | Search bar + Rule ID input | +| Figure 4-11 | Rule Lookup | `TechniqueDistributionChart` โ€” Horizontal bar chart (Top 5, ู…ูู„ูˆูŽู‘ู† H/M/L) | +| Figure 4-12 | Rule Lookup | Investigation Queries table (Primary + Secondary โ‰ฅ50%) | +| Figure 4-13 | Admin | Rule Mapping Panel (paste XML + Submit) | +| Figure 4-14 | Admin | Mapping Progress Table (Job ID, Status, Progress) | +| Figure 4-9 | Dashboard | KPIs + Technique Frequency Bar Chart | + +### 5.3 ุฑุจุท ุงู„ูุฑูˆู†ุช ุจุงู„ุจุงูƒู†ุฏ + +```javascript +const BASE = 'http://127.0.0.1:8000'; +// CORS ู…ููุนูŽู‘ู„ ููŠ ุงู„ุจุงูƒู†ุฏ ู„ู€ http://localhost:5173 ูˆ http://127.0.0.1:5173 +// ุงู„ูุฑูˆู†ุช ูŠูุฎุฏูŽู‘ู… ู…ุจุงุดุฑุฉู‹ ู…ู† FastAPI ุนุจุฑ StaticFiles +``` + +--- + +## 6. 
ู…ุฎุทุท ุชุฏูู‚ ุงู„ุจูŠุงู†ุงุช ุงู„ูƒุงู…ู„ + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ SOC Analyst / Admin โ”‚ +โ”‚ (murshid_frontend/index.html) โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ HTTP/JSON + โ–ผ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ FastAPI (port 8000) โ”‚ +โ”‚ โ”‚ +โ”‚ /health โ†’ pipeline status โ”‚ +โ”‚ POST /rules/analyze: โ”‚ +โ”‚ 1. sanitizer.py โ†’ clean XML โ”‚ +โ”‚ 2. summarizer.py โ†’ LLaMA summary โ”‚ โ† FULL mode only +โ”‚ 3. embedder.py โ†’ 768-dim vector โ”‚ +โ”‚ 4. logistic_model โ†’ proba + scores โ”‚ +โ”‚ 5. rule_repo โ†’ save to DB โ”‚ +โ”‚ โ”‚ +โ”‚ GET /results/{id} โ†’ from DB โ”‚ +โ”‚ GET /queries/{id} โ†’ WQL templates โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ SQLAlchemy + โ–ผ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ SQLite (murshid.db) โ”‚ +โ”‚ rules | techniques | mappings โ”‚ +โ”‚ query_templates | mapping_jobs โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +``` + +--- + +## 7. 
ุงู„ุชุดุบูŠู„
+
+### ุงู„ู…ุชุทู„ุจุงุช
+- Python 3.12 (ุนุจุฑ uv)
+- ู…ู„ูุงุช ุงู„ู†ู…ุงุฐุฌ ููŠ `d:\GP\Needed\`
+- ุงุชุตุงู„ ุฅู†ุชุฑู†ุช (ู„ู€ SecureBERT+ ู…ู† HuggingFace ุฃูˆู„ ู…ุฑุฉ)
+
+### ุชุดุบูŠู„ ุงู„ุฎุงุฏู…
+```powershell
+cd d:\GP\murshid_backend
+.venv\Scripts\python.exe -m uvicorn app.main:app --host 127.0.0.1 --port 8000
+```
+
+### ุงู„ุฑูˆุงุจุท
+| ุงู„ุฑุงุจุท | ุงู„ูˆุตู |
+|--------|--------|
+| http://127.0.0.1:8000/index.html | ุงู„ูˆุงุฌู‡ุฉ ุงู„ุฑุฆูŠุณูŠุฉ |
+| http://127.0.0.1:8000/docs | Swagger API Documentation |
+| http://127.0.0.1:8000/health | ูุญุต ุญุงู„ุฉ ุงู„ู†ุธุงู… |
+| http://127.0.0.1:8000/api/db/summary | ู…ู„ุฎุต ู‚ุงุนุฏุฉ ุงู„ุจูŠุงู†ุงุช |
+
+### ุงุฎุชุจุงุฑ ุณุฑูŠุน
+```powershell
+# 1. ุชุญู„ูŠู„ ู‚ุงุนุฏุฉ
+$body = '{"rule_xml":"<rule id=\"597\"><description>Registry Key Entry Deleted.</description></rule>"}'
+Invoke-RestMethod -Uri "http://127.0.0.1:8000/rules/analyze" -Method POST -ContentType "application/json" -Body $body
+
+# 2. ุงุณุชุฑุฌุงุน ุงู„ู†ุชุงุฆุฌ
+Invoke-RestMethod "http://127.0.0.1:8000/results/597"
+
+# 3. ุฅุถุงูุฉ ู‚ุงู„ุจ WQL
+$t = '{"technique_id":"T1112","purpose":"Detect registry modification","wql_query":"agent.name:${HOST} AND rule.description:\"registry\"","note":"Replace ${HOST}"}'
+Invoke-RestMethod -Uri "http://127.0.0.1:8000/admin/templates" -Method POST -ContentType "application/json" -Body $t
+
+# 4. ุฌู„ุจ ุงู„ุงุณุชุนู„ุงู…ุงุช
+Invoke-RestMethod "http://127.0.0.1:8000/queries/T1112"
+```
+
+---
+
+## 8. ุงู„ูุฑู‚ ุจูŠู† FULL mode (Colab) ูˆ LOCAL mode (ุงู„ุฌู‡ุงุฒ)
+
+| | Colab (FULL) | ุงู„ุฌู‡ุงุฒ ุงู„ู…ุญู„ูŠ (LOCAL) |
+|--|-------------|----------------------|
+| Input text | `"Detects deletion of a security-enabled global group. 
Windows: Security Enabled Global Group Deleted."` | `"Windows: Security Enabled Global Group Deleted"` | +| T1484 proba | **0.9476 (94.76%)** | **0.8929 (89.29%)** | +| ุณุจุจ ุงู„ูุฑู‚ | LLaMA ูŠูุซุฑูŠ ุงู„ู†ุต ุจุณูŠุงู‚ ุฏู„ุงู„ูŠ | ุงู„ูˆุตู ูู‚ุท ุจุฏูˆู† ุฅุซุฑุงุก | +| ุงู„ู‚ุฑุงุฑ ุงู„ุตุญูŠุญ | T1484 โœ… | T1484 โœ… | + +**ุงู„ุงุณุชู†ุชุงุฌ:** ุงู„ู‚ุฑุงุฑ ุงู„ู†ู‡ุงุฆูŠ ุตุญูŠุญ ููŠ ูƒู„ุง ุงู„ูˆุถุนูŠู† โ€” ุงู„ุงุฎุชู„ุงู ููŠ ุฏุฑุฌุฉ ุงู„ุซู‚ุฉ ูู‚ุท. + +--- + +## 9. ุญุงู„ุงุช ุงู„ุงุณุชุฎุฏุงู… ุงู„ู…ูู†ููŽู‘ุฐุฉ (ู…ู† ุงู„ุชู‚ุฑูŠุฑ) + +| Use Case | ุงู„ูˆุตู | ู…ูู†ููŽู‘ุฐ | +|----------|--------|---------| +| UC1 | View techniques and scores for a rule | โœ… `GET /results/{rule_id}` | +| UC2 | View WQL investigation queries | โœ… `GET /queries/{technique_id}` | +| UC3 | Copy and fill investigation query | โœ… ุฒุฑ Copy ููŠ ุงู„ูุฑูˆู†ุช | +| UC4 | Upload Wazuh rule(s) | โœ… Admin Panel | +| UC5 | Process rule via ML pipeline | โœ… `POST /rules/analyze` | +| UC6 | Store mapped techniques in DB | โœ… ุชู„ู‚ุงุฆูŠ ุจุนุฏ analyze | +| UC7 | Manage WQL templates repository | โœ… `POST/PATCH /admin/templates` | diff --git a/murshid_backend/alembic.ini b/murshid_backend/alembic.ini new file mode 100644 index 0000000000000000000000000000000000000000..0eda848ee5b8059ac2c9504b0805ce277162ba3e --- /dev/null +++ b/murshid_backend/alembic.ini @@ -0,0 +1,38 @@ +[alembic] +script_location = alembic +prepend_sys_path = . 
+sqlalchemy.url = sqlite:///murshid.db + +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/murshid_backend/alembic/env.py b/murshid_backend/alembic/env.py new file mode 100644 index 0000000000000000000000000000000000000000..292be508a12fafc3b9c2a45726e7acf4809a2d73 --- /dev/null +++ b/murshid_backend/alembic/env.py @@ -0,0 +1,52 @@ +import sys +from logging.config import fileConfig +from pathlib import Path + +from sqlalchemy import engine_from_config, pool + +from alembic import context + +# make app importable +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from app.config import settings +from app.db.base import Base +import app.models # noqa: F401 โ€” registers all models with Base.metadata + +config = context.config +config.set_main_option("sqlalchemy.url", settings.murshid_db_url) + +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +target_metadata = Base.metadata + + +def run_migrations_offline() -> None: + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + with connectable.connect() as connection: + context.configure(connection=connection, target_metadata=target_metadata) 
+ with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/murshid_backend/alembic/script.py.mako b/murshid_backend/alembic/script.py.mako new file mode 100644 index 0000000000000000000000000000000000000000..17dcba0ef89f896010374bbb3db808071268aa4c --- /dev/null +++ b/murshid_backend/alembic/script.py.mako @@ -0,0 +1,25 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/murshid_backend/alembic/versions/0001_initial_schema.py b/murshid_backend/alembic/versions/0001_initial_schema.py new file mode 100644 index 0000000000000000000000000000000000000000..c0bd898f311cf11808d3a093b923ea592614aaf2 --- /dev/null +++ b/murshid_backend/alembic/versions/0001_initial_schema.py @@ -0,0 +1,87 @@ +"""initial schema โ€” all 6 tables from ER Diagram ยง3.2.6 + +Revision ID: 0001 +Revises: +Create Date: 2026-04-08 +""" + +from typing import Sequence, Union + +import sqlalchemy as sa +from alembic import op + +revision: str = "0001" +down_revision: Union[str, None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "users", + sa.Column("user_id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("username", sa.String(100), unique=True, nullable=False), + sa.Column("email", 
sa.String(255), unique=True, nullable=False), + sa.Column("password_hash", sa.String(255), nullable=False), + sa.Column("role", sa.String(20), nullable=False, server_default="analyst"), + ) + + op.create_table( + "mapping_jobs", + sa.Column("job_id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("user_id", sa.Integer(), sa.ForeignKey("users.user_id"), nullable=False), + sa.Column("file_name", sa.String(255), nullable=False), + sa.Column("rules_count", sa.Integer(), server_default="0"), + sa.Column("status", sa.String(20), nullable=False, server_default="pending"), + sa.Column("progress", sa.Integer(), server_default="0"), + sa.Column("timestamp", sa.DateTime(), server_default=sa.func.now()), + ) + + op.create_table( + "rules", + sa.Column("rule_id", sa.String(50), primary_key=True), + sa.Column("job_id", sa.Integer(), sa.ForeignKey("mapping_jobs.job_id"), nullable=True), + sa.Column("embedding_vector", sa.Text(), nullable=True), + ) + + op.create_table( + "techniques", + sa.Column("technique_id", sa.String(20), primary_key=True), + sa.Column("technique_name", sa.String(255), nullable=False), + sa.Column("tactic", sa.String(100), nullable=True), + ) + + op.create_table( + "rule_technique_mappings", + sa.Column("mapping_id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column("rule_id", sa.String(50), sa.ForeignKey("rules.rule_id"), nullable=False), + sa.Column( + "technique_id", sa.String(20), sa.ForeignKey("techniques.technique_id"), nullable=False + ), + sa.Column("confidence_score", sa.Float(), nullable=False), + ) + # Index on rule_id โ€” Use Case 6 ยง3.2.7 + op.create_index("ix_rule_technique_rule_id", "rule_technique_mappings", ["rule_id"]) + + op.create_table( + "query_templates", + sa.Column("template_id", sa.Integer(), primary_key=True, autoincrement=True), + sa.Column( + "technique_id", sa.String(20), sa.ForeignKey("techniques.technique_id"), nullable=False + ), + sa.Column("purpose", sa.String(255), nullable=True), + 
sa.Column("wql_query", sa.Text(), nullable=False), + sa.Column("note", sa.Text(), nullable=True), + sa.Column("is_active", sa.Boolean(), nullable=False, server_default="1"), + ) + + +def downgrade() -> None: + op.drop_table("query_templates") + op.drop_index("ix_rule_technique_rule_id", table_name="rule_technique_mappings") + op.drop_table("rule_technique_mappings") + op.drop_table("techniques") + op.drop_table("rules") + op.drop_table("mapping_jobs") + op.drop_table("users") + pass # SQLite: no custom types to drop diff --git a/murshid_backend/app/__init__.py b/murshid_backend/app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b4873060f5df02cc130c2c7b5a1a40b8021952e0 --- /dev/null +++ b/murshid_backend/app/__init__.py @@ -0,0 +1 @@ +"""Murshid backend package.""" diff --git a/murshid_backend/app/api/__init__.py b/murshid_backend/app/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c9f3b5cfaec50d63556a3347dcd266a77f433af2 --- /dev/null +++ b/murshid_backend/app/api/__init__.py @@ -0,0 +1 @@ +"""API layer โ€” FastAPI routers.""" diff --git a/murshid_backend/app/api/routes/__init__.py b/murshid_backend/app/api/routes/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f36ec2fa9b2d5f19160f5c9b6b2319bc6bec36aa --- /dev/null +++ b/murshid_backend/app/api/routes/__init__.py @@ -0,0 +1 @@ +"""Route modules.""" diff --git a/murshid_backend/app/api/routes/db_viewer.py b/murshid_backend/app/api/routes/db_viewer.py new file mode 100644 index 0000000000000000000000000000000000000000..d2c1efe8c6da1bb3f5d86c28c9dd8e511c641507 --- /dev/null +++ b/murshid_backend/app/api/routes/db_viewer.py @@ -0,0 +1,122 @@ +""" +GET /api/db/rules โ€” all rules in DB +GET /api/db/mappings โ€” all rule-technique mappings +GET /api/db/techniques โ€” all techniques +GET /api/db/templates โ€” all query templates +GET /api/db/summary โ€” counts per table +POST /api/db/import-excel โ€” import WQL 
templates from Excel file +""" + +from fastapi import APIRouter, Depends, HTTPException, Query +from sqlalchemy import func +from sqlalchemy.orm import Session + +from app.db.session import get_db +from app.models.mapping_job import MappingJob +from app.models.query_template import QueryTemplate +from app.models.rule import Rule +from app.models.rule_technique_mapping import RuleTechniqueMapping +from app.models.technique import Technique + +router = APIRouter(prefix="/api/db", tags=["db-viewer"]) + + +@router.get("/summary") +def db_summary(db: Session = Depends(get_db)): + return { + "rules": db.query(func.count(Rule.rule_id)).scalar(), + "techniques": db.query(func.count(Technique.technique_id)).scalar(), + "rule_mappings": db.query(func.count(RuleTechniqueMapping.mapping_id)).scalar(), + "query_templates": db.query(func.count(QueryTemplate.template_id)).scalar(), + "mapping_jobs": db.query(func.count(MappingJob.job_id)).scalar(), + } + + +@router.get("/rules") +def all_rules(db: Session = Depends(get_db)): + rows = db.query(Rule).order_by(Rule.rule_id).all() + return [ + { + "rule_id": r.rule_id, + "job_id": r.job_id, + "has_embedding": r.embedding_vector is not None, + } + for r in rows + ] + + +@router.get("/mappings") +def all_mappings(db: Session = Depends(get_db)): + rows = ( + db.query(RuleTechniqueMapping) + .order_by( + RuleTechniqueMapping.rule_id, + RuleTechniqueMapping.confidence_score.desc(), + ) + .all() + ) + return [ + { + "mapping_id": m.mapping_id, + "rule_id": m.rule_id, + "technique_id": m.technique_id, + "confidence_score": round(m.confidence_score, 4), + "confidence_pct": round(m.confidence_score * 100, 2), + } + for m in rows + ] + + +@router.get("/techniques") +def all_techniques(db: Session = Depends(get_db)): + rows = db.query(Technique).order_by(Technique.technique_id).all() + return [ + { + "technique_id": t.technique_id, + "technique_name": t.technique_name, + "tactic": t.tactic, + } + for t in rows + ] + + +@router.get("/templates") 
+def all_templates(db: Session = Depends(get_db)): + rows = db.query(QueryTemplate).order_by(QueryTemplate.technique_id, QueryTemplate.template_id).all() + return [ + { + "template_id": t.template_id, + "technique_id": t.technique_id, + "purpose": t.purpose, + "wql_query": t.wql_query, + "note": t.note, + "is_active": t.is_active, + } + for t in rows + ] + + +@router.post("/import-excel") +def import_excel_templates( + replace: bool = Query(False, description="Update existing templates if True"), + db: Session = Depends(get_db), +): + """ + Import WQL query templates from the Excel file: + murshid_query_template_structure_clean_shared.xlsx + + The file is read from MURSHID_MODELS_DIR or the GP root folder. + Pass ?replace=true to overwrite existing templates. + """ + try: + from scripts.import_excel_templates import run + result = run(db, replace=replace) + except FileNotFoundError as e: + raise HTTPException(status_code=404, detail=str(e)) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + if "error" in result: + raise HTTPException(status_code=404, detail=result["error"]) + + return result diff --git a/murshid_backend/app/api/routes/health.py b/murshid_backend/app/api/routes/health.py new file mode 100644 index 0000000000000000000000000000000000000000..79ce5be0efbfec6b3d34e1d0ac08f61dbeb83cf6 --- /dev/null +++ b/murshid_backend/app/api/routes/health.py @@ -0,0 +1,73 @@ +"""GET /health โ€” system readiness check with clear pipeline mode info.""" + +from pathlib import Path + +from fastapi import APIRouter + +from app.config import settings +from app.ml.pipeline import _store, is_ready + +router = APIRouter(tags=["health"]) + +try: + import torch + _CUDA = torch.cuda.is_available() + _TORCH = True + _TORCH_ERR = None +except (ImportError, OSError) as _e: + _CUDA = False + _TORCH = False + _TORCH_ERR = str(_e) + + +def _check_model_files() -> dict: + base = Path(settings.murshid_models_dir).resolve() + files = { + "logreg_joblib": 
base / settings.logreg_joblib, + "logreg_thresholds": base / settings.logreg_thresholds_npy, + "label_columns": base / settings.label_columns_json, + } + return {k: v.is_file() for k, v in files.items()} + + +@router.get("/health") +def health(): + model_files = _check_model_files() + all_files_ok = all(model_files.values()) + + if _store.llama_model is not None: + mode = "full" + mode_desc = "LLaMA + SecureBERT+ + LogReg" + elif _store.embedder is not None and _store.logreg is not None: + mode = "local" + mode_desc = "SecureBERT+ + LogReg (no LLaMA โ€” using description as text)" + elif _store.logreg is not None: + mode = "lite" + mode_desc = "LogReg only (no embedder โ€” random vectors, testing only)" + else: + mode = "not_ready" + mode_desc = "No ML models loaded" + + return { + "status": "ok", + "pipeline_ready": is_ready(), + "pipeline_mode": mode, + "pipeline_description": mode_desc, + "analyze_available": _store.logreg is not None, + "components": { + "llama_loaded": _store.llama_model is not None, + "embedder_loaded": _store.embedder is not None, + "logreg_loaded": _store.logreg is not None, + "torch_installed": _TORCH, + "cuda_available": _CUDA, + "torch_error": _TORCH_ERR, + }, + "model_files": model_files, + "all_model_files_present": all_files_ok, + "models_dir": str(settings.murshid_models_dir.resolve()), + "skip_llm_env": settings.murshid_skip_llm, + "next_step": ( + "POST /rules/analyze is ready!" if _store.logreg is not None + else "Copy .joblib and .npy files to MURSHID_MODELS_DIR and restart." + ), + } diff --git a/murshid_backend/app/api/routes/queries.py b/murshid_backend/app/api/routes/queries.py new file mode 100644 index 0000000000000000000000000000000000000000..949ab5bb46ec6c01fb126c18de64eebe0303371d --- /dev/null +++ b/murshid_backend/app/api/routes/queries.py @@ -0,0 +1,78 @@ +""" +GET /queries/{technique_id} โ€” SOC Analyst: fetch WQL templates. +POST /admin/templates โ€” Admin: add new template. 
"""
SOC Analyst / Admin endpoints for WQL query templates.

GET   /queries/{technique_id}        — fetch active WQL templates (Use Case 2 — §3.2.7)
POST  /admin/templates               — add a new template      (Use Case 7 — §3.2.7)
PATCH /admin/templates/{template_id} — update/disable template (Use Case 7 — §3.2.7)
"""

from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy.orm import Session

from app.db.session import get_db
from app.schemas.query import QueryTemplateIn, QueryTemplateOut, QueryTemplateUpdate
from app.services.template_service import TemplateService

router = APIRouter(tags=["queries"])


def _get_template_service(db: Session = Depends(get_db)) -> TemplateService:
    """Build a TemplateService bound to the request-scoped DB session."""
    return TemplateService(db=db)


@router.get("/queries/{technique_id}", response_model=list[QueryTemplateOut])
def get_queries(
    technique_id: str,
    svc: TemplateService = Depends(_get_template_service),
):
    """Return every active WQL template for the given MITRE technique (Use Case 2 — §3.2.7)."""
    templates = svc.get_queries_for_technique(technique_id)
    if not templates:
        raise HTTPException(
            status_code=404,
            detail=f"No active query templates found for technique '{technique_id}'.",
        )
    return [QueryTemplateOut(**tpl) for tpl in templates]


@router.post("/admin/templates", response_model=QueryTemplateOut, status_code=201)
def add_template(
    body: QueryTemplateIn,
    svc: TemplateService = Depends(_get_template_service),
):
    """Admin: register a new WQL template (Use Case 7 — §3.2.7)."""
    created = svc.add_template(
        technique_id=body.technique_id,
        purpose=body.purpose,
        wql_query=body.wql_query,
        note=body.note,
    )
    return QueryTemplateOut(**created)


@router.patch("/admin/templates/{template_id}", response_model=QueryTemplateOut)
def update_template(
    template_id: int,
    body: QueryTemplateUpdate,
    svc: TemplateService = Depends(_get_template_service),
):
    """Admin: update or disable an existing WQL template (Use Case 7 — §3.2.7)."""
    # Only fields the client actually sent are applied.
    updated = svc.update_template(template_id, body.model_dump(exclude_none=True))
    if updated is None:
        raise HTTPException(status_code=404, detail=f"Template {template_id} not found.")
    return QueryTemplateOut(**updated)
"""
POST /rules/analyze — Admin: analyze a rule, persist results (Use Cases 4+5+6 — §3.2.7).
GET  /results/{rule_id} — SOC Analyst: retrieve stored mappings (Use Case 1 — §3.2.7).
"""

from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy.orm import Session

from app.db.session import get_db
from app.ml.pipeline import is_ready
from app.schemas.result import MappingResult, ResultsResponse
from app.schemas.rule import AnalyzeRequest, AnalyzeResponse, TechniqueResult
from app.services.ml_service import MLService
from app.services.result_service import ResultService
from app.services.rule_service import RuleService

router = APIRouter(tags=["rules"])


def _get_rule_service(db: Session = Depends(get_db)) -> RuleService:
    """Request-scoped RuleService with a fresh MLService facade."""
    return RuleService(db=db, ml=MLService())


def _get_result_service(db: Session = Depends(get_db)) -> ResultService:
    """Request-scoped ResultService."""
    return ResultService(db=db)


@router.post("/rules/analyze", response_model=AnalyzeResponse, status_code=201)
def analyze_rule(
    body: AnalyzeRequest,
    svc: RuleService = Depends(_get_rule_service),
):
    """Run the full ML pipeline on the submitted Wazuh rule XML and store the results."""
    if not is_ready():
        raise HTTPException(status_code=503, detail="ML pipeline not ready.")

    # Map pipeline failures onto HTTP semantics:
    # bad input -> 422, missing models -> 503, anything else -> 500.
    try:
        result = svc.analyze_and_persist(body.rule_xml)
    except ValueError as exc:
        raise HTTPException(status_code=422, detail=str(exc)) from exc
    except RuntimeError as exc:
        raise HTTPException(status_code=503, detail=str(exc)) from exc
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    all_results = [TechniqueResult(**row) for row in result["results"]]

    return AnalyzeResponse(
        rule_id=result["rule_id"],
        sanitized_xml=result["sanitized_xml"],
        summary=result["summary"],
        text_for_embedding=result["text_for_embedding"],
        embedding_dim=result["embedding_dim"],
        pipeline_mode=result.get("pipeline_mode", "full"),
        detected=[t for t in all_results if t.predicted],
        all_results=all_results,
    )


@router.get("/results/{rule_id}", response_model=ResultsResponse)
def get_results(
    rule_id: str,
    svc: ResultService = Depends(_get_result_service),
):
    """Stored MITRE ATT&CK techniques for a rule, sorted by confidence (Use Case 1 — §3.2.7).

    * mappings: ALL techniques, confidence-descending (Top-5 chart, Figure 4-11).
    * detected: primary + secondary (≥0.5) only (WQL queries view, Figure 4-12).
    """
    data = svc.get_results_for_rule(rule_id)
    if data is None:
        raise HTTPException(
            status_code=404,
            detail=f"No mapping results found for rule_id '{rule_id}'. "
            "Run POST /rules/analyze first.",
        )
    return ResultsResponse(
        rule_id=rule_id,
        mappings=[MappingResult(**m) for m in data["mappings"]],
        detected=[MappingResult(**m) for m in data["detected"]],
    )
" + "Run POST /rules/analyze first.", + ) + return ResultsResponse( + rule_id=rule_id, + mappings=[MappingResult(**m) for m in data["mappings"]], + detected=[MappingResult(**m) for m in data["detected"]], + ) diff --git a/murshid_backend/app/api/routes/stats.py b/murshid_backend/app/api/routes/stats.py new file mode 100644 index 0000000000000000000000000000000000000000..a9a68f477e1cfac68d95f0ca7294a728fa745698 --- /dev/null +++ b/murshid_backend/app/api/routes/stats.py @@ -0,0 +1,43 @@ +"""GET /api/stats โ€” dashboard KPIs.""" + +from fastapi import APIRouter, Depends +from sqlalchemy import func +from sqlalchemy.orm import Session + +from app.db.session import get_db +from app.models.rule import Rule +from app.models.rule_technique_mapping import RuleTechniqueMapping +from app.models.query_template import QueryTemplate +from app.models.technique import Technique + +router = APIRouter(prefix="/api", tags=["stats"]) + + +@router.get("/stats") +def get_stats(db: Session = Depends(get_db)): + total_rules = db.query(func.count(Rule.rule_id)).scalar() or 0 + total_mappings = db.query(func.count(RuleTechniqueMapping.mapping_id)).scalar() or 0 + total_queries = db.query(func.count(QueryTemplate.template_id)).filter(QueryTemplate.is_active.is_(True)).scalar() or 0 + total_techniques = db.query(func.count(Technique.technique_id)).scalar() or 0 + + technique_freq = ( + db.query( + RuleTechniqueMapping.technique_id, + func.count(RuleTechniqueMapping.mapping_id).label("count"), + ) + .group_by(RuleTechniqueMapping.technique_id) + .order_by(func.count(RuleTechniqueMapping.mapping_id).desc()) + .limit(10) + .all() + ) + + return { + "total_rules_mapped": total_rules, + "total_techniques": total_techniques, + "total_mappings": total_mappings, + "total_queries": total_queries, + "technique_frequency": [ + {"technique_id": t.technique_id, "count": t.count} + for t in technique_freq + ], + } diff --git a/murshid_backend/app/config.py b/murshid_backend/app/config.py new file mode 
from pathlib import Path

from pydantic_settings import BaseSettings, SettingsConfigDict

# Project root: three levels above this file (…/murshid_backend/app/config.py).
_GP_ROOT = Path(__file__).resolve().parent.parent.parent


class Settings(BaseSettings):
    """Environment-driven configuration (reads .env; unknown keys are ignored)."""

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # --- infrastructure ---
    murshid_db_url: str = "mysql+pymysql://root:password@localhost:3306/murshid_db"
    murshid_models_dir: Path = _GP_ROOT / "Needed"
    hf_token: str | None = None
    murshid_skip_llm: bool = False
    secret_key: str = "change_me"  # NOTE(review): must be overridden via env in production

    # --- model identifiers ---
    llama_model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct"
    embed_model_id: str = "ehsanaghaei/SecureBERT_Plus"

    # --- LogReg artifact filenames (resolved inside murshid_models_dir) ---
    logreg_joblib: str = "murshid_logreg_pipeline_manual_oof_pcatuned.joblib"
    logreg_thresholds_npy: str = "murshid_logreg_thresholds_manual_oof_pcatuned.npy"
    label_columns_json: str = "murshid_label_columns.json"


settings = Settings()


# --- murshid_backend/app/db/base.py ---
from sqlalchemy.orm import DeclarativeBase


class Base(DeclarativeBase):
    """Declarative base shared by all ORM models."""
from collections.abc import Generator

from sqlalchemy import create_engine
from sqlalchemy.orm import Session, sessionmaker

from app.config import settings

_is_sqlite = settings.murshid_db_url.startswith("sqlite")

# SQLite needs check_same_thread disabled (FastAPI may use the session from
# worker threads); server databases instead get pre-ping + hourly recycling.
if _is_sqlite:
    _engine_kwargs = {
        "connect_args": {"check_same_thread": False},
        "pool_pre_ping": False,
        "pool_recycle": -1,
    }
else:
    _engine_kwargs = {
        "connect_args": {},
        "pool_pre_ping": True,
        "pool_recycle": 3600,
    }

engine = create_engine(settings.murshid_db_url, **_engine_kwargs)

SessionLocal = sessionmaker(bind=engine, autocommit=False, autoflush=False)


def get_db() -> Generator[Session, None, None]:
    """FastAPI dependency: yield one session per request, always close it."""
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
_FRONTEND_DIR = Path(__file__).resolve().parent.parent.parent / "murshid_frontend"


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load ML models at startup; release them on shutdown."""
    load_models()
    yield
    unload_models()


app = FastAPI(
    title="Murshid API",
    description=(
        "MITRE ATT&CK-Aligned Techniques Mapping for SOC Analysts. "
        "Transforms Wazuh IDS rules into actionable threat intelligence."
    ),
    version="1.0.0",
    lifespan=lifespan,
)

# NOTE(review): wildcard origins together with allow_credentials=True is
# rejected by browsers for credentialed requests; if cookie/credential auth
# is ever needed, list explicit origins instead — confirm deployment needs.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Registration order matters only relative to the static mount below.
for _api_router in (
    health.router,
    stats.router,
    db_viewer.router,
    rules.router,
    queries.router,
):
    app.include_router(_api_router)

# Static frontend served from "/" — mounted last so API routes win.
if _FRONTEND_DIR.is_dir():
    app.mount("/", StaticFiles(directory=str(_FRONTEND_DIR), html=True), name="frontend")
").strip() + + +def build_text_for_embedding(clean_rule: str, summary: str) -> str: + """Combine LLM summary with rule description โ€” cell 12 of notebook.""" + rule_elem = etree.fromstring(clean_rule.strip()) + raw_desc = rule_elem.findtext("description") or "" + description = _norm_spaces(raw_desc) + summary = _norm_spaces(summary) + description = _norm_spaces(description) + + if not summary and not description: + return "" + if summary and not description: + return summary + if description and not summary: + return description + + s0 = _strip_end_punct(summary).lower() + d0 = _strip_end_punct(description).lower() + + if s0 == d0: + return _strip_end_punct(summary) + "." + return f"{_strip_end_punct(summary)}. {_strip_end_punct(description)}." + + +class SecureBERTEmbedder: + """Mean-pooling embedder using ehsanaghaei/SecureBERT_Plus โ€” cell 15.""" + + MAX_LEN = 512 + BATCH_CHUNKS = 8 + + def __init__(self, model_id: str | None = None, device: str | None = None): + if not _TORCH_OK: + raise RuntimeError("torch/transformers not available โ€” SecureBERTEmbedder cannot be initialised.") + mid = model_id or settings.embed_model_id + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + self.tokenizer = AutoTokenizer.from_pretrained(mid, use_fast=True) + self.model = AutoModel.from_pretrained(mid).to(self.device) + self.model.eval() + self.cls_id = self.tokenizer.cls_token_id + self.sep_id = self.tokenizer.sep_token_id + self.pad_id = ( + self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id is not None + else self.sep_id + ) + + def _chunk_text(self, text: str) -> list[list[int]]: + token_ids = self.tokenizer.encode(text, add_special_tokens=False) + chunk_size = self.MAX_LEN - 2 + chunks = [] + for i in range(0, len(token_ids), chunk_size): + piece = token_ids[i : i + chunk_size] + chunks.append([self.cls_id] + piece + [self.sep_id]) + return chunks + + 
def embed_text(self, text: str) -> np.ndarray: + chunks = self._chunk_text(text) + all_embs: list[np.ndarray] = [] + + for i in range(0, len(chunks), self.BATCH_CHUNKS): + batch = chunks[i : i + self.BATCH_CHUNKS] + max_len = max(len(x) for x in batch) + input_ids, masks = [], [] + for x in batch: + pad = max_len - len(x) + input_ids.append(x + [self.pad_id] * pad) + masks.append([1] * len(x) + [0] * pad) + + ids_t = torch.tensor(input_ids).to(self.device) + mask_t = torch.tensor(masks).to(self.device) + + with torch.no_grad(): + out = self.model(input_ids=ids_t, attention_mask=mask_t) + tok_emb = out.last_hidden_state + mask_exp = mask_t.unsqueeze(-1).expand(tok_emb.size()).float() + summed = torch.sum(tok_emb * mask_exp, dim=1) + denom = torch.clamp(mask_exp.sum(dim=1), min=1e-9) + mean_pooled = summed / denom + + all_embs.append(mean_pooled.cpu().numpy()) + + all_embs_np = np.vstack(all_embs) + para_emb = all_embs_np.mean(axis=0) + para_emb /= np.linalg.norm(para_emb) + 1e-12 + return para_emb.astype(np.float32) diff --git a/murshid_backend/app/ml/logistic_model.py b/murshid_backend/app/ml/logistic_model.py new file mode 100644 index 0000000000000000000000000000000000000000..fc366242bf8f7cb7ce3021365fcba7117376b9ef --- /dev/null +++ b/murshid_backend/app/ml/logistic_model.py @@ -0,0 +1,111 @@ +""" +Logistic Regression โ€” PRIMARY model per user decision. 
class LogisticRegressionModel:
    """Trained Logistic Regression pipeline plus per-label decision thresholds.

    Artifacts (notebook cell 18):
      * ``logreg_joblib``          — sklearn Pipeline (PCA-tuned + OneVsRest LogReg)
      * ``logreg_thresholds_npy``  — np.ndarray, shape (n_techniques,)
      * ``label_columns_json``     — technique-id column order
    """

    def __init__(self, models_dir: Path | None = None) -> None:
        base = Path(models_dir or settings.murshid_models_dir).resolve()

        artifact_paths = {
            "pipeline": base / settings.logreg_joblib,
            "thresholds": base / settings.logreg_thresholds_npy,
            "labels": base / settings.label_columns_json,
        }
        for path in artifact_paths.values():
            if not path.is_file():
                raise FileNotFoundError(f"Missing model file: {path}")

        # Notebook cell 18: load model + thresholds + label order.
        self._model = joblib.load(artifact_paths["pipeline"])
        self._thr = np.load(artifact_paths["thresholds"])
        with open(artifact_paths["labels"], encoding="utf-8") as f:
            self.technique_names: list[str] = json.load(f)

        n_labels = len(self.technique_names)
        if self._thr.shape[0] != n_labels:
            raise ValueError(
                f"LogReg thresholds length {self._thr.shape[0]} != {n_labels} labels"
            )

    # ------------------------------------------------------------------

    def predict(self, embedding_1d: np.ndarray) -> list[dict]:
        """Score one embedding exactly as in notebook cell 19.

        Returns per-technique dicts (technique_id, predicted,
        confidence_percent, proba, threshold, gap), predicted-first then
        confidence-descending.
        """
        X_user = embedding_1d.reshape(1, -1)

        # predict_proba shape differs across sklearn wrappers; flatten to (n_labels,).
        proba = self._model.predict_proba(X_user)
        if isinstance(proba, list):
            proba = np.column_stack([p[:, 1] for p in proba])
        elif proba.ndim == 3:
            proba = proba[:, :, 1]
        proba = proba.reshape(-1)

        predicted_mask = (proba >= self._thr).astype(int)

        rows = []
        for idx, name in enumerate(self.technique_names):
            rows.append(
                {
                    "technique_id": name,
                    "predicted": bool(predicted_mask[idx]),
                    "confidence_percent": round(float(proba[idx] * 100), 2),
                    "proba": round(float(proba[idx]), 4),
                    "threshold": round(float(self._thr[idx]), 4),
                    "gap": round(float(proba[idx] - self._thr[idx]), 4),
                }
            )

        # Predicted first, then by confidence desc (notebook sort logic).
        return sorted(
            rows,
            key=lambda r: (r["predicted"], r["confidence_percent"]),
            reverse=True,
        )
+ LITE : torch not installed โ†’ uses a trivial bag-of-words fake embedding (testing only) +""" + +from __future__ import annotations + +import xml.etree.ElementTree as ET +from dataclasses import dataclass +from typing import Any + +import numpy as np + +from app.config import settings +from app.ml.logistic_model import LogisticRegressionModel +from app.ml.sanitizer import sanitize_rule_from_string + +try: + import torch + from huggingface_hub import login as hf_login + from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + from app.ml.embedder import SecureBERTEmbedder, build_text_for_embedding + from app.ml.summarizer import summarize_one_rule + _TORCH_AVAILABLE = True + _TORCH_ERROR: str | None = None +except (ImportError, OSError) as _e: + _TORCH_AVAILABLE = False + _TORCH_ERROR = str(_e) + + +# --------------------------------------------------------------------------- +# Singleton container (loaded once at startup) +# --------------------------------------------------------------------------- + + +@dataclass +class _ModelStore: + llama_model: Any | None = None + llama_tokenizer: Any | None = None + llama_device: str = "cpu" + embedder: SecureBERTEmbedder | None = None + logreg: LogisticRegressionModel | None = None + ready: bool = False + + +_store = _ModelStore() + + +def load_models() -> None: + """ + Load all models into _store. + Call once at FastAPI startup (lifespan). 
+ """ + if _TORCH_AVAILABLE and settings.hf_token: + hf_login(token=settings.hf_token, add_to_git_credential=False) + + if not settings.murshid_skip_llm: + if not _TORCH_AVAILABLE: + print("[Murshid] WARNING: torch not installed โ€” skipping LLM load.") + else: + bnb_cfg = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + ) + tok = AutoTokenizer.from_pretrained(settings.llama_model_id, use_fast=True) + if tok.pad_token is None: + tok.pad_token = tok.eos_token + m = AutoModelForCausalLM.from_pretrained( + settings.llama_model_id, + quantization_config=bnb_cfg, + device_map="auto", + low_cpu_mem_usage=True, + dtype=torch.float16, + ) + m.config.pad_token_id = tok.pad_token_id + m.eval() + _store.llama_tokenizer = tok + _store.llama_model = m + _store.llama_device = "cuda" if torch.cuda.is_available() else "cpu" + + if _TORCH_AVAILABLE: + try: + _store.embedder = SecureBERTEmbedder() + except Exception as exc: + print(f"[Murshid] WARNING: SecureBERT+ not loaded โ€” {exc}") + _store.embedder = None + else: + print("[Murshid] WARNING: torch not installed โ€” embedder skipped.") + _store.embedder = None + + try: + _store.logreg = LogisticRegressionModel() + except FileNotFoundError as exc: + print(f"[Murshid] WARNING: LogReg model files missing โ€” {exc}") + _store.logreg = None + except Exception as exc: + print(f"[Murshid] WARNING: LogReg not loaded โ€” {exc}") + _store.logreg = None + + _store.ready = True + + +def unload_models() -> None: + _store.llama_model = None + _store.llama_tokenizer = None + _store.embedder = None + _store.logreg = None + _store.ready = False + + +def is_ready() -> bool: + return _store.ready + + +# --------------------------------------------------------------------------- +# Public function +# --------------------------------------------------------------------------- + + +def _extract_description(clean_xml: str) -> str: + """Extract text from 
sanitized rule XML.""" + try: + elem = ET.fromstring(clean_xml.strip()) + desc = elem.findtext("description") or "" + return " ".join(desc.split()).strip() + except ET.ParseError: + return "" + + +def analyze_rule(rule_xml: str) -> dict: + """ + Full pipeline: XML โ†’ sanitize โ†’ summarize โ†’ embed โ†’ LogReg โ†’ ranked results. + + Operates in three modes depending on environment: + + FULL mode (MURSHID_SKIP_LLM=false, GPU available): + LLaMA generates a natural-language summary โ†’ SecureBERT+ embeds it โ†’ LogReg predicts. + + LOCAL mode (MURSHID_SKIP_LLM=true, torch installed): + Skips LLaMA. Uses the rule's field directly as the text. + SecureBERT+ still embeds it properly โ†’ LogReg predicts. + โš ๏ธ Accuracy slightly lower than FULL mode (no LLaMA enrichment). + + LITE mode (torch not installed): + Uses a random unit-vector as a placeholder embedding. + Results are meaningless โ€” for structural testing only. + + Returns: + { + "sanitized_xml": str, + "summary": str, # LLaMA output OR description OR "(lite mode)" + "text_for_embedding": str, + "embedding_dim": int, + "pipeline_mode": str, # "full" | "local" | "lite" + "results": [...], # all techniques sorted by confidence desc + "detected": [...], # predicted == True only + } + """ + if not _store.ready: + raise RuntimeError("Models not loaded. Call load_models() first.") + + if "" not in rule_xml: + raise ValueError("Incomplete XML: must contain and .") + + if _store.logreg is None: + raise RuntimeError( + "LogReg model not loaded. " + "Copy the .joblib and .npy files to MURSHID_MODELS_DIR and restart." 
+ ) + + clean_xml = sanitize_rule_from_string(rule_xml) + + # โ”€โ”€ Choose mode โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + if _store.llama_model is not None and _store.llama_tokenizer is not None: + # FULL mode: LLaMA summary + mode = "full" + summary = summarize_one_rule( + clean_xml, + _store.llama_model, + _store.llama_tokenizer, + _store.llama_device, + ) + text = build_text_for_embedding(clean_xml, summary) + embedding: np.ndarray = _store.embedder.embed_text(text) + + elif _store.embedder is not None: + # LOCAL mode: no LLaMA, use as text + mode = "local" + desc = _extract_description(clean_xml) + summary = desc or "No description available." + text = desc or clean_xml[:300] + embedding = _store.embedder.embed_text(text) + + else: + # LITE mode: torch not available, random unit-vector (structural test only) + mode = "lite" + desc = _extract_description(clean_xml) + summary = f"(lite mode โ€” no embedder) {desc}" + text = desc or clean_xml[:300] + dim = 768 + raw = np.random.default_rng(abs(hash(text)) % (2**32)).random(dim).astype(np.float32) + embedding = raw / (np.linalg.norm(raw) + 1e-12) + + # โ”€โ”€ Classify โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + all_results = _store.logreg.predict(embedding) + detected = [r for r in all_results if r["predicted"]] + + return { + "sanitized_xml": clean_xml, + "summary": summary, + "text_for_embedding": text, + "embedding_dim": int(embedding.shape[0]), + "pipeline_mode": mode, + "results": all_results, + "detected": detected, + } diff --git a/murshid_backend/app/ml/sanitizer.py b/murshid_backend/app/ml/sanitizer.py new file mode 100644 index 0000000000000000000000000000000000000000..a039fc45c2f90b171037db3046796d26a483c5d7 --- 
/dev/null +++ b/murshid_backend/app/ml/sanitizer.py @@ -0,0 +1,32 @@ +""" +Rule sanitizer โ€” extracted from MurshidUIPipeline.ipynb (cell 10). +Removes: mitre, if_sid, group, if_group tags from Wazuh XML rule. +Original file is NOT modified. +""" + +from __future__ import annotations + +import copy +import xml.etree.ElementTree as ET + +REMOVE_TAGS_ANYWHERE: set[str] = {"mitre", "if_sid", "group", "if_group"} + + +def _remove_tag_anywhere(root_elem: ET.Element, tag: str) -> None: + for parent in list(root_elem.iter()): + for child in list(parent): + if child.tag == tag: + parent.remove(child) + + +def sanitize_rule(rule_elem: ET.Element) -> ET.Element: + r = copy.deepcopy(rule_elem) + for tag in REMOVE_TAGS_ANYWHERE: + _remove_tag_anywhere(r, tag) + return r + + +def sanitize_rule_from_string(rule_xml: str) -> str: + rule_elem = ET.fromstring(rule_xml.strip()) + sanitized = sanitize_rule(rule_elem) + return ET.tostring(sanitized, encoding="unicode") diff --git a/murshid_backend/app/ml/summarizer.py b/murshid_backend/app/ml/summarizer.py new file mode 100644 index 0000000000000000000000000000000000000000..b5171fa955d02b36a07c01dbc37a37d8ee9b49f9 --- /dev/null +++ b/murshid_backend/app/ml/summarizer.py @@ -0,0 +1,262 @@ +""" +LLM summarizer โ€” extracted from MurshidUIPipeline.ipynb (cells 11-12). +Converts sanitized Wazuh XML rule to a one-sentence behavior summary. +Original file is NOT modified. 
+""" + +from __future__ import annotations + +import json +import re +import unicodedata + +import torch + +# -------------------------------------------------------------------------- +# Constants (identical to notebook) +# -------------------------------------------------------------------------- +MAX_INPUT_TOKENS = 2048 +MAX_NEW_TOKENS = 160 +DO_SAMPLE = False +NUM_BEAMS = 4 +MAX_RETRIES = 3 + +SYSTEM_INSTR = ( + "You are a cybersecurity expert.\n" + "You will be provided with a Wazuh rule in XML format.\n" + "Write EXACTLY ONE sentence describing the observable event pattern the rule matches.\n\n" + "HARD CONSTRAINTS:\n" + '1) Output must be minified JSON only: {"summary":"..."}\n' + "2) ONE sentence only.\n" + "3) Start with one of: Detects, Monitors, Identifies, Flags, Reports, Tracks, Captures.\n" + "4) Use ONLY facts present in the XML. Describe the observable system event only.\n" + "5) Do NOT infer attacker intent, attack type, or technique.\n" + "6) Do NOT mention MITRE, ATT&CK, or attack technique names unless explicitly present in the XML.\n" + "7) Do NOT use speculative language: likely, potentially, possible, possibly, may indicate, or could indicate.\n" + "8) Length: 7 to 18 words.\n" + "9) SHOULD include a clear event type when possible.\n" + "10) Mention at least ONE concrete indicator if available (event_id, process name, file path,\n" + " registry key, service, protocol/port, URL pattern, command, username, IP).\n" + "If only a single indicator exists, still produce a complete behavior-focused sentence.\n" +) + +REPAIR_HINT = ( + "Your previous output was rejected.\n" + "Fix it to satisfy ALL constraints:\n" + '- Output MUST be minified JSON only: {"summary":"..."}\n' + "- One sentence only.\n" + "- Keep it behavior-focused.\n" + "- Include at least ONE concrete indicator if present in the XML.\n" + "- Do NOT add any extra text outside JSON.\n" +) + +VERB_OK = ("Detects", "Monitors", "Identifies", "Flags", "Reports", "Tracks", "Captures") 
+JSON_OBJ_RE = re.compile(r"\{.*?\}", re.DOTALL) +BAD_INTRO_RE = re.compile( + r"^\s*(this\s+(wazuh\s+)?rule|the\s+rule|this\s+alert)\b", re.IGNORECASE +) +BAD_INTENT_RE = re.compile(r"\b(likely|potentially|possible|maybe)\b", re.IGNORECASE) +GENERIC_RE = re.compile( + r"\b(detects activity|detects suspicious activity|detects potentially suspicious activity|" + r"monitors activity|reports activity|detects an event pattern defined by the rule indicators)\b", + re.IGNORECASE, +) + + +# -------------------------------------------------------------------------- +# Helpers (identical to notebook) +# -------------------------------------------------------------------------- + +def _build_prompt(rule_xml: str, tokenizer, extra_hint: str = "") -> str: + sys = SYSTEM_INSTR + (("\n" + extra_hint) if extra_hint else "") + user = f"Wazuh rule XML:\n{rule_xml}\n\nReturn JSON only:" + messages = [{"role": "system", "content": sys}, {"role": "user", "content": user}] + return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + + +def _looks_broken_encoding(s: str) -> bool: + return any(m in s for m in ("รƒ", "ร", "ร‘", "รข", "รข")) if s else False + + +def _try_extract_json_summary(text: str) -> str | None: + t = (text or "").strip() + if not t: + return None + if t.startswith("{") and '"summary"' in t: + try: + obj = json.loads(t) + if isinstance(obj, dict) and isinstance(obj.get("summary"), str): + return obj["summary"].strip() + except Exception: + pass + m = JSON_OBJ_RE.search(t) + if m and '"summary"' in m.group(0): + blob = m.group(0) + try: + obj = json.loads(blob) + if isinstance(obj, dict) and isinstance(obj.get("summary"), str): + return obj["summary"].strip() + except Exception: + m2 = re.search(r'"summary"\s*:\s*"([^"]+)"', blob) + if m2: + return m2.group(1).strip() + return None + + +def _normalize_one_sentence(s: str) -> str: + s = re.sub(r"\s+", " ", (s or "").strip()).strip() + s = unicodedata.normalize("NFKC", s) + if not s: + 
return "" + if BAD_INTRO_RE.match(s): + s = BAD_INTRO_RE.sub("", s).lstrip(":,- ").strip() + if not s: + return "" + if not any(s.startswith(v) for v in VERB_OK): + s = "Detects " + (s[0].lower() + s[1:]) if len(s) > 1 else "" + if not s: + return "" + m = re.search(r"[.!?](?:\s|$)", s) + s = s[: m.end()].strip() if m else s + "." + s = re.sub(r"^(Detects\s+)+", "Detects ", s).strip() + return re.sub(r"\s+", " ", s).strip() + + +def _looks_truncated(s: str) -> bool: + return not s or s.strip().endswith(("(", ":", " -", ",")) + + +def _has_behavior_signal(s: str) -> bool: + kws = ["create","delete","execute","spawn","launch","login","logon","authentication", + "connect","request","query","modify","registry","process","command","file", + "service","ip","url","dns","http","vpn","account"] + return any(k in s.lower() for k in kws) + + +def _has_indicator_signal(s: str) -> bool: + kws = [".exe",".dll",".ps1",".bat",".cmd","powershell","cmd.exe","reg.exe","rundll32", + "svchost","registry","temp","system32","event_id","http","dns","ip","url","port","key"] + return any(k in s.lower() for k in kws) + + +def _is_bad(s: str) -> bool: + if not s or BAD_INTRO_RE.match(s) or BAD_INTENT_RE.search(s) or GENERIC_RE.search(s): + return True + if _looks_broken_encoding(s) or _looks_truncated(s): + return True + wc = len(s.split()) + if wc < 7 or wc > 18 or not _has_behavior_signal(s): + return True + return bool((s.startswith("{") and "summary" in s) or ('"summary"' in s and "{" in s)) + + +def _is_catastrophic(s: str) -> bool: + return not s or _looks_broken_encoding(s) or _looks_truncated(s) or len(s.split()) < 3 + + +def _score(s: str) -> int: + wc = len(s.split()) + return ( + (3 if 7 <= wc <= 18 else 0) + + (3 if _has_behavior_signal(s) else 0) + + (2 if _has_indicator_signal(s) else 0) + + (1 if not GENERIC_RE.search(s) else 0) + + (1 if not BAD_INTENT_RE.search(s) else 0) + ) + + +def _rescue_finalize(s: str) -> str: + s = _normalize_one_sentence(s) + if not s: + return 
"Detects rule-matched behavior." + s = re.sub(r",\s*(possibly|potentially|maybe|may)\b.*$", "", s, flags=re.IGNORECASE).strip() + s = re.sub(r"\b(possibly|potentially|maybe|may)\b", "", s, flags=re.IGNORECASE) + s = re.sub(r"\s+", " ", s).strip() + if len(s.split()) < 7: + low = s.lower() + for kw, rep in [ + ("powershell", "Detects powershell.exe process execution."), + ("cmd", "Detects cmd.exe process execution."), + ("reg", "Detects reg.exe process execution."), + ("svchost", "Detects svchost.exe process execution."), + ]: + if kw in low: + s = rep + break + else: + s = s.rstrip(".") + " matching rule indicators." + if _looks_truncated(s): + s = s.rstrip(".") + " matching rule indicators." + if not any(s.startswith(v) for v in VERB_OK): + s = "Detects " + s[0].lower() + s[1:] if len(s) > 1 else "Detects rule-matched behavior." + words = s.split() + if len(words) > 18: + s = " ".join(words[:18]).rstrip(".") + "." + return re.sub(r"\s+", " ", s if s.endswith(".") else s + ".").strip() + + +# -------------------------------------------------------------------------- +# Public API +# -------------------------------------------------------------------------- + +def summarize_one_rule(rule_xml: str, model, tokenizer, device: str | None = None) -> str: + """Generate a one-sentence summary for a sanitized Wazuh rule XML string.""" + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + + pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id + eos_id = tokenizer.eos_token_id or pad_id + + best: str | None = None + best_any: str | None = None + last_raw = "" + last_cleaned = "" + + for attempt in range(1, MAX_RETRIES + 1): + prompt = _build_prompt( + rule_xml, tokenizer, extra_hint=REPAIR_HINT if attempt >= 2 else "" + ) + inputs = tokenizer( + prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_TOKENS + ).to(device) + + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=MAX_NEW_TOKENS, + 
do_sample=DO_SAMPLE, + num_beams=NUM_BEAMS, + pad_token_id=pad_id, + eos_token_id=eos_id, + repetition_penalty=1.05, + no_repeat_ngram_size=3, + ) + + raw = tokenizer.decode( + outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True + ).strip() + last_raw = raw + + parsed = _try_extract_json_summary(raw) + if parsed is None: + continue + + cleaned = _normalize_one_sentence(parsed) + last_cleaned = cleaned + + if cleaned and not _is_catastrophic(cleaned): + if best_any is None or _score(cleaned) > _score(best_any): + best_any = cleaned + + if not _is_bad(cleaned): + best = cleaned + break + + if best is None: + if best_any and not _is_catastrophic(best_any): + best = best_any + else: + src = last_cleaned or _try_extract_json_summary(last_raw) or last_raw + best = _rescue_finalize(src) + + return best diff --git a/murshid_backend/app/ml/svm_model.py b/murshid_backend/app/ml/svm_model.py new file mode 100644 index 0000000000000000000000000000000000000000..ab39e3643dedbf4305d375bf4d6951bafc50a2cc --- /dev/null +++ b/murshid_backend/app/ml/svm_model.py @@ -0,0 +1,101 @@ +""" +SVM classifier โ€” PRIMARY model per the report (ยง3.1.3 + ยง4.1). + +Report quote: + "the Support Vector Machine (SVM) was adopted as the core classifier" + "classification using SVM to predict the associated MITRE ATT&CK techniques" + +Inference logic (verbatim from MurshidUIPipeline.ipynb cell 16+19): + scores = svm_model.named_steps["clf"].decision_function( + svm_model.named_steps["pca"].transform(X_user) + ).reshape(-1) + pred = (scores >= thr_per_label).astype(int) + margins = scores - thr_per_label + conf = sigmoid(margins) * 100 + +Original notebook file is NOT modified. 
+""" + +from __future__ import annotations + +import json +from pathlib import Path + +import joblib +import numpy as np + +from app.config import settings + + +def _sigmoid(x: np.ndarray) -> np.ndarray: + """Probability calibration: sigmoid(margin) โ€” notebook cell 17.""" + x = np.clip(x, -30, 30) + return 1.0 / (1.0 + np.exp(-x)) + + +class SVMModel: + """ + Wraps the trained LinearSVC pipeline with per-label thresholds. + Structure of the .joblib pack (from notebook): + svm_pack["model"] โ†’ sklearn Pipeline (PCA + LinearSVC) + svm_pack["thresholds_per_label"] โ†’ np.ndarray shape (n_techniques,) + """ + + def __init__(self, models_dir: Path | None = None) -> None: + base = Path(models_dir or settings.murshid_models_dir).resolve() + + svm_path = base / settings.svm_joblib + labels_path = base / settings.label_columns_json + + for p in (svm_path, labels_path): + if not p.is_file(): + raise FileNotFoundError(f"Missing model file: {p}") + + svm_pack = joblib.load(svm_path) + self._model = svm_pack["model"] # Pipeline(PCA โ†’ LinearSVC) + self._thresholds = np.asarray( + svm_pack["thresholds_per_label"], dtype=np.float64 + ) + + with open(labels_path, encoding="utf-8") as f: + self.technique_names: list[str] = json.load(f) + + n = len(self.technique_names) + if self._thresholds.shape[0] != n: + raise ValueError( + f"SVM thresholds length {self._thresholds.shape[0]} != {n} labels" + ) + + # ------------------------------------------------------------------ + + def predict(self, embedding_1d: np.ndarray) -> list[dict]: + """ + Run SVM inference exactly as in the notebook. 
+ + Returns list of dicts sorted by confidence_percent desc: + technique_id, predicted, confidence_percent, score, threshold, margin + """ + X = embedding_1d.reshape(1, -1) + + # Apply PCA then LinearSVC decision function (notebook cell 19) + scores = self._model.named_steps["clf"].decision_function( + self._model.named_steps["pca"].transform(X) + ).reshape(-1) + + pred = (scores >= self._thresholds).astype(int) + margins = scores - self._thresholds + conf = _sigmoid(margins) * 100 # calibrated confidence (%) + + results = [ + { + "technique_id": self.technique_names[i], + "predicted": bool(pred[i]), + "confidence_percent": round(float(conf[i]), 2), + "score": round(float(scores[i]), 4), + "threshold": round(float(self._thresholds[i]), 4), + "margin": round(float(margins[i]), 4), + } + for i in range(len(self.technique_names)) + ] + + return sorted(results, key=lambda r: r["confidence_percent"], reverse=True) diff --git a/murshid_backend/app/models/__init__.py b/murshid_backend/app/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..341bc901a88c9e72bdaffe20b15d28e8b345e170 --- /dev/null +++ b/murshid_backend/app/models/__init__.py @@ -0,0 +1,16 @@ +"""SQLAlchemy ORM models (tables defined exactly per ER Diagram ยง3.2.6 of the report).""" +from app.models.user import User +from app.models.mapping_job import MappingJob +from app.models.rule import Rule +from app.models.technique import Technique +from app.models.rule_technique_mapping import RuleTechniqueMapping +from app.models.query_template import QueryTemplate + +__all__ = [ + "User", + "MappingJob", + "Rule", + "Technique", + "RuleTechniqueMapping", + "QueryTemplate", +] diff --git a/murshid_backend/app/models/mapping_job.py b/murshid_backend/app/models/mapping_job.py new file mode 100644 index 0000000000000000000000000000000000000000..c7891559075c9d5fc06da2ce4ae59d8121f968df --- /dev/null +++ b/murshid_backend/app/models/mapping_job.py @@ -0,0 +1,40 @@ +""" +MappingJob entity 
โ€” ER Diagram ยง3.2.6 +Attributes: job_ID, file_name, timestamp, rules_count, status, progress +Linked to User via "uploads" relationship. +Also visible in Figure 4-14 (Mapping Progress Table). +""" + +import enum +from datetime import datetime + +from sqlalchemy import DateTime, Enum, ForeignKey, Integer, String, func +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.db.base import Base + + +class JobStatus(str, enum.Enum): + pending = "pending" + running = "running" + done = "done" + failed = "failed" + + +class MappingJob(Base): + __tablename__ = "mapping_jobs" + + job_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) + user_id: Mapped[int] = mapped_column(ForeignKey("users.user_id"), nullable=False) + file_name: Mapped[str] = mapped_column(String(255), nullable=False) + rules_count: Mapped[int] = mapped_column(Integer, default=0) + status: Mapped[JobStatus] = mapped_column( + Enum(JobStatus), nullable=False, default=JobStatus.pending + ) + progress: Mapped[int] = mapped_column(Integer, default=0) + timestamp: Mapped[datetime] = mapped_column( + DateTime, nullable=False, server_default=func.now() + ) + + user: Mapped["User"] = relationship(back_populates="jobs") + rules: Mapped[list["Rule"]] = relationship(back_populates="job") diff --git a/murshid_backend/app/models/query_template.py b/murshid_backend/app/models/query_template.py new file mode 100644 index 0000000000000000000000000000000000000000..ef79528c949e49ba0de246c655dd3e3d30293e1d --- /dev/null +++ b/murshid_backend/app/models/query_template.py @@ -0,0 +1,27 @@ +""" +QueryTemplate entity โ€” ER Diagram ยง3.2.6 +Attributes: Template_ID, Purpose, wql_query, Note +Linked to Technique. Admin can add/update/disable (Use Case 7, ยง3.2.7). 
+""" + +from sqlalchemy import Boolean, ForeignKey, String, Text +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.db.base import Base + + +class QueryTemplate(Base): + __tablename__ = "query_templates" + + template_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) + technique_id: Mapped[str] = mapped_column( + String(20), ForeignKey("techniques.technique_id"), nullable=False + ) + purpose: Mapped[str | None] = mapped_column(String(255), nullable=True) + # WQL with placeholders: ${HOST}, ${USER}, ${IP} + wql_query: Mapped[str] = mapped_column(Text, nullable=False) + note: Mapped[str | None] = mapped_column(Text, nullable=True) + # Admin can disable without deleting โ€” Use Case 7 + is_active: Mapped[bool] = mapped_column(Boolean, default=True, nullable=False) + + technique: Mapped["Technique"] = relationship(back_populates="query_templates") diff --git a/murshid_backend/app/models/rule.py b/murshid_backend/app/models/rule.py new file mode 100644 index 0000000000000000000000000000000000000000..378fe3096b9071a6d16551b453ed4722f3c84675 --- /dev/null +++ b/murshid_backend/app/models/rule.py @@ -0,0 +1,27 @@ +""" +Rule entity โ€” ER Diagram ยง3.2.6 +Attributes: Rule_ID, embedding_vector, job_ID (FK) +Rule_ID is the Wazuh rule ID string (e.g. "597"). +""" + +from sqlalchemy import ForeignKey, String, Text +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.db.base import Base + + +class Rule(Base): + __tablename__ = "rules" + + rule_id: Mapped[str] = mapped_column(String(50), primary_key=True) + job_id: Mapped[int | None] = mapped_column( + ForeignKey("mapping_jobs.job_id"), nullable=True + ) + # 768-dimensional float vector stored as JSON string; kept nullable for + # rules where only the mapping result is persisted without the vector. 
+ embedding_vector: Mapped[str | None] = mapped_column(Text, nullable=True) + + job: Mapped["MappingJob | None"] = relationship(back_populates="rules") + technique_mappings: Mapped[list["RuleTechniqueMapping"]] = relationship( + back_populates="rule", cascade="all, delete-orphan" + ) diff --git a/murshid_backend/app/models/rule_technique_mapping.py b/murshid_backend/app/models/rule_technique_mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..351c9559d26c7d0b8f4dedc53f174a48d6e00614 --- /dev/null +++ b/murshid_backend/app/models/rule_technique_mapping.py @@ -0,0 +1,31 @@ +""" +RuleTechniqueMapping associative entity โ€” ER Diagram ยง3.2.6 +Attributes: Mapping_ID, Rule_ID (FK), Technique_ID (FK), confidence_score +Index on rule_id for fast lookup โ€” mentioned explicitly in Use Case 6 (ยง3.2.7). +""" + +from sqlalchemy import Float, ForeignKey, Index, Integer, String +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.db.base import Base + + +class RuleTechniqueMapping(Base): + __tablename__ = "rule_technique_mappings" + + mapping_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) + rule_id: Mapped[str] = mapped_column( + String(50), ForeignKey("rules.rule_id"), nullable=False + ) + technique_id: Mapped[str] = mapped_column( + String(20), ForeignKey("techniques.technique_id"), nullable=False + ) + confidence_score: Mapped[float] = mapped_column(Float, nullable=False) + + rule: Mapped["Rule"] = relationship(back_populates="technique_mappings") + technique: Mapped["Technique"] = relationship(back_populates="rule_mappings") + + __table_args__ = ( + # "creates an index on rule_id for efficient lookup" โ€” Use Case 6 + Index("ix_rule_technique_rule_id", "rule_id"), + ) diff --git a/murshid_backend/app/models/technique.py b/murshid_backend/app/models/technique.py new file mode 100644 index 0000000000000000000000000000000000000000..42fe7d76290619aa103e7448c4b1f70f7ceed865 --- /dev/null +++ 
b/murshid_backend/app/models/technique.py @@ -0,0 +1,24 @@ +""" +Technique entity โ€” ER Diagram ยง3.2.6 +Attributes: Technique_ID, technique_name, tactic +""" + +from sqlalchemy import String +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.db.base import Base + + +class Technique(Base): + __tablename__ = "techniques" + + technique_id: Mapped[str] = mapped_column(String(20), primary_key=True) + technique_name: Mapped[str] = mapped_column(String(255), nullable=False) + tactic: Mapped[str | None] = mapped_column(String(100), nullable=True) + + rule_mappings: Mapped[list["RuleTechniqueMapping"]] = relationship( + back_populates="technique" + ) + query_templates: Mapped[list["QueryTemplate"]] = relationship( + back_populates="technique" + ) diff --git a/murshid_backend/app/models/user.py b/murshid_backend/app/models/user.py new file mode 100644 index 0000000000000000000000000000000000000000..1ac6e3a53d83d5dffe5e51e48ac46d1122bf5310 --- /dev/null +++ b/murshid_backend/app/models/user.py @@ -0,0 +1,30 @@ +""" +User entity โ€” ER Diagram ยง3.2.6 +Attributes: User_ID, username, email, password_hash, role +""" + +import enum + +from sqlalchemy import Enum, String +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.db.base import Base + + +class UserRole(str, enum.Enum): + admin = "admin" + analyst = "analyst" + + +class User(Base): + __tablename__ = "users" + + user_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) + username: Mapped[str] = mapped_column(String(100), unique=True, nullable=False) + email: Mapped[str] = mapped_column(String(255), unique=True, nullable=False) + password_hash: Mapped[str] = mapped_column(String(255), nullable=False) + role: Mapped[UserRole] = mapped_column( + Enum(UserRole), nullable=False, default=UserRole.analyst + ) + + jobs: Mapped[list["MappingJob"]] = relationship(back_populates="user") diff --git a/murshid_backend/app/repositories/__init__.py 
b/murshid_backend/app/repositories/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4674f8c0fa126fc4ef11a6f766e140318af248ea --- /dev/null +++ b/murshid_backend/app/repositories/__init__.py @@ -0,0 +1 @@ +"""Repository layer โ€” thin DB access wrappers.""" diff --git a/murshid_backend/app/repositories/job_repo.py b/murshid_backend/app/repositories/job_repo.py new file mode 100644 index 0000000000000000000000000000000000000000..b5acdb40cc12d85c203f44e7e3ab8cd4d9f9805c --- /dev/null +++ b/murshid_backend/app/repositories/job_repo.py @@ -0,0 +1,44 @@ +"""CRUD for MappingJob table.""" + +from __future__ import annotations + +from datetime import datetime, timezone + +from sqlalchemy.orm import Session + +from app.models.mapping_job import JobStatus, MappingJob + + +def create_job(db: Session, *, user_id: int, file_name: str, rules_count: int = 0) -> MappingJob: + job = MappingJob( + user_id=user_id, + file_name=file_name, + rules_count=rules_count, + status=JobStatus.pending, + progress=0, + timestamp=datetime.now(tz=timezone.utc), + ) + db.add(job) + db.flush() + return job + + +def update_job_status( + db: Session, + job_id: int, + *, + status: JobStatus, + progress: int | None = None, +) -> MappingJob | None: + job = db.get(MappingJob, job_id) + if job is None: + return None + job.status = status + if progress is not None: + job.progress = progress + db.flush() + return job + + +def get_job(db: Session, job_id: int) -> MappingJob | None: + return db.get(MappingJob, job_id) diff --git a/murshid_backend/app/repositories/rule_repo.py b/murshid_backend/app/repositories/rule_repo.py new file mode 100644 index 0000000000000000000000000000000000000000..a542660280c881ef8dedbf2f91c2e40fe46558b2 --- /dev/null +++ b/murshid_backend/app/repositories/rule_repo.py @@ -0,0 +1,71 @@ +"""CRUD for Rule and RuleTechniqueMapping tables.""" + +from __future__ import annotations + +import json + +import numpy as np +from sqlalchemy.orm import Session + 
+from app.models.rule import Rule +from app.models.rule_technique_mapping import RuleTechniqueMapping + + +def upsert_rule( + db: Session, + *, + rule_id: str, + job_id: int | None = None, + embedding: np.ndarray | None = None, +) -> Rule: + rule = db.get(Rule, rule_id) + if rule is None: + rule = Rule(rule_id=rule_id) + db.add(rule) + if job_id is not None: + rule.job_id = job_id + if embedding is not None: + rule.embedding_vector = json.dumps(embedding.tolist()) + db.flush() + return rule + + +def save_technique_mappings( + db: Session, + *, + rule_id: str, + results: list[dict], +) -> list[RuleTechniqueMapping]: + """ + Persist ALL (rule_id, technique_id, confidence_score) rows sorted by confidence. + Deletes existing mappings first so re-runs are idempotent. + Saves ALL techniques (not just detected ones) so Figure 4-11 can show Top 5. + """ + db.query(RuleTechniqueMapping).filter( + RuleTechniqueMapping.rule_id == rule_id + ).delete(synchronize_session=False) + + sorted_results = sorted(results, key=lambda r: r["confidence_percent"], reverse=True) + + rows = [] + for r in sorted_results: + row = RuleTechniqueMapping( + rule_id=rule_id, + technique_id=r["technique_id"], + confidence_score=r["confidence_percent"] / 100.0, + ) + db.add(row) + rows.append(row) + db.flush() + return rows + + +def get_mappings_for_rule( + db: Session, rule_id: str +) -> list[RuleTechniqueMapping]: + return ( + db.query(RuleTechniqueMapping) + .filter(RuleTechniqueMapping.rule_id == rule_id) + .order_by(RuleTechniqueMapping.confidence_score.desc()) + .all() + ) diff --git a/murshid_backend/app/repositories/template_repo.py b/murshid_backend/app/repositories/template_repo.py new file mode 100644 index 0000000000000000000000000000000000000000..13c58d427945b93d57cbb495176fc9b69227b871 --- /dev/null +++ b/murshid_backend/app/repositories/template_repo.py @@ -0,0 +1,94 @@ +"""CRUD for Technique and QueryTemplate tables.""" + +from __future__ import annotations + +from sqlalchemy.orm 
import Session + +from app.models.query_template import QueryTemplate +from app.models.technique import Technique + + +# -------------------------------------------------------------------------- +# Techniques +# -------------------------------------------------------------------------- + + +def get_or_create_technique( + db: Session, *, technique_id: str, technique_name: str = "", tactic: str | None = None +) -> Technique: + t = db.get(Technique, technique_id) + if t is None: + t = Technique( + technique_id=technique_id, + technique_name=technique_name or technique_id, + tactic=tactic, + ) + db.add(t) + db.flush() + return t + + +def get_technique(db: Session, technique_id: str) -> Technique | None: + return db.get(Technique, technique_id) + + +# -------------------------------------------------------------------------- +# Query templates +# -------------------------------------------------------------------------- + + +def get_templates_for_technique( + db: Session, technique_id: str +) -> list[QueryTemplate]: + return ( + db.query(QueryTemplate) + .filter( + QueryTemplate.technique_id == technique_id, + QueryTemplate.is_active.is_(True), + ) + .all() + ) + + +def create_template( + db: Session, + *, + technique_id: str, + purpose: str | None, + wql_query: str, + note: str | None, +) -> QueryTemplate: + tpl = QueryTemplate( + technique_id=technique_id, + purpose=purpose, + wql_query=wql_query, + note=note, + is_active=True, + ) + db.add(tpl) + db.flush() + return tpl + + +def update_template( + db: Session, + template_id: int, + *, + purpose: str | None = None, + wql_query: str | None = None, + note: str | None = None, + is_active: bool | None = None, +) -> QueryTemplate | None: + tpl = db.get(QueryTemplate, template_id) + if tpl is None: + return None + if purpose is not None: + tpl.purpose = purpose + if wql_query is not None: + tpl.wql_query = wql_query + if note is not None: + tpl.note = note + if is_active is not None: + tpl.is_active = is_active + 
db.flush() + return tpl diff --git a/murshid_backend/app/schemas/__init__.py b/murshid_backend/app/schemas/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aa2a4f43ee5c7534f5249f2b0d979d0c279e386f --- /dev/null +++ b/murshid_backend/app/schemas/__init__.py @@ -0,0 +1 @@ +"""Pydantic schemas for API request/response validation.""" diff --git a/murshid_backend/app/schemas/query.py b/murshid_backend/app/schemas/query.py new file mode 100644 index 0000000000000000000000000000000000000000..a310b5a4b6fcdeecfd276e9f23a0ff06158ce089 --- /dev/null +++ b/murshid_backend/app/schemas/query.py @@ -0,0 +1,23 @@ +from pydantic import BaseModel + + +class QueryTemplateOut(BaseModel): + template_id: int + technique_id: str + purpose: str | None + wql_query: str + note: str | None + + +class QueryTemplateIn(BaseModel): + technique_id: str + purpose: str | None = None + wql_query: str + note: str | None = None + + +class QueryTemplateUpdate(BaseModel): + purpose: str | None = None + wql_query: str | None = None + note: str | None = None + is_active: bool | None = None diff --git a/murshid_backend/app/schemas/result.py b/murshid_backend/app/schemas/result.py new file mode 100644 index 0000000000000000000000000000000000000000..b00cf13a2f4eb321c170e213959f4ad7fe6884b0 --- /dev/null +++ b/murshid_backend/app/schemas/result.py @@ -0,0 +1,17 @@ +from pydantic import BaseModel + + +class MappingResult(BaseModel): + technique_id: str + confidence_score: float + confidence_percent: float + # primary = highest confidence; secondary = second if >=0.5; others = below threshold + rank: int + is_primary: bool = False + is_secondary: bool = False + + +class ResultsResponse(BaseModel): + rule_id: str + mappings: list[MappingResult] # all techniques sorted by confidence desc + detected: list[MappingResult] # primary + secondary (confidence >= 0.5) diff --git a/murshid_backend/app/schemas/rule.py b/murshid_backend/app/schemas/rule.py new file mode 100644 index 
0000000000000000000000000000000000000000..6455f018c39472c2464fb7cceb8b20bd92b7ee2b --- /dev/null +++ b/murshid_backend/app/schemas/rule.py @@ -0,0 +1,29 @@ +from pydantic import BaseModel, Field + + +class AnalyzeRequest(BaseModel): + rule_xml: str = Field( + ..., + min_length=10, + description="Full Wazuh rule XML including ...", + ) + + +class TechniqueResult(BaseModel): + technique_id: str + predicted: bool + confidence_percent: float + proba: float + threshold: float + gap: float + + +class AnalyzeResponse(BaseModel): + rule_id: str + sanitized_xml: str + summary: str + text_for_embedding: str + embedding_dim: int + pipeline_mode: str = "full" # "full" | "local" | "lite" + detected: list[TechniqueResult] + all_results: list[TechniqueResult] diff --git a/murshid_backend/app/services/__init__.py b/murshid_backend/app/services/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..48f9301cd497582af5d4433209a36c9059c52f48 --- /dev/null +++ b/murshid_backend/app/services/__init__.py @@ -0,0 +1 @@ +"""Service layer โ€” business logic between API and ML/repositories.""" diff --git a/murshid_backend/app/services/ml_service.py b/murshid_backend/app/services/ml_service.py new file mode 100644 index 0000000000000000000000000000000000000000..fe3ce81294617cf87ac439536440aff80e8c1720 --- /dev/null +++ b/murshid_backend/app/services/ml_service.py @@ -0,0 +1,16 @@ +""" +MLService โ€” thin wrapper that calls the ML pipeline and exposes +analyze_rule() for use by other services. +""" + +from __future__ import annotations + +from app.ml.pipeline import analyze_rule as _pipeline_analyze +from app.ml.pipeline import is_ready + + +class MLService: + def analyze(self, rule_xml: str) -> dict: + if not is_ready(): + raise RuntimeError("ML pipeline is not ready. 
Models still loading.") + return _pipeline_analyze(rule_xml) diff --git a/murshid_backend/app/services/result_service.py b/murshid_backend/app/services/result_service.py new file mode 100644 index 0000000000000000000000000000000000000000..063b8c9c36658f0d910d28623015df431a5c4068 --- /dev/null +++ b/murshid_backend/app/services/result_service.py @@ -0,0 +1,50 @@ +""" +ResultService โ€” fetches stored technique mappings for a given rule_id. +Use Case 1: "View the techniques and their scores associated with an alert" (ยง3.2.7). +""" + +from __future__ import annotations + +from sqlalchemy.orm import Session + +from app.repositories.rule_repo import get_mappings_for_rule + + +class ResultService: + def __init__(self, db: Session) -> None: + self._db = db + + SECONDARY_THRESHOLD = 0.50 # ยง3.2.3.2: secondary if confidence โ‰ฅ 0.5 + + def get_results_for_rule(self, rule_id: str) -> dict: + """ + Returns: + mappings: all techniques sorted by confidence desc (for Figure 4-11 Top 5) + detected: primary + secondary techniques only (for Figure 4-12 WQL queries) + """ + mappings = get_mappings_for_rule(self._db, rule_id) + if not mappings: + return None + + all_mappings = [] + detected = [] + + for i, m in enumerate(mappings): + conf_pct = round(m.confidence_score * 100, 2) + is_primary = (i == 0) + is_secondary = (i == 1 and m.confidence_score >= self.SECONDARY_THRESHOLD) + + row = { + "technique_id": m.technique_id, + "confidence_score": round(m.confidence_score, 4), + "confidence_percent": conf_pct, + "rank": i + 1, + "is_primary": is_primary, + "is_secondary": is_secondary, + } + all_mappings.append(row) + + if is_primary or is_secondary: + detected.append(row) + + return {"mappings": all_mappings, "detected": detected} diff --git a/murshid_backend/app/services/rule_service.py b/murshid_backend/app/services/rule_service.py new file mode 100644 index 0000000000000000000000000000000000000000..c1691c8e9935fb7082748fd5cb7474997461866b --- /dev/null +++ 
b/murshid_backend/app/services/rule_service.py @@ -0,0 +1,71 @@ +""" +RuleService โ€” orchestrates: + 1. ML analysis + 2. Persisting Rule + RuleTechniqueMapping rows + 3. Ensuring Technique rows exist +""" + +from __future__ import annotations + +import xml.etree.ElementTree as ET + +from sqlalchemy.orm import Session + +from app.repositories import rule_repo, template_repo +from app.services.ml_service import MLService + + +class RuleService: + def __init__(self, db: Session, ml: MLService | None = None) -> None: + self._db = db + self._ml = ml or MLService() + + # ------------------------------------------------------------------ + + def analyze_and_persist( + self, + rule_xml: str, + *, + job_id: int | None = None, + ) -> dict: + """ + Full pipeline call followed by DB persistence. + Returns the analysis result dict from the ML pipeline. + """ + result = self._ml.analyze(rule_xml) + + # Extract rule_id from raw XML (not the sanitised version) + rule_id = self._extract_rule_id(rule_xml) + + # Ensure each predicted technique has a row in `techniques` + for r in result["detected"]: + template_repo.get_or_create_technique( + self._db, + technique_id=r["technique_id"], + technique_name=r["technique_id"], + ) + + # Upsert rule row + rule_repo.upsert_rule(self._db, rule_id=rule_id, job_id=job_id) + + # Persist technique mappings + rule_repo.save_technique_mappings( + self._db, rule_id=rule_id, results=result["results"] + ) + + self._db.commit() + + # Attach rule_id to the result for convenience + result["rule_id"] = rule_id + return result + + # ------------------------------------------------------------------ + + @staticmethod + def _extract_rule_id(rule_xml: str) -> str: + try: + elem = ET.fromstring(rule_xml.strip()) + rid = elem.get("id", "").strip() + return rid if rid else "unknown" + except ET.ParseError: + return "unknown" diff --git a/murshid_backend/app/services/template_service.py b/murshid_backend/app/services/template_service.py new file mode 100644 index 
0000000000000000000000000000000000000000..bb4f360aa80073afef8fcc780e67f38b83a577fb --- /dev/null +++ b/murshid_backend/app/services/template_service.py @@ -0,0 +1,87 @@ +""" +TemplateService โ€” fetches and manages WQL query templates. +Use Case 2: "View Investigation WQL Queries" (ยง3.2.7). +Use Case 7: "Manage static query templates" (ยง3.2.7). +""" + +from __future__ import annotations + +from sqlalchemy.orm import Session + +from app.repositories.template_repo import ( + create_template, + get_templates_for_technique, + update_template, +) + +CONFIDENCE_THRESHOLD_SECONDARY = 0.5 # from ยง3.2.3.2 "secondary if score >= 0.5" + + +class TemplateService: + def __init__(self, db: Session) -> None: + self._db = db + + def get_queries_for_technique(self, technique_id: str) -> list[dict]: + """ + Returns all active WQL templates for the given technique. + Use Case 2. + """ + templates = get_templates_for_technique(self._db, technique_id) + return [ + { + "template_id": t.template_id, + "technique_id": t.technique_id, + "purpose": t.purpose, + "wql_query": t.wql_query, + "note": t.note, + } + for t in templates + ] + + def add_template( + self, + *, + technique_id: str, + purpose: str | None, + wql_query: str, + note: str | None, + ) -> dict: + """Admin: add a new WQL template. Use Case 7.""" + tpl = create_template( + self._db, + technique_id=technique_id, + purpose=purpose, + wql_query=wql_query, + note=note, + ) + self._db.commit() + return { + "template_id": tpl.template_id, + "technique_id": tpl.technique_id, + "purpose": tpl.purpose, + "wql_query": tpl.wql_query, + "note": tpl.note, + "is_active": tpl.is_active, + } + + def update_template(self, template_id: int, data: dict) -> dict | None: + """Admin: update or disable a template. 
Use Case 7.""" + tpl = update_template( + self._db, + template_id, + purpose=data.get("purpose"), + wql_query=data.get("wql_query"), + note=data.get("note"), + is_active=data.get("is_active"), + ) + if tpl is None: + return None + self._db.commit() + return { + "template_id": tpl.template_id, + "technique_id": tpl.technique_id, + "purpose": tpl.purpose, + "wql_query": tpl.wql_query, + "note": tpl.note, + "is_active": tpl.is_active, + } diff --git a/murshid_backend/requirements.txt b/murshid_backend/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..88cd0453663e72f26b12f4b2d49212a22b78bb1e --- /dev/null +++ b/murshid_backend/requirements.txt @@ -0,0 +1,22 @@ +fastapi>=0.115.0 +uvicorn[standard]>=0.32.0 +pydantic>=2.9.0 +pydantic-settings>=2.6.0 +python-dotenv>=1.0.0 + +# database +sqlalchemy>=2.0.0 +alembic>=1.13.0 +pymysql>=1.1.0 +cryptography>=43.0.0 + +# ML / numerics +numpy>=1.26.0 +joblib>=1.4.0 +torch>=2.0.0 +transformers>=4.44.0 +accelerate>=0.34.0 +bitsandbytes>=0.46.1 +sentencepiece>=0.2.0 +lxml>=5.0.0 +huggingface_hub>=0.25.0 diff --git a/murshid_backend/requirements_light.txt b/murshid_backend/requirements_light.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f9426bd41267892ab7029fc9596a29695beb793 --- /dev/null +++ b/murshid_backend/requirements_light.txt @@ -0,0 +1,13 @@ +# ุฎููŠู โ€” ู„ู„ุงุฎุชุจุงุฑ ุงู„ุฃูˆู„ูŠ ุจุฏูˆู† GPU/LLM +fastapi>=0.115.0 +uvicorn[standard]>=0.32.0 +pydantic>=2.9.0 +pydantic-settings>=2.6.0 +python-dotenv>=1.0.0 +sqlalchemy>=2.0.0 +alembic>=1.13.0 +pymysql>=1.1.0 +cryptography>=43.0.0 +numpy>=1.26.0 +joblib>=1.4.0 +lxml>=5.0.0 diff --git a/murshid_backend/scripts/__init__.py b/murshid_backend/scripts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f23051728e4f570f0fe2d03d0689510cf9e343b8 --- /dev/null +++ b/murshid_backend/scripts/__init__.py @@ -0,0 +1 @@ +"""Scripts for one-time data operations.""" diff --git 
r"""
Import WQL templates from Excel into the database.

Run once:
    cd d:\GP\murshid_backend
    .venv\Scripts\python.exe scripts\import_excel_templates.py

Pass ``--replace`` to overwrite existing templates instead of skipping them.
"""

import re
import sys
from pathlib import Path

# Make the backend package importable when this script is run directly.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

# NOTE(review): openpyxl is not listed in requirements.txt/requirements_light.txt —
# confirm the deployment image installs it before this script runs.
import openpyxl
from sqlalchemy.orm import Session

from app.config import settings
from app.db.session import SessionLocal
from app.models.query_template import QueryTemplate
from app.models.technique import Technique

EXCEL_PATH = Path(settings.murshid_models_dir) / "murshid_query_template_structure_clean_shared.xlsx"

# Fallback: project root (one level above the backend package).
if not EXCEL_PATH.is_file():
    EXCEL_PATH = Path(__file__).resolve().parent.parent.parent / "murshid_query_template_structure_clean_shared.xlsx"


def _cell(row: tuple, idx: int) -> str:
    """Return cell *idx* of *row* as a stripped string ('' if missing or None).

    Guards against sheets narrower than the expected 6 columns, which would
    otherwise raise IndexError on positional access.
    """
    if idx >= len(row) or row[idx] is None:
        return ""
    return str(row[idx]).strip()


def normalise_query(q: str | None) -> str:
    """Collapse all runs of whitespace/newlines in a WQL query to single spaces."""
    if not q:
        return ""
    return re.sub(r"\s+", " ", q.strip())


def run(db: Session, replace: bool = False) -> dict:
    """Import templates from EXCEL_PATH into the DB; return a summary dict.

    Expected columns (1-indexed in the sheet, header on row 1):
        A technique_id, B technique_name, C template_id (e.g. "T1484-1"),
        D purpose, E wql_query, F note.

    :param db: open SQLAlchemy session; committed once at the end.
    :param replace: when True, overwrite existing (technique_id, purpose)
        templates in place; otherwise skip duplicates so re-runs are idempotent.
    """
    if not EXCEL_PATH.is_file():
        return {"error": f"Excel file not found: {EXCEL_PATH}"}

    wb = openpyxl.load_workbook(EXCEL_PATH)
    try:
        ws = wb.active
        rows = list(ws.iter_rows(min_row=2, values_only=True))
    finally:
        # Release the file handle; all cell values are materialized in `rows`.
        wb.close()

    inserted_techniques = 0
    inserted_templates = 0
    skipped = 0
    errors: list[str] = []

    for idx, row in enumerate(rows, start=2):
        technique_id = _cell(row, 0)
        technique_name = _cell(row, 1)
        template_id_str = _cell(row, 2)  # e.g. "T1484-1"; informational only
        purpose = _cell(row, 3) or None
        wql_query = normalise_query(_cell(row, 4))
        note = _cell(row, 5) or None

        # A row without a technique or a query carries nothing importable.
        if not technique_id or not wql_query:
            skipped += 1
            continue

        # 1. Upsert Technique (create if missing, backfill name if blank).
        tech = db.get(Technique, technique_id)
        if tech is None:
            tech = Technique(
                technique_id=technique_id,
                technique_name=technique_name or technique_id,
                tactic=None,
            )
            db.add(tech)
            db.flush()  # make the FK target visible before templates reference it
            inserted_techniques += 1
        elif technique_name and not tech.technique_name:
            tech.technique_name = technique_name

        # 2. Insert QueryTemplate. Uniqueness is checked by
        #    (technique_id, purpose) so re-runs don't create duplicates.
        existing = (
            db.query(QueryTemplate)
            .filter(
                QueryTemplate.technique_id == technique_id,
                QueryTemplate.purpose == purpose,
            )
            .first()
        )

        if existing:
            if replace:
                existing.wql_query = wql_query
                existing.note = note
                existing.is_active = True
                inserted_templates += 1
            else:
                skipped += 1
            continue

        tpl = QueryTemplate(
            technique_id=technique_id,
            purpose=purpose,
            wql_query=wql_query,
            note=note,
            is_active=True,
        )
        db.add(tpl)
        inserted_templates += 1

    db.commit()

    return {
        "excel_path": str(EXCEL_PATH),
        "rows_processed": len(rows),
        "techniques_inserted": inserted_techniques,
        "templates_inserted": inserted_templates,
        "skipped": skipped,
        "errors": errors,
    }


if __name__ == "__main__":
    replace = "--replace" in sys.argv

    db: Session = SessionLocal()
    try:
        result = run(db, replace=replace)
        print("\n=== Import Result ===")
        for k, v in result.items():
            print(f"  {k}: {v}")
    finally:
        db.close()
+ + + + + + + +
+ + + + diff --git a/start.sh b/start.sh new file mode 100644 index 0000000000000000000000000000000000000000..41f7777220fae2ed0467233676c6003367ebb372 --- /dev/null +++ b/start.sh @@ -0,0 +1,28 @@ +#!/bin/bash +set -e + +cd /app/murshid_backend + +# Run Alembic migrations +echo "๐Ÿ”„ Running database migrations..." +python -m alembic upgrade head +echo "โœ… Database ready" + +# Import Excel templates (if not already imported) +echo "๐Ÿ“Š Importing WQL templates from Excel..." +python -c " +from app.db.session import SessionLocal +from scripts.import_excel_templates import run as import_excel +db = SessionLocal() +try: + result = import_excel(db, replace=False) + print('Templates:', result) +finally: + db.close() +" || echo "โš ๏ธ Template import skipped (non-critical)" + +echo "๐Ÿš€ Starting Murshid API on port ${PORT:-7860}..." +exec python -m uvicorn app.main:app \ + --host 0.0.0.0 \ + --port "${PORT:-7860}" \ + --log-level info