aelgendy commited on
Commit
20edea9
·
1 Parent(s): 1e67e7f

Upload folder using huggingface_hub

Browse files
Files changed (14) hide show
  1. .env.example +96 -0
  2. .gitattributes +0 -2
  3. .gitignore +28 -2
  4. ARCHITECTURE.md +235 -0
  5. DOCKER.md +443 -0
  6. Dockerfile +31 -9
  7. OPEN_WEBUI.md +385 -0
  8. README.md +294 -16
  9. SETUP.md +590 -0
  10. build_index.py +74 -64
  11. docker-compose.yml +42 -4
  12. enrich_dataset.py +210 -0
  13. main.py +695 -346
  14. requirements.txt +21 -9
.env.example ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QModel v4 Configuration Template
2
+ # ==================================
3
+ # Copy this to .env and update values for your environment
4
+
5
+ # LLM Backend Selection
6
+ # Options: "hf" (HuggingFace) or "ollama"
7
+ LLM_BACKEND=ollama
8
+
9
+ # ─────────────────────────────────────────────────────────────────────
10
+ # OLLAMA BACKEND (if LLM_BACKEND=ollama)
11
+ # ─────────────────────────────────────────────────────────────────────
12
+ OLLAMA_HOST=http://localhost:11434
13
+ OLLAMA_MODEL=minimax-m2.7:cloud
14
+ # Available models: llama3.1, mistral, neural-chat, openhermes
15
+
16
+ # ─────────────────────────────────────────────────────────────────────
17
+ # HUGGINGFACE BACKEND (if LLM_BACKEND=hf)
18
+ # ─────────────────────────────────────────────────────────────────────
19
+ # HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
20
+ # HF_DEVICE=auto # Options: auto, cuda, cpu
21
+ # HF_MAX_NEW_TOKENS=2048
22
+ # Popular models:
23
+ # - Qwen/Qwen2-7B-Instruct (excellent Arabic)
24
+ # - mistralai/Mistral-7B-Instruct-v0.2
25
+ # - meta-llama/Llama-2-13b-chat-hf
26
+
27
+ # ─────────────────────────────────────────────────────────────────────
28
+ # EMBEDDING MODEL (shared by both backends)
29
+ # ─────────────────────────────────────────────────────────────────────
30
+ EMBED_MODEL=intfloat/multilingual-e5-large
31
+
32
+ # ─────────────────────────────────────────────────────────────────────
33
+ # DATA FILES
34
+ # ─────────────────────────────────────────────────────────────────────
35
+ FAISS_INDEX=QModel.index
36
+ METADATA_FILE=metadata.json
37
+
38
+ # ─────────────────────────────────────────────────────────────────────
39
+ # RETRIEVAL SETTINGS
40
+ # ─────────────────────────────────────────────────────────────────────
41
+ TOP_K_SEARCH=20 # Candidate pool size
42
+ TOP_K_RETURN=5 # Final results returned to user
43
+
44
+ # ─────────────────────────────────────────────────────────────────────
45
+ # GENERATION SETTINGS
46
+ # ─────────────────────────────────────────────────────────────────────
47
+ TEMPERATURE=0.2 # 0.0=deterministic, 1.0=creative
48
+ MAX_TOKENS=2048 # Max output length
49
+
50
+ # ─────────────────────────────────────────────────────────────────────
51
+ # SAFETY & QUALITY
52
+ # ─────────────────────────────────────────────────────────────────────
53
+ # Confidence threshold: Below this score, skip LLM and return "not found"
54
+ # Prevents hallucinations but may miss valid results
55
+ # Range: 0.0-1.0 (default 0.30)
56
+ # Tune up (0.50+) for stricter, tune down (0.20) for looser
57
+ CONFIDENCE_THRESHOLD=0.30
58
+
59
+ # Hadith boost: Score bonus when intent=hadith
60
+ # Prevents Quran verses from outranking relevant Hadiths
61
+ HADITH_BOOST=0.08
62
+
63
+ # ─────────────────────────────────────────────────────────────────────
64
+ # RANKING
65
+ # ─────────────────────────────────────────────────────────────────────
66
+ RERANK_ALPHA=0.6 # 60% dense (embedding), 40% sparse (BM25)
67
+
68
+ # ─────────────────────────────────────────────────────────────────────
69
+ # CACHING
70
+ # ─────────────────────────────────────────────────────────────────────
71
+ CACHE_SIZE=512 # Max cache entries
72
+ CACHE_TTL=3600 # Cache expiry in seconds
73
+
74
+ # ─────────────────────────────────────────────────────────────────────
75
+ # SECURITY
76
+ # ─────────────────────────────────────────────────────────────────────
77
+ ALLOWED_ORIGINS=* # CORS origins (restrict in production: origin1.com,origin2.com)
78
+
79
+ # ─────────────────────────────────────────────────────────────────────
80
+ # USAGE EXAMPLES
81
+ # ─────────────────────────────────────────────────────────────────────
82
+ #
83
+ # Development (Ollama):
84
+ # LLM_BACKEND=ollama
85
+ # OLLAMA_HOST=http://localhost:11434
86
+ # OLLAMA_MODEL=llama2
87
+ #
88
+ # Production (HuggingFace GPU):
89
+ # LLM_BACKEND=hf
90
+ # HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
91
+ # HF_DEVICE=cuda
92
+ #
93
+ # Production (HuggingFace CPU):
94
+ # LLM_BACKEND=hf
95
+ # HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
96
+ # HF_DEVICE=cpu
.gitattributes CHANGED
@@ -1,4 +1,2 @@
1
  # Auto detect text files and perform LF normalization
2
  * text=auto
3
- QModel.index filter=lfs diff=lfs merge=lfs -text
4
- metadata.json filter=lfs diff=lfs merge=lfs -text
 
1
  # Auto detect text files and perform LF normalization
2
  * text=auto
 
 
.gitignore CHANGED
@@ -173,12 +173,38 @@ cython_debug/
173
  # PyPI configuration file
174
  .pypirc
175
 
176
- # Cursor
177
- # Cursor is an AI-powered code editor.`.cursorignore` specifies files/directories to
178
  # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
179
  # refer to https://docs.cursor.com/context/ignore-files
180
  .cursorignore
181
  .cursorindexingignore
182
 
 
 
 
 
 
 
183
  .DS_Store
 
 
 
 
 
 
 
 
184
  data/
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  # PyPI configuration file
174
  .pypirc
175
 
176
+ # Cursor
177
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
178
  # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
179
  # refer to https://docs.cursor.com/context/ignore-files
180
  .cursorignore
181
  .cursorindexingignore
182
 
183
+ # IDE and Editor Settings
184
+ .vscode/
185
+ .idea/
186
+ *.swp
187
+ *.swo
188
+ *~
189
  .DS_Store
190
+ Thumbs.db
191
+
192
+ # Local Environment Files
193
+ .env
194
+ .env.local
195
+ .env*.local
196
+
197
+ # Development Artifacts
198
  data/
199
+ *.log
200
+ *.tmp
201
+ .cache/
202
+
203
+ # Editor/IDE specific
204
+ *.sublime-project
205
+ *.sublime-workspace
206
+ .vim/
207
+ .emacs.d/.DS_Store
208
+
209
+ QModel.index
210
+ metadata.json
ARCHITECTURE.md ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QModel v4 Architecture — Detailed System Design
2
+
3
+ > For a quick overview, see [README.md](README.md#architecture-overview)
4
+
5
+ ## System Vision
6
+ A RAG system specialized **exclusively** in authenticated Qur'an and Hadith. No hallucinations, no outside knowledge—only content from verified sources.
7
+
8
+ ## Core Capabilities
9
+
10
+ ### 1. **Quran Analysis**
11
+ - **Verse Lookup**: Find verses by topic, keyword, or Surah
12
+ - **Word Frequency**: Count word/phrase occurrences across all 114 Surahs
13
+ - **Topic Tafsir**: Retrieve and explain related Quranic verses
14
+ - **Bilingual**: Arabic (Uthmani) + English (Saheeh International)
15
+
16
+ ### 2. **Hadith Operations**
17
+ - **Authentication Status**: Verify if a Hadith is in an authenticated collection
18
+ - **Grade Display**: Show authenticity grade (Sahih, Hasan, Da'if, etc.)
19
+ - **Topic Search**: Find Hadiths related to topics across 7 major collections
20
+ - **Collection Navigation**: Filter by Bukhari, Muslim, Abu Dawud, Tirmidhi, Ibn Majah, Nasa'i, Malik
21
+
22
+ ### 3. **Safety First**
23
+ - **Confidence Gating**: Low-confidence queries return "not found" instead of LLM guess
24
+ - **Source Attribution**: Every answer cites exact verse/Hadith with reference
25
+ - **Grade Filtering**: Optional: only return Sahih-authenticated Hadiths
26
+ - **Verbatim Quotes**: Copy text directly from data, no paraphrasing
27
+
28
+ ---
29
+
30
+ ## Data Pipeline
31
+
32
+ The system follows a three-phase approach:
33
+
34
+ **Metadata Schema**:
35
+ ```json
36
+ {
37
+ "id": "surah:verse or hadith_prefix_number",
38
+ "arabic": "...",
39
+ "english": "...",
40
+ "source": "Surah Al-Baqarah 2:43 | Sahih al-Bukhari 1",
41
+ "type": "quran | hadith",
42
+
43
+ // Quran only
44
+ "surah_number": 2,
45
+ "surah_name_en": "Al-Baqarah",
46
+ "surah_name_ar": "البقرة",
47
+ "verse_number": 43,
48
+
49
+ // Hadith only
50
+ "collection": "Sahih al-Bukhari",
51
+ "grade": "Sahih",
52
+ "hadith_number": 1
53
+ }
54
+ ```
55
+
56
+ ### Phase 2: Indexing
57
+ ```
58
+ build_index.py
59
+ ├── Load Quran + Hadith JSON
60
+ ├── Encode all texts with multilingual-e5-large
61
+ │ ├── Dual embeddings: Arabic + English per item
62
+ │ └── Normalize before encoding
63
+ └── Build FAISS IndexFlatIP for dense retrieval
64
+ ```
65
+
66
+ ### Phase 3: Retrieval & Ranking
67
+
68
+ **Hybrid Search Algorithm**:
69
+ 1. Dense retrieval: FAISS semantic scoring
70
+ 2. Sparse retrieval: BM25 term-frequency ranking
71
+ 3. Fusion: 60% dense + 40% sparse
72
+ 4. Intent-aware boost: +0.08 to Hadith items when intent=hadith
73
+ 5. Type filter: Optional (quran_only / hadith_only / authenticated_only)
74
+
75
+ ---
76
+
77
+ ## Core Components
78
+
79
+ ### `fetch_data.py` — Data Acquisition
80
+ - Fetches complete Quran and 7 Hadith collections
81
+ - Handles network retries + CDN redirects
82
+ - Normalizes and validates data
83
+ - Exports `data/quran.json` and `data/hadith.json`
84
+
85
+ ### `build_index.py` — Index Construction
86
+ - Loads datasets and embeddings model
87
+ - Creates dual-language FAISS vectors
88
+ - Serializes to `QModel.index` + `metadata.json`
89
+
90
+ ### `main.py` — Inference Engine
91
+ **Three processing layers**:
92
+
93
+ 1. **Query Layer** (Rewriting & Intent Detection)
94
+ - `rewrite_query()` — dual-language normalization, spelling correction
95
+ - `detect_analysis_intent()` — detects word frequency queries
96
+ - `detect_language()` — routes to Arabic or English persona
97
+
98
+ 2. **Retrieval Layer** (Semantic Search)
99
+ - `hybrid_search()` — FAISS + BM25 fusion
100
+ - `count_occurrences()` — exact/stemmed word frequency across dataset
101
+ - Caching at query level for fast follow-ups
102
+
103
+ 3. **Generation Layer** (Safe LLM Call)
104
+ - `chat_with_fallback()` — Ollama with 3-model fallback chain
105
+ - `build_context()` — formats retrieved items with scores
106
+ - `build_messages()` — intent-aware prompts with few-shot examples
107
+ - Confidence gate: skips LLM if top_score < threshold
108
+
109
+ **Anti-Hallucination Measures**:
110
+ - Few-shot examples including "not found" refusal path
111
+ - Hardcoded format rules (box/citation format required)
112
+ - Verbatim copy rules (no reconstruction from memory)
113
+ - Confidence threshold gating (default: 0.30)
114
+
115
+ ---
116
+
117
+ ## API Endpoints
118
+
119
+ ### `GET /ask?q=<question>&top_k=5`
120
+ Returns structured Islamic answer with full lineage.
121
+
122
+ **Response**:
123
+ ```json
124
+ {
125
+ "question": "...",
126
+ "answer": "...",
127
+ "language": "arabic | english | mixed",
128
+ "intent": "tafsir | hadith | fatwa | count | general",
129
+ "analysis": {
130
+ "keyword": "محمد",
131
+ "total_count": 157,
132
+ "examples": [...]
133
+ },
134
+ "sources": [
135
+ {
136
+ "rank": 1,
137
+ "source": "Sahih al-Bukhari 1",
138
+ "type": "hadith",
139
+ "grade": "Sahih",
140
+ "_score": 0.876
141
+ }
142
+ ],
143
+ "top_score": 0.876,
144
+ "latency_ms": 342
145
+ }
146
+ ```
147
+
148
+ ### `GET /debug/scores?q=<question>&top_k=10`
149
+ Inspect raw retrieval scores without LLM call. Use to calibrate `CONFIDENCE_THRESHOLD`.
150
+
151
+ ### `POST /v1/chat/completions`
152
+ OpenAI-compatible endpoint for language model clients.
153
+
154
+ ---
155
+
156
+ ## Configuration
157
+
158
+ **`.env` priority**:
159
+ ```
160
+ OLLAMA_HOST # Ollama server URL
161
+ LLM_MODEL # Primary model (e.g. minimax-m2.7:cloud)
162
+ EMBED_MODEL # Embedding model (intfloat/multilingual-e5-large)
163
+ FAISS_INDEX # Path to QModel.index
164
+ METADATA_FILE # Path to metadata.json
165
+ CONFIDENCE_THRESHOLD # Min hybrid score for LLM call (default: 0.30)
166
+ HADITH_BOOST # Intent-aware boost for Hadith (default: 0.08)
167
+ TOP_K_SEARCH # Retrieval candidate pool (default: 20)
168
+ TOP_K_RETURN # Results returned to user (default: 5)
169
+ TEMPERATURE # LLM creativity (default: 0.2 for factual)
170
+ ```
171
+
172
+ ---
173
+
174
+ ## Deployment
175
+
176
+ ### Local Development
177
+ ```bash
178
+ python main.py
179
+ # API at http://localhost:8000
180
+ # Docs at http://localhost:8000/docs
181
+ ```
182
+
183
+ ### Docker
184
+ ```bash
185
+ docker-compose up
186
+ # Ollama on port 11434
187
+ # QModel on port 8000
188
+ ```
189
+
190
+ ---
191
+
192
+ ## Testing the System
193
+
194
+ ### 1. Word Frequency Query
195
+ ```
196
+ Q: "How many times is the word 'mercy' mentioned in the Quran?"
197
+ → Detects 'count' intent
198
+ → Calls count_occurrences()
199
+ → Returns: 114 occurrences with examples
200
+ ```
201
+
202
+ ### 2. Hadith Authenticity Check
203
+ ```
204
+ Q: "Is the Hadith 'Actions are judged by intentions' authentic?"
205
+ → Searches dataset
206
+ → Returns: "Sahih al-Bukhari 1 — Grade: Sahih"
207
+ → LLM elaborates on significance
208
+ ```
209
+
210
+ ### 3. Topic-Based Aya Retrieval
211
+ ```
212
+ Q: "What does the Quran say about patience?"
213
+ → Retrieves top 5 verses about patience
214
+ → Returns: Verses with Tafsir and interconnections
215
+ ```
216
+
217
+ ### 4. Confidence Gate in Action
218
+ ```
219
+ Q: "Who was Muhammad's 7th wife?" (not in dataset)
220
+ → Retrieval score: 0.15 (below 0.30 threshold)
221
+ → Returns: "Not in available dataset"
222
+ → LLM not called (prevents hallucination)
223
+ ```
224
+
225
+ ---
226
+
227
+ ## Roadmap: v4 Enhancements
228
+
229
+ - [ ] Grade-based filtering: `?grade=sahih` to return only authenticated Hadiths
230
+ - [ ] Chain of narrators: Display Isnad with full narrator details
231
+ - [ ] Synonym expansion: Better topic matching (e.g., "mercy" → "rahma, compassion")
232
+ - [ ] Multi-Surah topics: Topics spanning multiple Surahs
233
+ - [ ] Batch processing: Handle multiple questions in one request
234
+ - [ ] Streaming responses: SSE for long-form answers
235
+ - [ ] Islamic calendar integration: Hijri date references
DOCKER.md ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QModel Docker Guide
2
+
3
+ Complete guide for running QModel in Docker with both backend options.
4
+
5
+ ## Quick Start
6
+
7
+ ### Option 1: Docker Compose (Recommended)
8
+
9
+ ```bash
10
+ # 1. Copy example config
11
+ cp .env.example .env
12
+
13
+ # 2. Edit .env and choose your backend (see below)
14
+ nano .env
15
+
16
+ # 3. Run with compose
17
+ docker-compose up
18
+ ```
19
+
20
+ API available at: `http://localhost:8000`
21
+
22
+ ### Option 2: Docker CLI
23
+
24
+ ```bash
25
+ # Build image
26
+ docker build -t qmodel .
27
+
28
+ # Run with Ollama backend
29
+ docker run -p 8000:8000 \
30
+ --env-file .env \
31
+ --add-host host.docker.internal:host-gateway \
32
+ qmodel
33
+
34
+ # Or run with HuggingFace backend
35
+ docker run -p 8000:8000 \
36
+ --env-file .env \
37
+ --env HF_TOKEN=your_token_here \
38
+ qmodel
39
+ ```
40
+
41
+ ---
42
+
43
+ ## Backend Configuration
44
+
45
+ Configure which backend to use via `.env` file:
46
+
47
+ ### Backend 1: Ollama (Local)
48
+
49
+ **Best for**: Development, testing, Docker Desktop
50
+
51
+ ```bash
52
+ # .env
53
+ LLM_BACKEND=ollama
54
+ OLLAMA_HOST=http://host.docker.internal:11434
55
+ OLLAMA_MODEL=llama2
56
+ ```
57
+
58
+ **Prerequisites**:
59
+ - Ollama installed on host machine
60
+ - Running: `ollama serve`
61
+ - Model pulled: `ollama pull llama2`
62
+
63
+ **Why**:
64
+ - ✅ Fast setup
65
+ - ✅ No GPU required
66
+ - ✅ Works on Docker Desktop (Mac/Windows)
67
+ - ❌ Requires host Ollama service
68
+
69
+ ### Backend 2: HuggingFace (Remote)
70
+
71
+ **Best for**: Production, GPU servers, containerized environments
72
+
73
+ ```bash
74
+ # .env
75
+ LLM_BACKEND=hf
76
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
77
+ HF_DEVICE=auto
78
+ ```
79
+
80
+ **Prerequisites**:
81
+ - GPU (recommended) OR significant RAM
82
+ - HuggingFace token (for gated models)
83
+
84
+ **Passing HF Token**:
85
+ ```bash
86
+ # Via docker-compose
87
+ export HF_TOKEN=your_token_here
88
+ docker-compose up
89
+
90
+ # Via docker run
91
+ docker run -p 8000:8000 \
92
+ --env-file .env \
93
+ --env HF_TOKEN=your_token_here \
94
+ qmodel
95
+ ```
96
+
97
+ ---
98
+
99
+ ## Docker Compose Configuration
100
+
101
+ The `docker-compose.yml` includes:
102
+
103
+ | Setting | Value | Description |
104
+ |---------|-------|-------------|
105
+ | **Image** | Builds from `Dockerfile` | Python 3.11 + dependencies |
106
+ | **Port** | `8000:8000` | API port mapping |
107
+ | **Env File** | `.env` | Configuration source |
108
+ | **HF Token** | From `.env` or `${HF_TOKEN}` | For HuggingFace auth |
109
+ | **Ollama Host** | `host.docker.internal:11434` | Connect to host Ollama |
110
+ | **Volumes** | `.:/app` | Code changes sync (dev mode) |
111
+ | **HF Cache** | `/root/.cache/huggingface` | Persistent model cache |
112
+ | **Networks** | `qmodel-network` | Internal network |
113
+ | **Health Check** | `/health` endpoint | Auto-restart on failure |
114
+
115
+ ### For Production
116
+
117
+ Modify `docker-compose.yml`:
118
+ ```yaml
119
+ services:
120
+ qmodel:
121
+ # ... (same as above)
122
+ volumes:
123
+ # Remove live code volume
124
+ - huggingface_cache:/root/.cache/huggingface
125
+ restart: on-failure:5
126
+ ```
127
+
128
+ ---
129
+
130
+ ## Examples
131
+
132
+ ### Development with Ollama
133
+
134
+ ```bash
135
+ # Terminal 1: Start Ollama
136
+ ollama serve
137
+
138
+ # Terminal 2: Run QModel
139
+ cat > .env << EOF
140
+ LLM_BACKEND=ollama
141
+ OLLAMA_HOST=http://host.docker.internal:11434
142
+ OLLAMA_MODEL=llama2
143
+ TEMPERATURE=0.2
144
+ CONFIDENCE_THRESHOLD=0.30
145
+ EOF
146
+
147
+ docker-compose up
148
+ ```
149
+
150
+ Access: `http://localhost:8000`
151
+
152
+ ### Production with HuggingFace
153
+
154
+ ```bash
155
+ # Create .env for production
156
+ cat > .env << EOF
157
+ LLM_BACKEND=hf
158
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
159
+ HF_DEVICE=auto
160
+ TEMPERATURE=0.1
161
+ CONFIDENCE_THRESHOLD=0.35
162
+ ALLOWED_ORIGINS=yourdomain.com
163
+ EOF
164
+
165
+ # Export HF token
166
+ export HF_TOKEN=hf_xxxxxxxxxxxxx
167
+
168
+ # Run
169
+ docker-compose up -d
170
+ docker-compose logs -f
171
+ ```
172
+
173
+ ### Detached Mode
174
+
175
+ ```bash
176
+ # Run in background
177
+ docker-compose up -d
178
+
179
+ # View logs
180
+ docker-compose logs -f
181
+
182
+ # Check status
183
+ docker-compose ps
184
+
185
+ # Stop
186
+ docker-compose down
187
+ ```
188
+
189
+ ---
190
+
191
+ ## Troubleshooting
192
+
193
+ ### "Cannot connect to Ollama"
194
+
195
+ **Symptom**: `ConnectionRefusedError` when using Ollama backend
196
+
197
+ **Solution**:
198
+ ```bash
199
+ # Ensure Ollama is running on host
200
+ ollama serve
201
+
202
+ # Verify in Docker container
203
+ docker run --add-host host.docker.internal:host-gateway qmodel \
204
+ python -c "import requests; print(requests.get('http://host.docker.internal:11434/api/tags').json())"
205
+ ```
206
+
207
+ ### "HuggingFace model not found"
208
+
209
+ **Symptom**: `OSError: ... not found`
210
+
211
+ **Solution**:
212
+ ```bash
213
+ # Check HF token is set
214
+ echo $HF_TOKEN
215
+
216
+ # If not set, export it
217
+ export HF_TOKEN=hf_xxxxxxxxxxxxx
218
+ docker-compose up
219
+ ```
220
+
221
+ ### "Out of memory"
222
+
223
+ **Symptom**: Container exits with no error message
224
+
225
+ **Solution**:
226
+ - Use smaller model: `HF_MODEL_NAME=mistralai/Mistral-7B-Instruct-v0.2`
227
+ - Use Ollama with `neural-chat` model
228
+ - Increase Docker memory limits:
229
+
230
+ ```bash
231
+ # Edit docker-compose.yml
232
+ services:
233
+ qmodel:
234
+ deploy:
235
+ resources:
236
+ limits:
237
+ memory: 16G
238
+ ```
239
+
240
+ ### "Port already in use"
241
+
242
+ **Symptom**: `Address already in use`
243
+
244
+ **Solution**:
245
+ ```bash
246
+ # Change port in docker-compose.yml
247
+ ports:
248
+ - "8001:8000"
249
+
250
+ # Or kill existing container
251
+ docker-compose down
252
+ docker system prune
253
+ ```
254
+
255
+ ---
256
+
257
+ ## Building Custom Images
258
+
259
+ ### Build for Specific Backend
260
+
261
+ No code changes needed - just use `.env` to configure.
262
+
263
+ ### Build with Custom Requirements
264
+
265
+ ```bash
266
+ # Edit requirements.txt, then rebuild
267
+ docker build -t qmodel:custom .
268
+ ```
269
+
270
+ ### Push to Registry
271
+
272
+ ```bash
273
+ # Tag for registry
274
+ docker tag qmodel myregistry/qmodel:v4.1
275
+
276
+ # Push
277
+ docker push myregistry/qmodel:v4.1
278
+
279
+ # Run from registry
280
+ docker run -p 8000:8000 \
281
+ --env-file .env \
282
+ myregistry/qmodel:v4.1
283
+ ```
284
+
285
+ ---
286
+
287
+ ## Performance Tips
288
+
289
+ ### Docker Compose with GPU (Linux)
290
+
291
+ ```yaml
292
+ services:
293
+ qmodel:
294
+ deploy:
295
+ resources:
296
+ reservations:
297
+ devices:
298
+ - driver: nvidia
299
+ count: 1
300
+ capabilities: [gpu]
301
+ ```
302
+
303
+ Then set in `.env`:
304
+ ```bash
305
+ HF_DEVICE=cuda
306
+ ```
307
+
308
+ ### Reduce Memory Usage
309
+
310
+ ```bash
311
+ # In .env
312
+ HF_MODEL_NAME=gpt2 # Tiny model
313
+ OLLAMA_MODEL=orca-mini # Smaller Ollama model
314
+ TOP_K_SEARCH=10 # Fewer candidates
315
+ ```
316
+
317
+ ### Cache Management
318
+
319
+ ```bash
320
+ # Clear HuggingFace cache
321
+ docker-compose down
322
+ docker volume rm qmodel_huggingface_cache
323
+
324
+ # Or cleanup all
325
+ docker system prune -a
326
+ ```
327
+
328
+ ---
329
+
330
+ ## Docker Networking
331
+
332
+ ### Access QModel from Host
333
+
334
+ ```bash
335
+ # Default (works)
336
+ curl http://localhost:8000/health
337
+ ```
338
+
339
+ ### Custom Network
340
+
341
+ ```bash
342
+ # Create network
343
+ docker network create qmodel-net
344
+
345
+ # Run with network
346
+ docker-compose -f docker-compose.yml up
347
+ ```
348
+
349
+ ### Multiple Containers
350
+
351
+ ```yaml
352
+ # docker-compose.yml
353
+ services:
354
+ qmodel:
355
+ networks:
356
+ - custom-network
357
+ other-service:
358
+ networks:
359
+ - custom-network
360
+
361
+ networks:
362
+ custom-network:
363
+ driver: bridge
364
+ ```
365
+
366
+ ---
367
+
368
+ ## CI/CD Integration
369
+
370
+ ### GitHub Actions Example
371
+
372
+ ```yaml
373
+ name: Deploy QModel
374
+
375
+ on: [push]
376
+
377
+ jobs:
378
+ deploy:
379
+ runs-on: ubuntu-latest
380
+ steps:
381
+ - uses: actions/checkout@v2
382
+
383
+ - name: Build Docker image
384
+ run: docker build -t qmodel .
385
+
386
+ - name: Run tests
387
+ run: |
388
+ docker run -p 8000:8000 qmodel &
389
+ sleep 30
390
+ curl http://localhost:8000/health
391
+
392
+ - name: Push to registry
393
+ run: |
394
+ echo ${{ secrets.REGISTRY_TOKEN }} | docker login -u ${{ secrets.REGISTRY_USER }} --password-stdin
395
+ docker tag qmodel myregistry/qmodel:${{ github.sha }}
396
+ docker push myregistry/qmodel:${{ github.sha }}
397
+ ```
398
+
399
+ ---
400
+
401
+ ## Security Considerations
402
+
403
+ ### Secrets Management
404
+
405
+ ```bash
406
+ # Don't commit .env with real tokens
407
+ echo ".env" >> .gitignore
408
+
409
+ # Use Docker secrets (Swarm mode)
410
+ docker secret create hf_token -
411
+ # Then use in compose:
412
+ # HF_TOKEN=${HF_TOKEN_FILE}
413
+ ```
414
+
415
+ ### CORS Configuration
416
+
417
+ ```bash
418
+ # In .env (restrict in production)
419
+ ALLOWED_ORIGINS=yourdomain.com,api.yourdomain.com
420
+ ```
421
+
422
+ ### Network Isolation
423
+
424
+ ```yaml
425
+ # docker-compose.yml
426
+ services:
427
+ qmodel:
428
+ networks:
429
+ - internal
430
+
431
+ networks:
432
+ internal:
433
+ internal: true
434
+ ```
435
+
436
+ ---
437
+
438
+ ## Reference
439
+
440
+ - **Dockerfile**: Multi-stage build, health checks, proper layer caching
441
+ - **docker-compose.yml**: Service definition, volumes, networking, health checks
442
+ - **Environment**: Fully configurable via `.env`
443
+ - **Backends**: Ollama (local) or HuggingFace (remote) via `LLM_BACKEND` variable
Dockerfile CHANGED
@@ -1,29 +1,51 @@
1
- # Use an official Python runtime as a parent image
 
 
 
 
 
 
 
2
  FROM python:3.11-slim
3
 
4
- # Set environment variables
5
- ENV PYTHONDONTWRITEBYTECODE 1
6
- ENV PYTHONUNBUFFERED 1
 
 
 
 
 
 
7
 
8
- # Set the working directory in the container
9
  WORKDIR /app
10
 
11
  # Install system dependencies
 
 
 
12
  RUN apt-get update && apt-get install -y --no-install-recommends \
13
  build-essential \
14
  libopenblas-dev \
15
  libomp-dev \
 
16
  && rm -rf /var/lib/apt/lists/*
17
 
18
- # Install Python dependencies
19
  COPY requirements.txt .
20
  RUN pip install --no-cache-dir -r requirements.txt
21
 
22
- # Copy the rest of the application code
23
  COPY . .
24
 
25
- # Expose the port the app runs on
26
  EXPOSE 8000
27
 
28
- # Command to run the application
 
 
 
 
 
29
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
 
1
+ # QModel v4 - Islamic RAG API
2
+ # =============================
3
+ # Dockerfile for QModel API
4
+ # Supports both Ollama and HuggingFace backends via .env configuration
5
+ #
6
+ # Build: docker build -t qmodel .
7
+ # Run: docker run -p 8000:8000 --env-file .env qmodel
8
+
9
  FROM python:3.11-slim
10
 
11
+ # Metadata
12
+ LABEL maintainer="QModel Team"
13
+ LABEL description="QModel v4 - Quran & Hadith RAG API"
14
+ LABEL version="4.1"
15
+
16
+ # Environment variables
17
+ ENV PYTHONDONTWRITEBYTECODE=1 \
18
+ PYTHONUNBUFFERED=1 \
19
+ PIP_NO_CACHE_DIR=1
20
 
21
+ # Set working directory
22
  WORKDIR /app
23
 
24
  # Install system dependencies
25
+ # - build-essential: For compiling Python packages
26
+ # - libopenblas-dev: For numerical operations (FAISS, numpy)
27
+ # - libomp-dev: For OpenMP (FAISS parallelization)
28
  RUN apt-get update && apt-get install -y --no-install-recommends \
29
  build-essential \
30
  libopenblas-dev \
31
  libomp-dev \
32
+ curl \
33
  && rm -rf /var/lib/apt/lists/*
34
 
35
+ # Copy requirements and install Python dependencies
36
  COPY requirements.txt .
37
  RUN pip install --no-cache-dir -r requirements.txt
38
 
39
+ # Copy application code
40
  COPY . .
41
 
42
+ # Expose port for API
43
  EXPOSE 8000
44
 
45
+ # Health check
46
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
47
+ CMD curl -f http://localhost:8000/health || exit 1
48
+
49
+ # Start application
50
+ # Configure via .env: LLM_BACKEND=ollama or LLM_BACKEND=hf
51
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
OPEN_WEBUI.md ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Using QModel v4 with Open-WebUI
2
+
3
+ QModel v4 is fully compatible with **Open-WebUI** thanks to its OpenAI-compatible API endpoints. This guide shows you how to integrate them.
4
+
5
+ ## Prerequisites
6
+
7
+ 1. **QModel running** on your local machine or server
8
+ ```bash
9
+ python main.py
10
+ # Runs on http://localhost:8000
11
+ ```
12
+
13
+ 2. **Open-WebUI installed** (Docker recommended)
14
+ ```bash
15
+ docker run -d -p 3000:8080 --name open-webui ghcr.io/open-webui/open-webui:latest
16
+ # Runs on http://localhost:3000
17
+ ```
18
+
19
+ ---
20
+
21
+ ## Integration Steps
22
+
23
+ ### Step 1: Add QModel as a Custom OpenAI-Compatible Model
24
+
25
+ In Open-WebUI:
26
+
27
+ 1. **Settings** → **Models** → **Manage Models**
28
+ 2. Click **"Connect to OpenAI-compatible API"**
29
+ 3. Enter:
30
+ - **API Base URL**: `http://localhost:8000/v1`
31
+ - **Model Name**: `QModel` (or `qmodel`)
32
+ - **API Key**: Leave blank (no auth required)
33
+
34
+ 4. Click **"Save & Test"**
35
+ 5. You should see: ✅ **Model connected successfully**
36
+
37
+ ### Step 2: Start Using QModel
38
+
39
+ 1. Open a **New Chat** in Open-WebUI
40
+ 2. Select **QModel** from the model dropdown
41
+ 3. Type your Islamic question:
42
+ ```
43
+ What does the Quran say about mercy?
44
+ ```
45
+
46
+ 4. Press Enter and get an Islamic-grounded RAG response with sources!
47
+
48
+ ---
49
+
50
+ ## API Endpoints (OpenAI-Compatible)
51
+
52
+ ### POST `/v1/chat/completions`
53
+ Standard OpenAI chat completions endpoint.
54
+
55
+ **Request:**
56
+ ```json
57
+ {
58
+ "model": "QModel",
59
+ "messages": [
60
+ {"role": "user", "content": "What does Islam say about patience?"}
61
+ ],
62
+ "temperature": 0.2,
63
+ "max_tokens": 2048,
64
+ "top_k": 5,
65
+ "stream": false
66
+ }
67
+ ```
68
+
69
+ **Response:**
70
+ ```json
71
+ {
72
+ "id": "qmodel-1234567890",
73
+ "object": "chat.completion",
74
+ "created": 1234567890,
75
+ "model": "QModel",
76
+ "choices": [
77
+ {
78
+ "index": 0,
79
+ "message": {
80
+ "role": "assistant",
81
+ "content": "Islam emphasizes patience as a core virtue..."
82
+ },
83
+ "finish_reason": "stop"
84
+ }
85
+ ],
86
+ "x_metadata": {
87
+ "language": "english",
88
+ "intent": "general",
89
+ "top_score": 0.876,
90
+ "latency_ms": 342,
91
+ "sources": [
92
+ {
93
+ "source": "Surah Al-Imran 3:200",
94
+ "type": "quran",
95
+ "grade": null,
96
+ "score": 0.876
97
+ }
98
+ ]
99
+ }
100
+ }
101
+ ```
102
+
103
+ ### GET `/v1/models`
104
+ List available models.
105
+
106
+ **Response:**
107
+ ```json
108
+ {
109
+ "object": "list",
110
+ "data": [
111
+ {
112
+ "id": "QModel",
113
+ "object": "model",
114
+ "created": 1234567890,
115
+ "owned_by": "elgendy"
116
+ }
117
+ ]
118
+ }
119
+ ```
120
+
121
+ ---
122
+
123
+ ## Advanced Query Parameters (Open-WebUI Compatible)
124
+
125
+ When using Open-WebUI, you can include special parameters:
126
+
127
+ ### Islamic-Specific Parameters
128
+
129
+ **URL Query String:**
130
+ ```
131
+ /v1/chat/completions?source_type=hadith&grade_filter=sahih&top_k=5
132
+ ```
133
+
134
+ **Supported Parameters:**
135
+ - `source_type`: `quran` | `hadith` | (both, default)
136
+ - `grade_filter`: `sahih` | `hasan` | (all, default)
137
+ - `top_k`: 1-20 (number of sources to retrieve)
138
+
139
+ ### Example Requests via curl
140
+
141
+ ```bash
142
+ # 1. Basic query (both Quran + Hadith)
143
+ curl -X POST http://localhost:8000/v1/chat/completions \
144
+ -H "Content-Type: application/json" \
145
+ -d '{
146
+ "model": "QModel",
147
+ "messages": [{"role": "user", "content": "What does Islam say about mercy?"}]
148
+ }'
149
+
150
+ # 2. Quran-only query
151
+ curl -X POST "http://localhost:8000/v1/chat/completions?source_type=quran" \
152
+ -H "Content-Type: application/json" \
153
+ -d '{
154
+ "model": "QModel",
155
+ "messages": [{"role": "user", "content": "What does the Quran say about patience?"}]
156
+ }'
157
+
158
+ # 3. Authenticated Hadiths only (Sahih grade)
159
+ curl -X POST "http://localhost:8000/v1/chat/completions?source_type=hadith&grade_filter=sahih" \
160
+ -H "Content-Type: application/json" \
161
+ -d '{
162
+ "model": "QModel",
163
+ "messages": [{"role": "user", "content": "Hadiths about prayer"}]
164
+ }'
165
+
166
+ # 4. Streaming response
167
+ curl -X POST http://localhost:8000/v1/chat/completions \
168
+ -H "Content-Type: application/json" \
169
+ -d '{
170
+ "model": "QModel",
171
+ "messages": [{"role": "user", "content": "Tell me about Zakat"}],
172
+ "stream": true
173
+ }'
174
+ ```
175
+
176
+ ---
177
+
178
+ ## Open-WebUI Features Supported
179
+
180
+ | Feature | Status | Notes |
181
+ |---------|--------|-------|
182
+ | **Chat** | ✅ Full support | Normal Q&A |
183
+ | **Streaming** | ✅ Supported | Set `stream: true` in request |
184
+ | **Context** | ✅ Multi-turn | Open-WebUI handles conversation history |
185
+ | **Temperature** | ✅ Configurable | Via Open-WebUI settings |
186
+ | **Token Limits** | ✅ Supported | Via `max_tokens` parameter |
187
+ | **Model List** | ✅ Available | Via `/v1/models` endpoint |
188
+ | **Source Attribution** | ✅ In metadata | Via `x_metadata.sources` |
189
+
190
+ ---
191
+
192
+ ## Custom System Prompts in Open-WebUI
193
+
194
+ To customize QModel for specific Islamic tasks, create a custom chatbot in Open-WebUI:
195
+
196
+ 1. **Home** → **+ New Chatbot**
197
+ 2. Configure:
198
+ - **Name**: "Islamic Scholar" (or your choice)
199
+ - **Model**: QModel
200
+ - **System Prompt**:
201
+ ```
202
+ You are an expert Islamic scholar specializing in Qur'an and Hadith.
203
+ Always cite sources exactly as provided.
204
+ Only answer from the provided Islamic context—never use outside knowledge.
205
+ If information is not in the dataset, say so clearly.
206
+ ```
207
+ - **Top K Sources**: 5
208
+ - **Temperature**: 0.1 (for consistency)
209
+
210
+ 3. **Save** and start chatting!
211
+
212
+ ---
213
+
214
+ ## Troubleshooting
215
+
216
+ ### Issue: "Failed to connect to QModel"
217
+
218
+ **Solutions:**
219
+ 1. Check QModel is running: `curl http://localhost:8000/health`
220
+ 2. Verify API Base URL is correct: `http://localhost:8000/v1`
221
+ 3. Check firewall: Port 8000 must be accessible
222
+ 4. Check logs: `python main.py` to see startup messages
223
+
224
+ ### Issue: "No sources in response"
225
+
226
+ **Solutions:**
227
+ 1. Check `/debug/scores` endpoint directly:
228
+ ```bash
229
+ curl "http://localhost:8000/debug/scores?q=patience&top_k=10"
230
+ ```
231
+ 2. Adjust `CONFIDENCE_THRESHOLD` in `.env` if retrievals are low-quality
232
+ 3. Try synonyms: "mercy" instead of "compassion"
233
+
234
+ ### Issue: "Assistant returns 'Not found'"
235
+
236
+ **This is expected behavior!** QModel has safety checks:
237
+ 1. If retrieval score is too low (< 0.30), it returns "not found"
238
+ 2. This prevents hallucinations
239
+ 3. Try more specific queries or adjust `CONFIDENCE_THRESHOLD`
240
+
241
+ ---
242
+
243
+ ## Configuration for Open-WebUI
244
+
245
+ ### Recommended Settings
246
+
247
+ For best results with Open-WebUI:
248
+
249
+ ```env
250
+ # More conservative (fewer hallucinations)
251
+ CONFIDENCE_THRESHOLD=0.40
252
+ TEMPERATURE=0.1
253
+ HADITH_BOOST=0.08
254
+
255
+ # More liberal (more answers, higher hallucination risk)
256
+ CONFIDENCE_THRESHOLD=0.20
257
+ TEMPERATURE=0.3
258
+ HADITH_BOOST=0.05
259
+ ```
260
+
261
+ ### Docker Compose Integration
262
+
263
+ To run both QModel and Open-WebUI together:
264
+
265
+ ```yaml
266
+ version: '3.8'
267
+ services:
268
+ qmodel:
269
+ build: .
270
+ ports:
271
+ - "8000:8000"
272
+ environment:
273
+ - LLM_BACKEND=ollama
274
+ - OLLAMA_HOST=http://ollama:11434
275
+ depends_on:
276
+ - ollama
277
+
278
+ ollama:
279
+ image: ollama/ollama:latest
280
+ ports:
281
+ - "11434:11434"
282
+
283
+ web-ui:
284
+ image: ghcr.io/open-webui/open-webui:latest
285
+ ports:
286
+ - "3000:8080"
287
+ depends_on:
288
+ - qmodel
289
+ ```
290
+
291
+ Run: `docker-compose up`
292
+
293
+ ---
294
+
295
+ ## Using QModel in Open-WebUI Workflows
296
+
297
+ ### Example 1: Islamic Q&A Chatbot
298
+
299
+ 1. Create chatbot with system prompt about Islamic knowledge
300
+ 2. Select QModel as backend
301
+ 3. Set temperature to 0.1 for consistency
302
+ 4. Enable web search toggle (optional, for cross-verification)
303
+
304
+ ### Example 2: Hadith Research Tool
305
+
306
+ 1. Create chatbot: "Hadith Researcher"
307
+ 2. System prompt:
308
+ ```
309
+ You are a Hadith researcher. For each query:
310
+ 1. Search authenticated Hadiths only (Sahih grade)
311
+ 2. Display the full text with authenticity grade
312
+ 3. Explain the Hadith's significance
313
+ 4. Always cite the collection and number
314
+ ```
315
+ 3. Enable grade filtering: `grade_filter=sahih`
316
+
317
+ ### Example 3: Qur'anic Study Assistant
318
+
319
+ 1. Create chatbot: "Qur'an Tafsir"
320
+ 2. Set `source_type=quran` in parameters
321
+ 3. System prompt focusing on Qur'anic interpretation
322
+ 4. Enable multi-turn for deeper exploration
323
+
324
+ ---
325
+
326
+ ## API Testing
327
+
328
+ ### Test with Open-WebUI's Developer Tools
329
+
330
+ 1. Open Open-WebUI console (F12)
331
+ 2. Go to **Network** tab
332
+ 3. Send a message to QModel
333
+ 4. Inspect the request/response to `/v1/chat/completions`
334
+
335
+ ### Test with cURL
336
+
337
+ ```bash
338
+ # 1. Health check
339
+ curl http://localhost:8000/health | jq
340
+
341
+ # 2. List models
342
+ curl http://localhost:8000/v1/models | jq
343
+
344
+ # 3. Simple chat
345
+ curl -X POST http://localhost:8000/v1/chat/completions \
346
+ -H "Content-Type: application/json" \
347
+ -d '{"model":"QModel","messages":[{"role":"user","content":"Assalam alaikum"}]}' | jq
348
+ ```
349
+
350
+ ---
351
+
352
+ ## Performance Tips
353
+
354
+ ### For Optimal Open-WebUI Experience
355
+
356
+ 1. **Use Ollama locally** for responsive chat (400-800ms per query)
357
+ 2. **Set `max_tokens=1024`** to avoid long waits
358
+ 3. **Use temperature=0.1** for reliable, consistent answers
359
+ 4. **Increase `CACHE_TTL`** for frequently asked questions
360
+ 5. **Reduce `TOP_K_SEARCH`** if queries are slow (default 20)
361
+
362
+ ---
363
+
364
+ ## Security Notes
365
+
366
+ ### For Production Deployments
367
+
368
+ 1. **Restrict CORS**: Set `ALLOWED_ORIGINS=your-domain.com` in `.env`
369
+ 2. **Use HTTPS**: Proxy through nginx with TLS
370
+ 3. **Rate limit**: Add rate limiting middleware (not in v4, but recommended)
371
+ 4. **Authentication**: Consider adding API key validation layer
372
+ 5. **Network**: Don't expose QModel directly to the internet without auth
373
+
374
+ ---
375
+
376
+ ## Support
377
+
378
+ - 📖 Full setup guide: See `SETUP.md`
379
+ - 🔍 Debugging: Use `/debug/scores` to inspect retrievals
380
+ - 💬 Questions about Open-WebUI: See https://docs.openwebui.com
381
+ - 🕌 Islamic knowledge: See `ARCHITECTURE.md` for system details
382
+
383
+ ---
384
+
385
+ **Happy chatting with QModel + Open-WebUI! 🕌**
README.md CHANGED
@@ -1,17 +1,295 @@
 
 
 
 
 
 
 
 
 
1
  ---
2
- license: mit
3
- language:
4
- - ar
5
- - en
6
- base_model:
7
- - Qwen/Qwen2.5-72B-Instruct
8
- pipeline_tag: question-answering
9
- tags:
10
- - queean
11
- - hadith
12
- - islam
13
- - abdullah
14
- - elgendy
15
- metrics:
16
- - accuracy
17
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QModel v4 — Islamic RAG System
2
+ **Specialized Qur'an & Hadith Knowledge System with Dual LLM Support**
3
+
4
+ > A production-ready Retrieval-Augmented Generation system specialized exclusively in authenticated Islamic knowledge. No hallucinations, no outside knowledge—only content from verified sources.
5
+
6
+ ![Version](https://img.shields.io/badge/version-4.0.0-blue)
7
+ ![Backend](https://img.shields.io/badge/backend-ollama%20%7C%20huggingface-green)
8
+ ![Status](https://img.shields.io/badge/status-production--ready-success)
9
+
10
  ---
11
+
12
+ ## Features
13
+
14
+ ### 📖 Qur'an Capabilities
15
+ - **Verse Lookup**: Find verses by topic or keyword
16
+ - **Word Frequency**: Count occurrences with Surah breakdown
17
+ - **Bilingual**: Full Arabic + English translation support
18
+ - **Tafsir Integration**: AI-powered contextual interpretation
19
+
20
+ ### 📚 Hadith Capabilities
21
+ - **Authenticity Verification**: Check if Hadith is in authenticated collections
22
+ - **Grade Display**: Show Sahih/Hasan/Da'if authenticity levels
23
+ - **Topic Search**: Find relevant Hadiths across 9 major collections
24
+ - **Collection Navigation**: Filter by Bukhari, Muslim, Abu Dawud, etc.
25
+
26
+ ### 🛡️ Safety Features
27
+ - **Confidence Gating**: Low-confidence queries return "not found" instead of guesses
28
+ - **Source Attribution**: Every answer cites exact verse/Hadith reference
29
+ - **Verbatim Quotes**: Text copied directly from data, never paraphrased
30
+ - **Anti-Hallucination**: Hardened prompts with few-shot "not found" examples
31
+
32
+ ### 🚀 Integration
33
+ - **OpenAI-Compatible API**: Use with Open-WebUI, Langchain, or any OpenAI client
34
+ - **OpenAI Schema**: Full support for `/v1/chat/completions` and `/v1/models`
35
+ - **Streaming Responses**: SSE streaming for long-form answers
36
+
37
+ ### ⚙️ Technical
38
+ - **Dual LLM Backend**: Ollama (dev) + HuggingFace (prod)
39
+ - **Hybrid Search**: Dense (FAISS) + Sparse (BM25) scoring
40
+ - **Async API**: FastAPI with async/await throughout
41
+ - **Caching**: TTL-based LRU cache for frequent queries
42
+ - **Scale**: 6,236 Quranic verses + 41,390 Hadiths indexed
43
+
44
+ ---
45
+
46
+ ## Quick Start (5 minutes)
47
+
48
+ ```bash
49
+ # 1. Install
50
+ git clone https://github.com/elgendy/QModel.git && cd QModel
51
+ python3 -m venv .venv && source .venv/bin/activate
52
+ pip install -r requirements.txt
53
+
54
+ # 2. Configure (choose one)
55
+ # For local development - Ollama:
56
+ export LLM_BACKEND=ollama
57
+ export OLLAMA_MODEL=llama2
58
+ # Make sure Ollama is running: ollama serve
59
+
60
+ # OR for production - HuggingFace:
61
+ export LLM_BACKEND=hf
62
+ export HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
63
+
64
+ # 3. Run
65
+ python main.py
66
+
67
+ # 4. Query
68
+ curl "http://localhost:8000/ask?q=What%20does%20Islam%20say%20about%20mercy?"
69
+ ```
70
+
71
+ API docs: http://localhost:8000/docs
72
+
73
+ ---
74
+
75
+ ## Example Queries
76
+
77
+ ```bash
78
+ # Basic question
79
+ curl "http://localhost:8000/ask?q=What%20does%20Islam%20say%20about%20mercy?"
80
+
81
+ # Word frequency
82
+ curl "http://localhost:8000/ask?q=How%20many%20times%20is%20mercy%20mentioned?"
83
+
84
+ # Authentic Hadiths only
85
+ curl "http://localhost:8000/ask?q=prayer&source_type=hadith&grade_filter=sahih"
86
+
87
+ # Verify Hadith
88
+ curl "http://localhost:8000/hadith/verify?q=Actions%20are%20judged%20by%20intentions"
89
+ ```
90
+
91
+ ---
92
+
93
+ ## Documentation
94
+
95
+ | Document | Purpose |
96
+ |----------|---------|
97
+ | **[SETUP.md](SETUP.md)** | Installation, configuration (both backends), API endpoints, examples |
98
+ | **[DOCKER.md](DOCKER.md)** | Docker deployment, production setup, troubleshooting |
99
+ | **[ARCHITECTURE.md](ARCHITECTURE.md)** | System design, data pipeline, core components |
100
+ | **[OPEN_WEBUI.md](OPEN_WEBUI.md)** | Integration with Open-WebUI chat interface |
101
+
102
+ ---
103
+
104
+ ## Key Decisions
105
+
106
+ ### Backend Selection
107
+ - **Ollama** — Fast setup, no GPU, great for development, `LLM_BACKEND=ollama`
108
+ - **HuggingFace** — Production-grade, better quality, GPU recommended, `LLM_BACKEND=hf`
109
+
110
+ Both are equally supported via the same `.env` configuration. Just set `LLM_BACKEND` and restart.
111
+
112
+ ### Data
113
+ - **47,626 documents**: 6,236 Quranic verses + 41,390 hadiths from 9 canonical collections
114
+ - **Pre-built**: `metadata.json` and `QModel.index` included, ready to use
115
+ - **Dual-language**: Arabic and English support
116
+
117
+ ---
118
+
119
+ ## Open-WebUI Integration
120
+
121
+ QModel integrates seamlessly with Open-WebUI for a chat interface:
122
+
123
+ ```bash
124
+ # Start QModel
125
+ python main.py
126
+
127
+ # Start Open-WebUI (Docker)
128
+ docker run -p 3000:8080 ghcr.io/open-webui/open-webui:latest
129
+
130
+ # In Open-WebUI: Settings → Models → Add OpenAI-compatible
131
+ # API Base: http://localhost:8000/v1
132
+ # Model: QModel
133
+ ```
134
+
135
+ See [OPEN_WEBUI.md](OPEN_WEBUI.md) for detailed integration guide.
136
+
137
+ ---
138
+
139
+ ## API Reference (Quick)
140
+
141
+ ### Main Query
142
+ ```
143
+ GET /ask?q=<question>&top_k=5&source_type=<quran|hadith>&grade_filter=<sahih|hasan>
144
+ ```
145
+
146
+ **Response includes:**
147
+ - AI-generated answer
148
+ - Listed sources with scores
149
+ - Language detection (Arabic/English)
150
+ - Query intent classification
151
+
152
+ ### Other Endpoints
153
+ - `GET /debug/scores?q=<question>&top_k=10` — Inspect raw retrieval scores
154
+ - `GET /hadith/verify?q=<hadith_text>` — Check hadith authenticity
155
+ - `POST /v1/chat/completions` — OpenAI-compatible endpoint
156
+ - `GET /health` — Health check
157
+
158
+ See [SETUP.md](SETUP.md) for full endpoint documentation.
159
+
160
+ ---
161
+
162
+ ## Configuration
163
+
164
+ All configuration via environment variables (no code changes needed):
165
+
166
+ ```bash
167
+ # Backend (required)
168
+ LLM_BACKEND=ollama # or: hf
169
+
170
+ # Ollama settings
171
+ OLLAMA_HOST=http://localhost:11434
172
+ OLLAMA_MODEL=llama2 # or: mistral, neural-chat
173
+
174
+ # HuggingFace settings
175
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
176
+ HF_DEVICE=auto # auto, cuda, or cpu
177
+
178
+ # Quality tuning
179
+ TEMPERATURE=0.2 # 0=deterministic, 1=creative
180
+ CONFIDENCE_THRESHOLD=0.30 # Min score for LLM call
181
+ TOP_K_RETURN=5 # Results per query
182
+ ```
183
+
184
+ See [SETUP.md](SETUP.md) for comprehensive configuration reference.
185
+
186
+ ---
187
+
188
+ ## Performance
189
+
190
+ | Operation | Time | Backend |
191
+ |-----------|------|---------|
192
+ | Query (cached) | ~50ms | Both |
193
+ | Query (Ollama) | 400-800ms | Ollama |
194
+ | Query (HF GPU) | 500-1500ms | CUDA |
195
+ | Query (HF CPU) | 2-5s | CPU |
196
+
197
+ ---
198
+
199
+ ## Deployment
200
+
201
+ ### Local Development
202
+ ```bash
203
+ python main.py
204
+ ```
205
+
206
+ ### Docker (with Ollama backend)
207
+ ```bash
208
+ docker-compose up
209
+ ```
210
+
211
+ ### Docker (with HuggingFace backend)
212
+ Set `LLM_BACKEND=hf` in `.env` then `docker-compose up`
213
+
214
+ See [DOCKER.md](DOCKER.md) for production deployment, troubleshooting, and advanced configuration.
215
+
216
+ ---
217
+
218
+ ## Data Sources
219
+
220
+ - **Qur'an**: [risan/quran-json](https://github.com/risan/quran-json) — 114 Surahs, 6,236 verses
221
+ - **Hadith**: [AhmedBaset/hadith-json](https://github.com/AhmedBaset/hadith-json) — 9 canonical collections, 41,390 hadiths
222
+
223
+ ---
224
+
225
+ ## Architecture Overview
226
+
227
+ ```
228
+ User Query
229
+     ↓
230
+ Query Rewriting & Intent Detection
231
+     ↓
232
+ Hybrid Search (FAISS dense + BM25 sparse)
233
+     ↓
234
+ Filtering & Ranking
235
+     ↓
236
+ Confidence Gate (skip LLM if low-scoring)
237
+     ↓
238
+ LLM Generation (Ollama or HuggingFace)
239
+     ↓
240
+ Formatted Response with Sources
241
+ ```
242
+
243
+ See [ARCHITECTURE.md](ARCHITECTURE.md) for detailed system design.
244
+
245
+ ---
246
+
247
+ ## Troubleshooting
248
+
249
+ | Issue | Solution |
250
+ |-------|----------|
251
+ | "Service is initialising" | Wait 60-90s for embeddings model to load |
252
+ | Low retrieval scores | Check `/debug/scores`, try synonyms, lower threshold |
253
+ | "Model not found" (HF) | Run `huggingface-cli login` |
254
+ | Out of memory | Use smaller model or CPU backend |
255
+ | No results | Verify data files exist: `metadata.json` and `QModel.index` |
256
+
257
+ See [SETUP.md](SETUP.md) and [DOCKER.md](DOCKER.md) for more detailed troubleshooting.
258
+
259
+ ---
260
+
261
+ ## What's New in v4
262
+
263
+ ✨ **Dual LLM Backend** — Ollama (dev) + HuggingFace (prod)
264
+ ✨ **Grade Filtering** — Return only Sahih/Hasan authenticated Hadiths
265
+ ✨ **Source Filtering** — Quran-only or Hadith-only queries
266
+ ✨ **Hadith Verification** — `/hadith/verify` endpoint
267
+ ✨ **Enhanced Frequency** — Word counts by Surah
268
+ ✨ **OpenAI Compatible** — Use with any OpenAI client
269
+ ✨ **Production Ready** — Structured logging, error handling, async throughout
270
+
271
+ ---
272
+
273
+ ## Next Steps
274
+
275
+ 1. **Get Started**: See [SETUP.md](SETUP.md)
276
+ 2. **Integrate with Open-WebUI**: See [OPEN_WEBUI.md](OPEN_WEBUI.md)
277
+ 3. **Deploy with Docker**: See [DOCKER.md](DOCKER.md)
278
+ 4. **Understand Architecture**: See [ARCHITECTURE.md](ARCHITECTURE.md)
279
+
280
+ ---
281
+
282
+ ## License
283
+
284
+ This project uses open-source data from:
285
+ - [Qur'an JSON](https://github.com/risan/quran-json) — Open source
286
+ - [Hadith API](https://github.com/AhmedBaset/hadith-json) — Open source
287
+
288
+ See individual repositories for license details.
289
+
290
+ ---
291
+
292
+ **Made with ❤️ for Islamic scholarship.**
293
+
294
+ Version 4.0.0 | March 2025 | Production-Ready
295
+
SETUP.md ADDED
@@ -0,0 +1,590 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QModel v4 Setup & Deployment Guide
2
+
3
+ ## Quick Start
4
+
5
+ ### 1. Prerequisites
6
+ - Python 3.10+
7
+ - 16 GB RAM minimum (for embeddings + LLM)
8
+ - GPU recommended for HuggingFace backend
9
+ - Ollama installed (for local development) OR internet access (for HuggingFace)
10
+
11
+ ### 2. Installation
12
+
13
+ ```bash
14
+ # Clone and enter project
15
+ cd /Users/elgendy/Projects/QModel
16
+
17
+ # Create virtual environment
18
+ python3 -m venv .venv
19
+ source .venv/bin/activate
20
+
21
+ # Install dependencies
22
+ pip install -r requirements.txt
23
+ ```
24
+
25
+ ### 3. Data & Index
26
+
27
+ The project includes pre-built data files:
28
+ - `metadata.json` — 47,626 documents (6,236 Quran verses + 41,390 hadiths from 9 canonical collections)
29
+ - `QModel.index` — FAISS search index (pre-generated)
30
+
31
+ If you need to rebuild the index after dataset changes:
32
+ ```bash
33
+ python build_index.py
34
+ ```
35
+
36
+ ---
37
+
38
+ ## Backend Configuration
39
+
40
+ QModel supports two LLM backends. Choose based on your environment:
41
+
42
+ | Backend | Pros | Cons | When to Use |
43
+ |---------|------|------|------------|
44
+ | **Ollama** (local) | Fast setup, no GPU needed, only a one-time local model pull, free | Smaller models, limited customization | Development, testing, resource-constrained |
45
+ | **HuggingFace** (remote) | Larger models, better quality, full control | Requires GPU or significant RAM, slower downloads | Production, high-quality responses |
46
+
47
+ ### LLM Backend Selection
48
+
49
+ **Option 1: Local Ollama (Development)**
50
+
51
+ For development, testing, and when you already have Ollama running locally:
52
+
53
+ ```bash
54
+ LLM_BACKEND=ollama
55
+ OLLAMA_HOST=http://localhost:11434
56
+ OLLAMA_MODEL=llama2 # or: mistral, neural-chat, orca-mini
57
+ ```
58
+
59
+ **Available Ollama Models:**
60
+ - `llama2` — Fast, good quality (default, recommended)
61
+ - `mistral` — Better Arabic support
62
+ - `neural-chat` — Good balance
63
+ - `openchat` — Good instruction following
64
+ - `orca-mini` — Lightweight
65
+
66
+ **Option 2: Remote HuggingFace (Production)**
67
+
68
+ For production deployments with better quality and control:
69
+
70
+ ```bash
71
+ LLM_BACKEND=hf
72
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct # Excellent Arabic support
73
+ HF_DEVICE=auto # auto | cuda | cpu
74
+ HF_MAX_NEW_TOKENS=2048
75
+ ```
76
+
77
+ **Recommended HuggingFace Models:**
78
+ - `Qwen/Qwen2-7B-Instruct` — Excellent Arabic, strong reasoning (default)
79
+ - `mistralai/Mistral-7B-Instruct-v0.2` — Very capable, fast
80
+ - `meta-llama/Llama-2-13b-chat-hf` — Larger, needs HF token
81
+
82
+ **Device Options:**
83
+ - `auto` — Auto-detect (GPU if available, else CPU)
84
+ - `cuda` — Force GPU (requires NVIDIA GPU)
85
+ - `cpu` — Force CPU (slower, but works everywhere)
86
+
87
+ ### Complete Environment Variables Reference
88
+
89
+ #### Backend Selection
90
+ | Variable | Default | Options | Example |
91
+ |----------|---------|---------|---------|
92
+ | `LLM_BACKEND` | `hf` | `ollama`, `hf` | `ollama` |
93
+
94
+ #### Ollama Backend
95
+ | Variable | Default | Description | Example |
96
+ |----------|---------|-------------|---------|
97
+ | `OLLAMA_HOST` | `http://localhost:11434` | Ollama server URL | `http://localhost:11434` |
98
+ | `OLLAMA_MODEL` | `llama2` | Model name | `mistral` |
99
+
100
+ #### HuggingFace Backend
101
+ | Variable | Default | Description | Example |
102
+ |----------|---------|-------------|---------|
103
+ | `HF_MODEL_NAME` | `Qwen/Qwen2-7B-Instruct` | Model ID | `Qwen/Qwen2-7B-Instruct` |
104
+ | `HF_DEVICE` | `auto` | Device to use | `cuda` |
105
+ | `HF_MAX_NEW_TOKENS` | `2048` | Max output length | `2048` |
106
+
107
+ #### Embedding & Data
108
+ | Variable | Default | Description |
109
+ |----------|---------|-------------|
110
+ | `EMBED_MODEL` | `intfloat/multilingual-e5-large` | Embedding model (keep default) |
111
+ | `FAISS_INDEX` | `QModel.index` | Index file path |
112
+ | `METADATA_FILE` | `metadata.json` | Dataset file |
113
+
114
+ #### Retrieval & Ranking
115
+ | Variable | Default | Range | Purpose |
116
+ |----------|---------|-------|---------|
117
+ | `TOP_K_SEARCH` | `20` | 5-100 | Candidate pool (⬆️ = slower but more coverage) |
118
+ | `TOP_K_RETURN` | `5` | 1-20 | Results shown to user |
119
+ | `RERANK_ALPHA` | `0.6` | 0.0-1.0 | Dense (0.6) vs Sparse (0.4) weighting |
120
+
121
+ #### Generation
122
+ | Variable | Default | Range | Purpose |
123
+ |----------|---------|-------|---------|
124
+ | `TEMPERATURE` | `0.2` | 0.0-1.0 | 0.0=deterministic, 1.0=creative (use 0.1-0.2 for religious) |
125
+ | `MAX_TOKENS` | `2048` | 512-4096 | Max response length |
126
+
127
+ #### Safety & Quality
128
+ | Variable | Default | Range | Purpose |
129
+ |----------|---------|-------|---------|
130
+ | `CONFIDENCE_THRESHOLD` | `0.30` | 0.0-1.0 | Min score to call LLM (⬆️ = fewer hallucinations) |
131
+ | `HADITH_BOOST` | `0.08` | 0.0-1.0 | Score boost for hadith on hadith queries |
132
+
133
+ #### Other Settings
134
+ | Variable | Default | Description |
135
+ |----------|---------|-------------|
136
+ | `CACHE_SIZE` | `512` | Query response cache entries |
137
+ | `CACHE_TTL` | `3600` | Cache expiry in seconds |
138
+ | `ALLOWED_ORIGINS` | `*` | CORS origins (use specific domains in production) |
139
+ | `MAX_EXAMPLES` | `3` | Few-shot examples in system prompt |
140
+
141
+ ### Configuration Examples
142
+
143
+ **Development (Ollama) - Recommended for getting started**
144
+ ```bash
145
+ LLM_BACKEND=ollama
146
+ OLLAMA_HOST=http://localhost:11434
147
+ OLLAMA_MODEL=llama2
148
+
149
+ EMBED_MODEL=intfloat/multilingual-e5-large
150
+ FAISS_INDEX=QModel.index
151
+ METADATA_FILE=metadata.json
152
+
153
+ TOP_K_SEARCH=20
154
+ TOP_K_RETURN=5
155
+ TEMPERATURE=0.2
156
+ CONFIDENCE_THRESHOLD=0.30
157
+ ALLOWED_ORIGINS=*
158
+ ```
159
+
160
+ **Production (HuggingFace + GPU) - Best quality, uses GPU**
161
+ ```bash
162
+ LLM_BACKEND=hf
163
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
164
+ HF_DEVICE=cuda
165
+
166
+ EMBED_MODEL=intfloat/multilingual-e5-large
167
+ FAISS_INDEX=QModel.index
168
+ METADATA_FILE=metadata.json
169
+
170
+ TOP_K_SEARCH=30 # More candidates for better quality
171
+ TOP_K_RETURN=5
172
+ TEMPERATURE=0.1 # More deterministic
173
+ CONFIDENCE_THRESHOLD=0.35
174
+ ALLOWED_ORIGINS=yourdomain.com,api.yourdomain.com
175
+ ```
176
+
177
+ **Production (HuggingFace + CPU) - CPU-only, slower but no GPU required**
178
+ ```bash
179
+ LLM_BACKEND=hf
180
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
181
+ HF_DEVICE=cpu
182
+
183
+ TEMPERATURE=0.1
184
+ MAX_TOKENS=1024 # Reduce for faster responses
185
+ CONFIDENCE_THRESHOLD=0.35
186
+ ```
187
+
188
+ ### Tuning Tips
189
+
190
+ **For Better Results:**
191
+ - Increase `TOP_K_SEARCH` (costs slightly more compute)
192
+ - Lower `CONFIDENCE_THRESHOLD` (may get some hallucinations)
193
+ - Use larger model with more parameters
194
+ - Set `TEMPERATURE=0.1` for most consistent answers
195
+
196
+ **For Faster Performance:**
197
+ - Lower `TOP_K_SEARCH` and `TOP_K_RETURN`
198
+ - Use Ollama backend (faster inference)
199
+ - Reduce `MAX_TOKENS`
200
+ - Set `HF_DEVICE` explicitly (`cuda` if a GPU is available) instead of `auto` to skip device auto-detection
201
+
202
+ **For More Accurate/Conservative Answers:**
203
+ - Increase `CONFIDENCE_THRESHOLD` (skip borderline queries)
204
+ - Lower `TEMPERATURE` (more deterministic)
205
+ - Use larger model (7B+ parameters)
206
+
207
+ **For CPU-Only (No GPU Available):**
208
+ - Use Ollama backend with `neural-chat` model
209
+ - Set `HF_DEVICE=cpu` if using HF
210
+ - Reduce `MAX_TOKENS` to 1024
211
+
212
+ ---
213
+
214
+ ## Running QModel
215
+
216
+ ### Step-by-Step: Starting the API
217
+
218
+ 1. **Create `.env` file**:
219
+ ```bash
220
+ cp .env.example .env
221
+ # Edit .env and choose your backend (see Configuration section above)
222
+ ```
223
+
224
+ 2. **Start the backend service**:
225
+
226
+ **If using Ollama:**
227
+ ```bash
228
+ # Terminal 1: Start Ollama daemon
229
+ ollama serve
230
+
231
+ # Terminal 2: Pull a model (first time only)
232
+ ollama pull llama2 # or: mistral, neural-chat
233
+ ```
234
+
235
+ **If using HuggingFace:**
236
+ - No separate service needed, models download automatically
237
+
238
+ 3. **Start QModel API**:
239
+ ```bash
240
+ python main.py
241
+ ```
242
+
243
+ API available at `http://localhost:8000`
244
+
245
+ View interactive docs: `http://localhost:8000/docs`
246
+
247
+ ### Docker Option
248
+
249
+ ```bash
250
+ # Configure your backend in .env (see Configuration section)
251
+ cp .env.example .env
252
+ nano .env # Choose LLM_BACKEND=ollama or hf
253
+
254
+ # Run with Docker Compose
255
+ docker-compose up
256
+ ```
257
+
258
+ For full Docker documentation (including production deployment, troubleshooting, and multi-container setup), see **[DOCKER.md](DOCKER.md)**.
259
+
260
+ ---
261
+
262
+ ## API Endpoints
263
+
264
+ ### Main Query Endpoint
265
+
266
+ ```bash
267
+ GET /ask?q=<question>&top_k=5&source_type=<filter>&grade_filter=<filter>
268
+ ```
269
+
270
+ **Parameters:**
271
+ - `q` (required): Your Islamic question
272
+ - `top_k`: Number of sources to retrieve (1-20, default: 5)
273
+ - `source_type`: Filter by source type
274
+ - `quran` — Quranic verses only
275
+ - `hadith` — Hadiths only
276
+ - `null` (default) — Both
277
+ - `grade_filter`: Filter Hadith by authenticity grade
278
+ - `sahih` — Only Sahih-graded Hadiths
279
+ - `hasan` — Sahih + Hasan
280
+ - `null` (default) — All grades
281
+
282
+ **Example Requests:**
283
+
284
+ ```bash
285
+ # General question
286
+ curl "http://localhost:8000/ask?q=What%20does%20Islam%20say%20about%20mercy?"
287
+
288
+ # Quran-only with word frequency
289
+ curl "http://localhost:8000/ask?q=How%20many%20times%20is%20mercy%20mentioned?&source_type=quran"
290
+
291
+ # Authentic Hadiths only
292
+ curl "http://localhost:8000/ask?q=Hadiths%20about%20prayer&source_type=hadith&grade_filter=sahih"
293
+ ```
294
+
295
+ **Response:**
296
+ ```json
297
+ {
298
+ "question": "What does Islam say about mercy?",
299
+ "answer": "Islam emphasizes mercy as a core value...",
300
+ "language": "english",
301
+ "intent": "general",
302
+ "analysis": null,
303
+ "sources": [
304
+ {
305
+ "source": "Surah Al-Baqarah 2:178",
306
+ "type": "quran",
307
+ "grade": null,
308
+ "arabic": "...",
309
+ "english": "...",
310
+ "_score": 0.876
311
+ }
312
+ ],
313
+ "top_score": 0.876,
314
+ "latency_ms": 342
315
+ }
316
+ ```
317
+
318
+ ---
319
+
320
+ ### Hadith Verification Endpoint
321
+
322
+ ```bash
323
+ GET /hadith/verify?q=<hadith_text>&collection=<filter>
324
+ ```
325
+
326
+ **Purpose:** Quick authenticity check for a Hadith
327
+
328
+ **Example:**
329
+ ```bash
330
+ curl "http://localhost:8000/hadith/verify?q=Actions%20are%20judged%20by%20intentions"
331
+ ```
332
+
333
+ **Response:**
334
+ ```json
335
+ {
336
+ "query": "Actions are judged by intentions",
337
+ "found": true,
338
+ "collection": "Sahih al-Bukhari",
339
+ "grade": "Sahih",
340
+ "reference": "Sahih al-Bukhari 1",
341
+ "arabic": "إنما الأعمال بالنيات",
342
+ "english": "Verily, actions are judged by intentions...",
343
+ "latency_ms": 156
344
+ }
345
+ ```
346
+
347
+ ---
348
+
349
+ ### Debug Endpoint
350
+
351
+ ```bash
352
+ GET /debug/scores?q=<question>&top_k=10
353
+ ```
354
+
355
+ **Purpose:** Inspect raw retrieval scores without LLM call. Use to calibrate `CONFIDENCE_THRESHOLD`.
356
+
357
+ **Example:**
358
+ ```bash
359
+ curl "http://localhost:8000/debug/scores?q=patience&top_k=10"
360
+ ```
361
+
362
+ **Response:**
363
+ ```json
364
+ {
365
+ "intent": "general",
366
+ "threshold": 0.3,
367
+ "results": [
368
+ {
369
+ "rank": 1,
370
+ "source": "Surah Al-Baqarah 2:45",
371
+ "type": "quran",
372
+ "grade": null,
373
+ "_dense": 0.8234,
374
+ "_sparse": 0.5421,
375
+ "_score": 0.7234
376
+ }
377
+ ]
378
+ }
379
+ ```
380
+
381
+ Use this to fine-tune `CONFIDENCE_THRESHOLD`. If queries you expect to work have `_score < threshold`, lower the threshold.
382
+
383
+ ---
384
+
385
+ ### Health & Metadata
386
+
387
+ ```bash
388
+ # Health check
389
+ curl http://localhost:8000/health
390
+
391
+ # List available models
392
+ curl http://localhost:8000/v1/models
393
+
394
+ # Interactive API docs
395
+ http://localhost:8000/docs
396
+ ```
397
+
398
+ ---
399
+
400
+ ## Query Examples
401
+
402
+ ### 1. Word Frequency Analysis
403
+
404
+ **Question:** "How many times is the word 'mercy' mentioned in the Quran?"
405
+
406
+ **System detects:** `intent=count`
407
+
408
+ **Response includes:**
409
+ ```json
410
+ {
411
+ "analysis": {
412
+ "keyword": "mercy",
413
+ "total_count": 87,
414
+ "by_surah": {
415
+ "2": {"name": "Al-Baqarah", "count": 12},
416
+ "7": {"name": "Al-A'raf", "count": 8},
417
+ ...
418
+ }
419
+ }
420
+ }
421
+ ```
422
+
423
+ ---
424
+
425
+ ### 2. Topic-Based Aya Retrieval
426
+
427
+ **Question:** "What does the Quran say about patience?"
428
+
429
+ **System detects:** `intent=tafsir`
430
+
431
+ **Response:**
432
+ - Retrieves top 5 verses about patience
433
+ - LLM explains each with Tafsir
434
+ - Shows interconnections between verses
435
+
436
+ ---
437
+
438
+ ### 3. Hadith Authentication
439
+
440
+ **Question:** "Is the Hadith 'Actions are judged by intentions' authentic?"
441
+
442
+ **System detects:** `intent=auth`
443
+
444
+ **LLM response:**
445
+ - "Yes, this is found in Sahih al-Bukhari 1"
446
+ - "Grade: Sahih (authentic)"
447
+ - "Explanation: This Hadith establishes the principle of intention..."
448
+
449
+ ---
450
+
451
+ ### 4. Bilingual Support
452
+
453
+ **Arabic Question:** "ما أهمية الصبر في الإسلام؟"
454
+
455
+ **System detects:** Language = arabic
456
+
457
+ **Response:** Full Arabic response with proper vocalization
458
+
459
+ ---
460
+
461
+ ## Tuning & Optimization
462
+
463
+ ### Confidence Threshold
464
+
465
+ The `CONFIDENCE_THRESHOLD` (default 0.30) controls when to call the LLM:
466
+
467
+ - **Too high (e.g., 0.70)**: Many queries rejected as "not found" (safer but less helpful)
468
+ - **Too low (e.g., 0.10)**: LLM called on weak matches (more hallucinations)
469
+ - **Sweet spot (0.30-0.50)**: Most queries get through, but low-quality matches rejected
470
+
471
+ **To calibrate:**
472
+ 1. Run `/debug/scores` on representative queries
473
+ 2. Check what `_score` values are returned
474
+ 3. Adjust `CONFIDENCE_THRESHOLD` in `.env`
475
+ 4. Restart service
476
+
477
+ ---
478
+
479
+ ### Temperature
480
+
481
+ - **0.0**: Deterministic (best for factual Islamic answers)
482
+ - **0.2**: Slightly creative (default)
483
+ - **0.5+**: More creative (not recommended for religious content)
484
+
485
+ ---
486
+
487
+ ### Model Selection
488
+
489
+ #### For Development (Ollama)
490
+ - **llama3.1** — Fastest, good quality, easy setup
491
+ - **mistral** — Better Arabic, slightly slower
492
+ - **neural-chat** — Good balance
493
+
494
+ ```bash
495
+ ollama pull llama3.1
496
+ OLLAMA_MODEL=llama3.1 python main.py
497
+ ```
498
+
499
+ #### For Production (HuggingFace)
500
+ - **Qwen/Qwen2-7B-Instruct** — Strong Arabic, 7B params
501
+ - **mistralai/Mistral-7B-Instruct-v0.2** — Very capable
502
+ - **meta-llama/Llama-2-13b-chat-hf** — Larger, better quality (requires HF token)
503
+
504
+ ```bash
505
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct python main.py
506
+ ```
507
+
508
+ ---
509
+
510
+ ## Troubleshooting
511
+
512
+ ### Issue: "Service is still initialising"
513
+
514
+ **Solution:** Wait 60-90 seconds for embedding model to load. Check logs:
515
+ ```bash
516
+ tail -f <logfile>
517
+ ```
518
+
519
+ ### Issue: Low retrieval scores
520
+
521
+ **Cause:** Query wording doesn't match the dataset's language or phrasing well
522
+
523
+ **Solution:**
524
+ 1. Check `/debug/scores` output
525
+ 2. Ensure query is in Arabic or clear English
526
+ 3. Try synonyms (e.g., "mercy" vs "compassion")
527
+ 4. Lower `CONFIDENCE_THRESHOLD` in `.env`
528
+
529
+ ### Issue: LLM model not found (HF backend)
530
+
531
+ **Solution:**
532
+ ```bash
533
+ huggingface-cli login
534
+ export HF_TOKEN=<your_token>
535
+ ```
536
+
537
+ ### Issue: Out of memory
538
+
539
+ **Solution:**
540
+ - Use `OLLAMA_MODEL=neural-chat` (smaller)
541
+ - Set `HF_DEVICE=cpu` (slower but uses RAM instead of VRAM)
542
+ - Reduce `TOP_K_SEARCH` in `.env`
543
+
544
+ ---
545
+
546
+ ## Production Checklist
547
+
548
+ - [ ] Test with at least 10 representative queries
549
+ - [ ] Verify `/debug/scores` on low-confidence queries
550
+ - [ ] Adjust `CONFIDENCE_THRESHOLD` to acceptable false-positive rate
551
+ - [ ] Set `ALLOWED_ORIGINS` to your domain only (security)
552
+ - [ ] Use production-grade LLM model (Qwen 7B+ or Mistral)
553
+ - [ ] Set `TEMPERATURE=0.1` for maximum consistency
554
+ - [ ] Monitor first 100 queries for quality
555
+ - [ ] Enable access logging and error tracking
556
+
557
+ ---
558
+
559
+ ## Architecture Files
560
+
561
+ - **main.py** — Core API + RAG pipeline (LLM backend abstraction, retrieval, generation)
562
+ - **build_index.py** — FAISS index generation from metadata
563
+ - **enrich_dataset.py** — Dataset enrichment script (fetch hadith collections, deduplicate)
564
+ - **metadata.json** — Combined dataset: 6,236 Quran verses + 41,390 hadiths
565
+ - **QModel.index** — FAISS vector index (pre-built, ready to use)
566
+ - **ARCHITECTURE.md** — Detailed system design
567
+ - **requirements.txt** — Python dependencies
568
+
569
+ ---
570
+
571
+ ## Next Steps
572
+
573
+ After setup, consider:
574
+ 1. Grade filtering: Try `?grade_filter=sahih` for authenticated-only results
575
+ 2. Source filtering: Use `?source_type=quran` vs `?source_type=hadith`
576
+ 3. Batch processing: Add endpoint for multiple questions
577
+ 4. Webhook integration: Stream answers as they generate
578
+ 5. Caching improvements: Persistent Redis cache for production
579
+
580
+ ---
581
+
582
+ ## Support
583
+
584
+ For issues:
585
+ 1. Check logs: `python main.py` (stdout)
586
+ 2. Test endpoints: http://localhost:8000/docs
587
+ 3. Review `/debug/scores` for retrieval quality
588
+ 4. Check `.env` configuration
589
+
590
+ Happy querying! 🕌
build_index.py CHANGED
@@ -1,69 +1,79 @@
 
 
 
 
 
 
1
  import json
2
- import time
3
  import numpy as np
 
4
  import faiss
5
  from sentence_transformers import SentenceTransformer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- # ── Config ─────────────────────────────────────────────────────────────────────
8
- EMBED_MODEL = "intfloat/multilingual-e5-large"
9
- BATCH_SIZE = 128 # Increase to 256 if you have ≥16 GB RAM and no GPU OOM
10
- SHOW_PROGRESS = True # tqdm progress bar per batch
11
-
12
- # ── Load model ─────────────────────────────────────────────────────────────────
13
- print(f"⏳ Loading model: {EMBED_MODEL}")
14
- t0 = time.perf_counter()
15
- model = SentenceTransformer(EMBED_MODEL)
16
- print(f"✅ Model loaded in {time.perf_counter()-t0:.1f}s")
17
-
18
- # ── Load data ──────────────────────────────────────────────────────────────────
19
- with open("data/quran.json", "r", encoding="utf-8") as f:
20
- quran = json.load(f)
21
- for item in quran:
22
- item["type"] = "quran"
23
-
24
- with open("data/hadith.json", "r", encoding="utf-8") as f:
25
- hadith = json.load(f)
26
- for item in hadith:
27
- item["type"] = "hadith"
28
-
29
- data = quran + hadith
30
- print(f"📊 Dataset: {len(quran):,} Quran verses + {len(hadith):,} Hadiths = {len(data):,} items")
31
-
32
- # ── Build text pairs ────────────────────────────────────────────────────────────
33
- # Each item → 2 texts (Arabic + English), indexed as item_idx * 2 and item_idx * 2 + 1
34
- texts = []
35
- for item in data:
36
- source = item.get("source") or item.get("reference") or ""
37
- texts.append(f"passage: {source} Arabic: {item['arabic']}")
38
- texts.append(f"passage: {source} English: {item['english']}")
39
-
40
- print(f"📝 Encoding {len(texts):,} texts (batch_size={BATCH_SIZE}) …")
41
- t1 = time.perf_counter()
42
-
43
- # ── Encode ───────────────────────────────────────────────────────────────��─────
44
- # show_progress_bar gives a tqdm bar so you can see throughput + ETA
45
- embeddings = model.encode(
46
- texts,
47
- batch_size=BATCH_SIZE,
48
- normalize_embeddings=True,
49
- show_progress_bar=SHOW_PROGRESS,
50
- convert_to_numpy=True,
51
- )
52
-
53
- elapsed = time.perf_counter() - t1
54
- rate = len(texts) / elapsed
55
- print(f"\n✅ Encoded {len(texts):,} texts in {elapsed:.0f}s ({rate:.0f} texts/sec)")
56
-
57
- # ── Build FAISS index ──────────────────────────────────────────────────────────
58
- print("🔨 Building FAISS index …")
59
- dim = embeddings.shape[1]
60
- index = faiss.IndexFlatIP(dim)
61
- index.add(embeddings.astype("float32")) # IP needs float32
62
- faiss.write_index(index, "QModel.index")
63
- print(f"✅ FAISS index saved (vectors: {index.ntotal:,}, dim: {dim})")
64
-
65
- # ── Save metadata ──────────────────────────────────────────────────────────────
66
- with open("metadata.json", "w", encoding="utf-8") as f:
67
- json.dump(data, f, ensure_ascii=False, indent=2)
68
- print("✅ metadata.json saved")
69
- print(f"\n🎉 Index built in {time.perf_counter()-t0:.0f}s total")
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Regenerate FAISS index with enriched metadata.
4
+ This script loads the enriched metadata and generates embeddings for all documents.
5
+ """
6
+
7
  import json
 
8
  import numpy as np
9
+ from pathlib import Path
10
  import faiss
11
  from sentence_transformers import SentenceTransformer
12
+ from tqdm import tqdm
13
+
14
def generate_embeddings(
    model_name: str = "intfloat/multilingual-e5-large",
    metadata_path: str = "metadata.json",
    index_path: str = "QModel.index",
) -> None:
    """Embed every document in *metadata_path* and write a FAISS index.

    Quran entries are embedded from surah name + English translation;
    hadith entries from collection + Arabic + English text.  Vectors are
    L2-normalised so the inner-product index behaves as cosine similarity.

    Args:
        model_name: SentenceTransformer model id to use for embeddings.
        metadata_path: Input JSON (as produced by enrich_dataset.py).
            Relative by default so the script runs on any machine
            (previously a hard-coded /Users/... path).
        index_path: Output FAISS index file.
    """
    metadata_file = Path(metadata_path)
    index_file = Path(index_path)

    # Load metadata
    print("Loading metadata...")
    with open(metadata_file, "r", encoding="utf-8") as f:
        documents = json.load(f)
    print(f"Total documents: {len(documents)}")

    # Load embedding model
    print(f"\nLoading embedding model: {model_name}")
    model = SentenceTransformer(model_name)
    embedding_dim = model.get_sentence_embedding_dimension()
    print(f"Embedding dimension: {embedding_dim}")

    # Prepare texts for embedding.  E5-family models expect a "passage: "
    # prefix on indexed documents (and "query: " at search time); the
    # previous index was built with this prefix, so keep it for retrieval
    # quality and consistency with main.py's query side.
    all_texts = []
    for doc in documents:
        if doc.get("type") == "quran":
            # For Quran: surah name + English meaning
            text = f"{doc.get('surah_name_en', '')} {doc.get('english', '')}"
        else:  # hadith
            # For Hadith: collection + Arabic + English for better matching
            text = f"{doc.get('collection', '')} {doc.get('arabic', '')} {doc.get('english', '')}"
        all_texts.append(f"passage: {text.strip()}")

    # Let sentence-transformers handle batching and progress reporting
    # instead of a manual batch loop.
    print(f"\nGenerating embeddings for {len(all_texts)} documents...")
    embeddings = model.encode(
        all_texts,
        batch_size=32,
        show_progress_bar=True,
        convert_to_numpy=True,
    ).astype(np.float32)
    print(f"Generated embeddings shape: {embeddings.shape}")

    # Build the FAISS index: inner product over L2-normalised vectors
    # is equivalent to cosine similarity.
    print("\nCreating FAISS index...")
    index = faiss.IndexFlatIP(embedding_dim)
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    # Save index
    print(f"Saving FAISS index to {index_file}")
    faiss.write_index(index, str(index_file))

    divider = "=" * 60
    print(f"\n{divider}")
    print("Index Generation Complete")
    print(divider)
    print(f"Documents indexed: {len(documents)}")
    print(f"Embeddings generated: {len(embeddings)}")
    print(f"Index file size: {index_file.stat().st_size / (1024*1024):.2f} MB")
    print(f"Index capacity: {index.ntotal}")
    print(divider)


if __name__ == "__main__":
    generate_embeddings()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docker-compose.yml CHANGED
@@ -1,16 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  services:
2
  qmodel:
3
  build: .
 
4
  ports:
5
  - "8000:8000"
6
  env_file:
7
  - .env
8
  environment:
9
- - HF_TOKEN=${HF_TOKEN}
10
- - OLLAMA_HOST=http://host.docker.internal:11434
 
 
11
  volumes:
 
12
  - .:/app
13
- # Restart policy
14
- restart: always
 
 
15
  extra_hosts:
 
16
  - "host.docker.internal:host-gateway"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QModel Docker Compose Configuration
2
+ # ====================================
3
+ # Configure via .env file:
4
+ # LLM_BACKEND=ollama (default: local Ollama on host machine)
5
+ # LLM_BACKEND=hf (HuggingFace backend)
6
+ #
7
+ # Usage:
8
+ # docker-compose up # Uses backend from .env
9
+ # docker-compose up -d # Run in background
10
+ # docker-compose logs -f # View logs
11
+ # docker-compose down # Stop services
12
+
13
+ version: "3.8"
14
+
15
  services:
16
  qmodel:
17
  build: .
18
+ container_name: qmodel-api
19
  ports:
20
  - "8000:8000"
21
  env_file:
22
  - .env
23
  environment:
24
+ # Pass through HF token if using HuggingFace backend
25
+ - HF_TOKEN=${HF_TOKEN:-}
26
+ # Ollama host: use Docker host IP for local Ollama
27
+ - OLLAMA_HOST=${OLLAMA_HOST:-http://host.docker.internal:11434}
28
  volumes:
29
+ # Mount current directory for live code changes (development)
30
  - .:/app
31
+ # Cache HuggingFace models to avoid re-downloading
32
+ - huggingface_cache:/root/.cache/huggingface
33
+ # Restart automatically if container exits
34
+ restart: on-failure:3
35
  extra_hosts:
36
+ # Allow container to reach host.docker.internal on Mac/Windows
37
  - "host.docker.internal:host-gateway"
38
+ networks:
39
+ - qmodel-network
40
+ # Health check for orchestration
41
+ healthcheck:
42
+ test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
43
+ interval: 30s
44
+ timeout: 10s
45
+ retries: 3
46
+ start_period: 60s
47
+
48
+ networks:
49
+ qmodel-network:
50
+ driver: bridge
51
+
52
+ volumes:
53
+ # Persistent cache for HuggingFace models
54
+ huggingface_cache:
enrich_dataset.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to enrich the QModel dataset with hadith collections from GitHub.
4
+ Fetches Musnad Ahmad and other major hadith collections from:
5
+ https://github.com/AhmedBaset/hadith-json/tree/main/db/by_book/the_9_books
6
+ """
7
+
8
+ import json
9
+ import requests
10
+ from typing import Dict, List
11
+ from collections import defaultdict
12
+
13
+ # The 9 canonical hadith books
14
+ HADITH_BOOKS = {
15
+ "ahmed.json": {
16
+ "collection": "Musnad Ahmad",
17
+ "id_prefix": "ahmad",
18
+ "grade": "Hasan/Sahih",
19
+ "author": "Imam Ahmad ibn Hanbal"
20
+ },
21
+ "bukhari.json": {
22
+ "collection": "Sahih al-Bukhari",
23
+ "id_prefix": "bukhari",
24
+ "grade": "Sahih",
25
+ "author": "Muhammad al-Bukhari"
26
+ },
27
+ "muslim.json": {
28
+ "collection": "Sahih Muslim",
29
+ "id_prefix": "muslim",
30
+ "grade": "Sahih",
31
+ "author": "Muslim ibn al-Hajjaj"
32
+ },
33
+ "abudawud.json": {
34
+ "collection": "Sunan Abu Dawood",
35
+ "id_prefix": "abudawud",
36
+ "grade": "Hasan",
37
+ "author": "Abu Dawood Sulaiman"
38
+ },
39
+ "tirmidhi.json": {
40
+ "collection": "Jami' at-Tirmidhi",
41
+ "id_prefix": "tirmidhi",
42
+ "grade": "Hasan",
43
+ "author": "Al-Tirmidhi"
44
+ },
45
+ "ibnmajah.json": {
46
+ "collection": "Sunan Ibn Majah",
47
+ "id_prefix": "ibnmajah",
48
+ "grade": "Hasan",
49
+ "author": "Ibn Majah al-Qazwini"
50
+ },
51
+ "nasai.json": {
52
+ "collection": "Sunan an-Nasai",
53
+ "id_prefix": "nasai",
54
+ "grade": "Sahih",
55
+ "author": "Ahmad al-Nasai"
56
+ },
57
+ "malik.json": {
58
+ "collection": "Muwatta Malik",
59
+ "id_prefix": "malik",
60
+ "grade": "Sahih",
61
+ "author": "Malik ibn Anas"
62
+ },
63
+ "darimi.json": {
64
+ "collection": "Sunan al-Darimi",
65
+ "id_prefix": "darimi",
66
+ "grade": "Hasan",
67
+ "author": "Al-Darimi"
68
+ }
69
+ }
70
+
71
+ BASE_URL = "https://raw.githubusercontent.com/AhmedBaset/hadith-json/main/db/by_book/the_9_books"
72
+
73
+
74
def fetch_hadith_book(filename: str) -> Dict:
    """Download one hadith book JSON from the hadith-json GitHub repository.

    Args:
        filename: Book file name, e.g. "bukhari.json".

    Returns:
        The parsed JSON payload of the book.

    Raises:
        requests.HTTPError: If the download fails (non-2xx response).
    """
    # Fixed garbled "(unknown)" placeholders: interpolate the actual
    # filename into both the URL and the progress message.
    url = f"{BASE_URL}/{filename}"
    print(f"Fetching {filename}...")
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.json()
81
+
82
+
83
def transform_hadith(hadith: Dict, book_config: Dict, book_data: Dict) -> Dict:
    """Convert a hadith record from the GitHub JSON layout into our metadata schema."""

    # Resolve the Arabic chapter title, if the hadith points at one.
    chapter_name = ""
    if "chapterId" in hadith:
        wanted = hadith.get("chapterId")
        chapter_name = next(
            (
                ch.get("arabic", "")
                for ch in book_data.get("chapters", [])
                if ch.get("id") == wanted
            ),
            "",
        )

    # Human-readable citation, e.g. "Sahih al-Bukhari 1".
    hadith_num = hadith.get("idInBook", hadith.get("id", ""))
    reference = f"{book_config['collection']} {hadith_num}"

    # English text may be a {narrator, text} object or a plain string.
    eng = hadith.get("english")
    if isinstance(eng, dict):
        english = " ".join(eng[key] for key in ("narrator", "text") if eng.get(key))
    else:
        english = str(hadith.get("english", ""))

    return {
        "id": f"{book_config['id_prefix']}_{hadith_num}",
        "arabic": hadith.get("arabic", ""),
        "english": english,
        "reference": reference,
        "hadith_number": hadith_num,
        "collection": book_config["collection"],
        "chapter": chapter_name,
        "grade": "",  # Will be inferred by main.py's infer_hadith_grade()
        "type": "hadith",
        "author": book_config["author"],
    }
121
+
122
+
123
def load_existing_metadata(filepath: str) -> List[Dict]:
    """Read and parse the existing metadata JSON file at *filepath*."""
    print(f"Loading existing metadata from {filepath}...")
    with open(filepath, 'r', encoding='utf-8') as fh:
        return json.load(fh)
128
+
129
+
130
def save_enriched_metadata(filepath: str, data: List[Dict], stats: Dict) -> None:
    """Write the merged dataset to *filepath* and print a per-collection summary."""
    print(f"Saving enriched metadata to {filepath}...")
    with open(filepath, 'w', encoding='utf-8') as fh:
        json.dump(data, fh, ensure_ascii=False, indent=2)

    divider = "=" * 60
    print("\n" + divider)
    print("Dataset Enrichment Summary")
    print(divider)
    print(f"Total documents: {len(data)}")
    print(f"\nBreakdown by collection:")
    for name, count in sorted(stats.items()):
        print(f"  {name}: {count}")
    print(divider)
144
+
145
+
146
def main():
    """Main enrichment process: fetch each hadith book, merge new entries, save.

    Loads the current metadata, seeds per-collection statistics, downloads
    each of the 9 canonical books, deduplicates against existing IDs, and
    writes the merged dataset back to metadata.json.
    """
    # Relative path so the script works from the project root on any
    # machine (previously a hard-coded /Users/... path).
    metadata_path = "metadata.json"
    existing_data = load_existing_metadata(metadata_path)

    # Track which hadith IDs we already have, for deduplication.
    existing_ids = {item["id"] for item in existing_data if item.get("type") == "hadith"}
    print(f"Existing hadith entries: {len(existing_ids)}")

    new_hadiths = []
    stats = defaultdict(int)

    # Seed stats from what is already in the dataset.
    for item in existing_data:
        if item.get("type") == "quran":
            stats["Quran"] += 1
        elif item.get("type") == "hadith":
            stats[item.get("collection", "Unknown")] += 1

    # Fetch and process each hadith book; one failing book does not abort
    # the rest (best-effort, logged per file).
    for filename, book_config in HADITH_BOOKS.items():
        try:
            book_data = fetch_hadith_book(filename)
            hadiths = book_data.get("hadiths", [])

            skipped = 0
            added = 0
            for hadith in hadiths:
                transformed = transform_hadith(hadith, book_config, book_data)
                if transformed["id"] in existing_ids:
                    skipped += 1
                    continue
                new_hadiths.append(transformed)
                existing_ids.add(transformed["id"])
                added += 1

            stats[book_config["collection"]] += added
            # Fixed garbled "(unknown)" placeholder: report the actual file.
            print(f"  ✓ {filename}: {added} new hadiths added, {skipped} already exist")

        except Exception as e:
            print(f"  ✗ Error fetching {filename}: {e}")

    # Merge and persist.
    enriched_data = existing_data + new_hadiths
    print(f"\nTotal new hadiths added: {len(new_hadiths)}")
    print(f"Total documents after enrichment: {len(enriched_data)}")
    save_enriched_metadata(metadata_path, enriched_data, stats)


if __name__ == "__main__":
    main()
main.py CHANGED
@@ -1,15 +1,23 @@
1
  """
2
- QModel v3.1 — Islamic RAG API
3
- Fixes over v3:
4
- Confidence gate: blocks LLM call when top retrieval score is too low →
5
- returns a safe "not in dataset" answer instead of hallucinating
6
- • Hardened anti-hallucination prompt: explicit rule against reconstructing
7
- or completing any Hadith from memory; citation must match context verbatim
8
- Hadith type-boost: intent=hadith raises _score for Hadith items so they
9
- are not outranked by Quran verses on Hadith-specific queries
10
- top_score exposed in AskResponse and /v1/chat/completions x_metadata
11
- so callers can implement their own confidence thresholds
12
- Few-shot example updated to show a correct "not found" refusal path
 
 
 
 
 
 
 
 
13
  """
14
 
15
  from __future__ import annotations
@@ -23,14 +31,14 @@ import re
23
  import time
24
  from collections import Counter, OrderedDict
25
  from contextlib import asynccontextmanager
26
- from typing import Dict, List, Optional
27
 
28
  import faiss
29
  import numpy as np
30
  from dotenv import load_dotenv
31
  from fastapi import FastAPI, HTTPException, Query
32
  from fastapi.middleware.cors import CORSMiddleware
33
- import ollama
34
  from pydantic import BaseModel, Field, validator
35
  from sentence_transformers import SentenceTransformer
36
 
@@ -47,42 +55,175 @@ logger = logging.getLogger("qmodel")
47
 
48
 
49
  # ═══════════════════════════════════════════════════════════════════════
50
- # CONFIG
51
  # ═══════════════════════════════════════════════════════════════════════
52
  class Config:
 
 
 
 
 
 
 
 
 
 
 
53
  OLLAMA_HOST: str = os.getenv("OLLAMA_HOST", "http://localhost:11434")
54
- LLM_MODEL: str = os.getenv("LLM_MODEL", "minimax-m2.7:cloud")
55
- EMBED_MODEL: str = os.getenv("EMBED_MODEL", "intfloat/multilingual-e5-large")
56
- FAISS_INDEX: str = os.getenv("FAISS_INDEX", "QModel.index")
57
- METADATA_FILE: str = os.getenv("METADATA_FILE", "metadata.json")
58
- TOP_K_SEARCH: int = int(os.getenv("TOP_K_SEARCH", 20))
59
- TOP_K_RETURN: int = int(os.getenv("TOP_K_RETURN", 5))
60
- MAX_TOKENS: int = int(os.getenv("MAX_TOKENS", 2048))
61
- TEMPERATURE: float = float(os.getenv("TEMPERATURE", 0.2))
62
- CACHE_SIZE: int = int(os.getenv("CACHE_SIZE", 512))
63
- CACHE_TTL: int = int(os.getenv("CACHE_TTL", 3600))
64
- RERANK_ALPHA: float = float(os.getenv("RERANK_ALPHA", 0.6))
65
- ALLOWED_ORIGINS: str = os.getenv("ALLOWED_ORIGINS", "*")
66
- MAX_EXAMPLES: int = int(os.getenv("MAX_EXAMPLES", 3))
67
- # ── NEW: minimum hybrid score to allow an LLM answer ──────────────
68
- # Below this threshold the pipeline returns a safe "not found" reply
69
- # without calling the LLM at all, preventing hallucination.
70
- # Tune upward (e.g. 0.70) to be stricter; downward (0.50) to be looser.
71
- CONFIDENCE_THRESHOLD: float = float(os.getenv("CONFIDENCE_THRESHOLD", 0.30))
72
- # ── NEW: score bonus applied to Hadith items when intent == "hadith"
73
- # Prevents Quran verses from outranking relevant Hadiths on Sunnah queries.
 
 
 
74
  HADITH_BOOST: float = float(os.getenv("HADITH_BOOST", 0.08))
75
 
 
 
 
 
 
 
 
 
 
76
  cfg = Config()
77
 
78
- # ─────────────────────────────────────────────────────────────────────────────
79
- # MODEL FALLBACK CHAIN
80
- # ─────────────────────────────────────────────────────────────────────────────
81
- _FALLBACK_MODELS: List[str] = [
82
- "minimax-m2.7:cloud", # primary — 14 GB, best quality
83
- "gavtoken/minimax:latest", # cloud fallback — strong Arabic
84
- "llama3.1:latest", # local fallback — 4.9 GB
85
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
 
88
  # ═══════════════════════════════════════════════════════════════════════
@@ -126,43 +267,10 @@ analysis_cache = TTLCache(maxsize=cfg.CACHE_SIZE, ttl=cfg.CACHE_TTL)
126
  rewrite_cache = TTLCache(maxsize=cfg.CACHE_SIZE, ttl=cfg.CACHE_TTL * 6)
127
 
128
 
129
- # ═══════════════════════════════════════════════════════════════════════
130
- # RESILIENT LLM CALLER — auto-fallback across Ollama models
131
- # ═══════════════════════════════════════════════════════════════════════
132
- def chat_with_fallback(
133
- messages: List[dict],
134
- max_tokens: int = cfg.MAX_TOKENS,
135
- temperature: float = cfg.TEMPERATURE,
136
- ) -> str:
137
- primary = cfg.LLM_MODEL
138
- models = [primary] + [m for m in _FALLBACK_MODELS if m != primary]
139
-
140
- last_err: Exception = RuntimeError("No Ollama models available")
141
- for model in models:
142
- try:
143
- logger.info("LLM → %s (Ollama)", model)
144
- client = ollama.Client(host=cfg.OLLAMA_HOST)
145
- response = client.chat(
146
- model=model,
147
- messages=messages,
148
- options={"num_predict": max_tokens, "temperature": temperature},
149
- )
150
- content = response["message"]["content"].strip()
151
- if content:
152
- if model != primary:
153
- logger.warning("Fell back to: %s", model)
154
- return content
155
- except Exception as exc:
156
- logger.error("Skip %s — %s", model, exc)
157
- last_err = exc
158
-
159
- raise RuntimeError(f"All LLM models failed. Last error: {last_err}")
160
-
161
-
162
  # ═══════════════════════════════════════════════════════════════════════
163
  # ARABIC NLP — normalisation + light stemming
164
  # ═══════════════════════════════════════════════════════════════════════
165
- _DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0671\u0653\u0654\u0655]")
166
  _ALEF_VARS = re.compile(r"[أإآٱ]")
167
  _WAW_HAMZA = re.compile(r"ؤ")
168
  _YA_HAMZA = re.compile(r"ئ")
@@ -177,12 +285,11 @@ _SPELLING_MAP: Dict[str, str] = {
177
  "قران": "قرآن",
178
  "القران": "القرآن",
179
  "اللہ": "الله",
180
- "الرّحمن": "الرحمن",
181
- "محمّد": "محمد",
182
  }
183
 
184
 
185
  def normalize_arabic(text: str, *, aggressive: bool = False) -> str:
 
186
  text = _DIACRITICS.sub("", text)
187
  text = _TATWEEL.sub("", text)
188
  text = _ALEF_VARS.sub("ا", text)
@@ -207,12 +314,14 @@ _AR_SUFFIXES = re.compile(
207
 
208
 
209
  def light_stem(word: str) -> str:
 
210
  w = _AR_PREFIXES.sub("", word)
211
  w = _AR_SUFFIXES.sub("", w)
212
  return w if len(w) >= 2 else word
213
 
214
 
215
  def tokenize_ar(text: str) -> List[str]:
 
216
  norm = normalize_arabic(text, aggressive=True).lower()
217
  return [light_stem(t) for t in norm.split() if t]
218
 
@@ -225,7 +334,8 @@ _ARABIC_SCRIPT = re.compile(
225
  )
226
 
227
 
228
- def detect_language(text: str) -> str:
 
229
  ar = len(_ARABIC_SCRIPT.findall(text))
230
  en = len(re.findall(r"[a-zA-Z]", text))
231
  tot = ar + en or 1
@@ -238,6 +348,7 @@ def detect_language(text: str) -> str:
238
 
239
 
240
  def language_instruction(lang: str) -> str:
 
241
  return {
242
  "arabic": (
243
  "يجب أن تكون الإجابة كاملةً باللغة العربية الفصحى تماماً. "
@@ -263,19 +374,26 @@ Reply ONLY with a valid JSON object — no markdown, no preamble:
263
  "ar_query": "<query in clear Arabic فصحى, ≤25 words>",
264
  "en_query": "<query in clear English, ≤25 words>",
265
  "keywords": ["<3-7 key Arabic or English terms from the question>"],
266
- "intent": "<one of: fatwa | tafsir | hadith | count | general>"
267
  }
268
 
269
- Rules:
270
- - Fix spelling errors (e.g. "quran" "Quran", ران" "قرآن").
271
- - Expand abbreviations (e.g. "pbuh" "peace be upon him / صلى الله عليه وسلم").
272
- - Do NOT answer the question only rephrase it for search.
273
- - 'count' intent = user wants the frequency or number of occurrences of a word/name.
274
- - 'hadith' intent = user quotes or asks about a specific Hadith text or its authenticity.
 
 
 
 
 
 
275
  """
276
 
277
 
278
- async def rewrite_query(raw: str) -> Dict:
 
279
  cached = await rewrite_cache.get(raw)
280
  if cached:
281
  return cached
@@ -287,7 +405,7 @@ async def rewrite_query(raw: str) -> Dict:
287
  "intent": "general",
288
  }
289
  try:
290
- text = chat_with_fallback(
291
  messages=[
292
  {"role": "system", "content": _REWRITE_SYSTEM},
293
  {"role": "user", "content": raw},
@@ -300,9 +418,7 @@ async def rewrite_query(raw: str) -> Dict:
300
  for k in ("ar_query", "en_query", "keywords", "intent"):
301
  result.setdefault(k, fallback[k])
302
  await rewrite_cache.set(result, raw)
303
- logger.info(
304
- "Rewrite: intent=%s ar=%s", result["intent"], result["ar_query"][:60]
305
- )
306
  return result
307
  except Exception as exc:
308
  logger.warning("Query rewrite failed (%s) — using fallback", exc)
@@ -310,7 +426,7 @@ async def rewrite_query(raw: str) -> Dict:
310
 
311
 
312
  # ═══════════════════════════════════════════════════════════════════════
313
- # INTENT DETECTION (frequency / count queries)
314
  # ═══════════════════════════════════════════════════════════════════════
315
  _COUNT_EN = re.compile(
316
  r"\b(how many|count|number of|frequency|occurrences? of|how often|"
@@ -321,15 +437,17 @@ _COUNT_AR = re.compile(
321
  r"(كم مرة|كم عدد|كم تكرر|عدد مرات|تكرار|كم ذُكر|كم وردت?)"
322
  )
323
 
324
- _INTENT_SYSTEM = """\
325
- You are an intent classifier for an Islamic Q&A system.
326
- Determine if the query asks for the COUNT or FREQUENCY of a specific word or name.
327
- Reply ONLY with valid JSON, no markdown:
328
- {"analysis": true, "keyword": "<exact Arabic or English word to count>"} or {"analysis": false}
329
- """
 
330
 
331
 
332
  async def detect_analysis_intent(query: str, rewrite: Dict) -> Optional[str]:
 
333
  if rewrite.get("intent") == "count":
334
  kws = rewrite.get("keywords", [])
335
  return kws[0] if kws else None
@@ -337,27 +455,13 @@ async def detect_analysis_intent(query: str, rewrite: Dict) -> Optional[str]:
337
  if not (_COUNT_EN.search(query) or _COUNT_AR.search(query)):
338
  return None
339
 
340
- try:
341
- raw = chat_with_fallback(
342
- messages=[
343
- {"role": "system", "content": _INTENT_SYSTEM},
344
- {"role": "user", "content": query},
345
- ],
346
- max_tokens=60,
347
- temperature=0.0,
348
- )
349
- raw = re.sub(r"```(?:json)?\n?|\n?```", "", raw).strip()
350
- res = json.loads(raw)
351
- if res.get("analysis"):
352
- return res.get("keyword")
353
- except Exception as exc:
354
- logger.warning("Intent detection failed (%s) — heuristic fallback", exc)
355
- for pat in (_COUNT_EN, _COUNT_AR):
356
- m = pat.search(query)
357
- if m:
358
- tail = query[m.end():].strip().split()
359
- if tail:
360
- return tail[-1]
361
  return None
362
 
363
 
@@ -365,6 +469,7 @@ async def detect_analysis_intent(query: str, rewrite: Dict) -> Optional[str]:
365
  # OCCURRENCE ANALYSIS (exact + stemmed matching)
366
  # ═══════════════════════════════════════════════════════════════════════
367
  async def count_occurrences(keyword: str, dataset: list) -> dict:
 
368
  cached = await analysis_cache.get(keyword)
369
  if cached:
370
  return cached
@@ -372,27 +477,41 @@ async def count_occurrences(keyword: str, dataset: list) -> dict:
372
  kw_norm = normalize_arabic(keyword, aggressive=True).lower()
373
  kw_stem = light_stem(kw_norm)
374
  count = 0
 
375
  examples: list = []
376
 
377
  for item in dataset:
378
- ar_norm = normalize_arabic(item.get("arabic", ""), aggressive=True)
 
 
 
379
  combined = f"{ar_norm} {item.get('english', '')}".lower()
380
  exact = combined.count(kw_norm)
381
  stemmed = combined.count(kw_stem) - exact if kw_stem != kw_norm else 0
382
  occ = exact + stemmed
 
383
  if occ > 0:
384
  count += occ
 
 
 
 
 
 
 
 
385
  if len(examples) < cfg.MAX_EXAMPLES:
386
  examples.append({
387
- "arabic": item.get("arabic", ""),
 
388
  "english": item.get("english", ""),
389
- "source": item.get("source") or item.get("reference", ""),
390
  })
391
 
392
  result = {
393
  "keyword": keyword,
394
  "kw_stemmed": kw_stem,
395
  "total_count": count,
 
396
  "examples": examples,
397
  }
398
  await analysis_cache.set(result, keyword)
@@ -400,7 +519,7 @@ async def count_occurrences(keyword: str, dataset: list) -> dict:
400
 
401
 
402
  # ═══════════════════════════════════════════════════════════════════════
403
- # HYBRID SEARCH — dense FAISS + BM25 re-ranking + hadith type-boost
404
  # ═══════════════════════════════════════════════════════════════════════
405
  def _bm25_score(
406
  query_terms: List[str],
@@ -409,6 +528,7 @@ def _bm25_score(
409
  k1: float = 1.5,
410
  b: float = 0.75,
411
  ) -> float:
 
412
  doc_tokens = tokenize_ar(doc_text)
413
  dl = len(doc_tokens)
414
  tf = Counter(doc_tokens)
@@ -426,8 +546,12 @@ async def hybrid_search(
426
  index: faiss.Index,
427
  dataset: list,
428
  top_n: int = cfg.TOP_K_RETURN,
 
 
429
  ) -> list:
430
- cached = await search_cache.get(raw_query, top_n)
 
 
431
  if cached:
432
  return cached
433
 
@@ -444,14 +568,29 @@ async def hybrid_search(
444
 
445
  distances, indices = index.search(fused.reshape(1, -1), cfg.TOP_K_SEARCH)
446
 
447
- # ��─ 2. De-duplicate candidates ─────────────────────────────────────
448
  seen: set = set()
449
  candidates = []
450
  for dist, idx in zip(distances[0], indices[0]):
451
  item_idx = int(idx) // 2
452
  if item_idx not in seen and 0 <= item_idx < len(dataset):
453
  seen.add(item_idx)
454
- candidates.append({**dataset[item_idx], "_dense": float(dist)})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
 
456
  # ── 3. BM25 sparse scoring ─────────────────────────────────────────
457
  query_terms = [
@@ -466,16 +605,32 @@ async def hybrid_search(
466
  doc = c.get("arabic", "") + " " + c.get("english", "")
467
  c["_sparse"] = _bm25_score(query_terms, doc, avg_dl)
468
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  # ── 4. Score fusion ────────────────────────────────────────────────
470
  α = cfg.RERANK_ALPHA
471
- max_sparse = max((c["_sparse"] for c in candidates), default=1.0) or 1.0
472
  intent = rewrite.get("intent", "general")
473
 
 
 
 
 
 
 
474
  for c in candidates:
475
  base_score = α * c["_dense"] + (1 - α) * c["_sparse"] / max_sparse
476
- # ── FIX: boost Hadith items when the query is about a Hadith ──
477
- # This prevents Quran verses from always outranking Hadiths on
478
- # Sunnah-specific queries purely due to embedding distance.
479
  if intent == "hadith" and c.get("type") == "hadith":
480
  base_score += cfg.HADITH_BOOST
481
  c["_score"] = base_score
@@ -483,21 +638,20 @@ async def hybrid_search(
483
  candidates.sort(key=lambda x: x["_score"], reverse=True)
484
  results = candidates[:top_n]
485
 
486
- await search_cache.set(results, raw_query, top_n)
487
  return results
488
 
489
 
490
- def build_context(results: list, intent: str = "general") -> str:
 
491
  lines = []
492
  for i, r in enumerate(results, 1):
493
  source = r.get("source") or r.get("reference") or "Unknown Source"
494
- item_type = (
495
- "Quranic Verse"
496
- if re.search(r"سورة|surah|quran", source, re.I)
497
- else "Hadith"
498
- )
499
  lines.append(
500
- f"[{i}] 📌 {item_type} | {source} | score: {r.get('_score', 0):.3f}\n"
501
  f" Arabic : {r.get('arabic', '')}\n"
502
  f" English: {r.get('english', '')}"
503
  )
@@ -505,57 +659,60 @@ def build_context(results: list, intent: str = "general") -> str:
505
 
506
 
507
  # ═══════════════════════════════════════════════════════════════════════
508
- # PROMPT ENGINEERING — intent-aware, chain-of-thought, few-shot
509
  # ═════════════════════════════════════════════��═════════════════════════
510
  _PERSONA = (
511
- "You are Sheikh QModel, a meticulous Islamic scholar-assistant with deep expertise "
512
- "in Tafsir (Quranic exegesis), Hadith sciences, Fiqh, and Arabic linguistics. "
513
- "You respond with the rigour of a classical scholar and the clarity of a modern educator."
514
  )
515
 
516
  _TASK_INSTRUCTIONS: Dict[str, str] = {
517
  "tafsir": (
518
- "The user asks about a Quranic verse or its interpretation. Steps:\n"
519
- "1. Identify the verse(s) from the context below.\n"
520
- "2. Provide in-depth Tafsir: linguistic analysis, occasion of revelation "
521
- "(Asbab al-Nuzul) if present.\n"
522
- "3. Draw connections to related verses in the context.\n"
523
- "4. Answer the user's specific question directly."
524
  ),
525
  "hadith": (
526
  "The user asks about a Hadith. Steps:\n"
527
- "1. Locate the relevant Hadith(s) ONLY from the context block below.\n"
528
- "2. Quote the Arabic text and English translation EXACTLY as they appear "
529
- "in the context do not alter, complete, or paraphrase the wording.\n"
530
- "3. Elaborate on meaning, legal and spiritual implications (Fiqh / Tarbiya).\n"
531
- "4. Note any related Hadiths present in the context.\n"
532
- "CRITICAL: If the specific Hadith the user mentions is NOT present verbatim "
533
- "in the context, say so clearly. Do NOT reconstruct it from memory."
 
 
 
 
 
 
534
  ),
535
  "fatwa": (
536
- "The user seeks a religious ruling or guidance. Steps:\n"
537
- "1. Gather ALL relevant evidence (Quran + Sunnah) from the context.\n"
538
- "2. Reason step-by-step from the evidence to a conclusion.\n"
539
- "3. If the context is insufficient for a clear ruling, state so explicitly. "
540
- "Do NOT speculate."
541
  ),
542
  "count": (
543
- "The user asks for the frequency or count of a word/name. Steps:\n"
544
- "1. State the ANALYSIS RESULT prominently at the top.\n"
545
- "2. List up to 3 example occurrences from the context with their sources.\n"
546
- "3. Briefly comment on the significance of this repetition."
547
  ),
548
  "general": (
549
  "The user has a general Islamic question. Steps:\n"
550
  "1. Give a direct answer first.\n"
551
- "2. Support with evidence from the context.\n"
552
  "3. Conclude with a summary."
553
  ),
554
  }
555
 
556
- # ── FIX: hardened anti-hallucination rules ────────────────────────────────────
557
  _FORMAT_RULES = """\
558
- For EVERY piece of supporting evidence, use this exact format:
559
 
560
  ┌─────────────────────────────────────────────┐
561
  │ ❝ {Arabic text} ❞
@@ -563,50 +720,14 @@ For EVERY piece of supporting evidence, use this exact format:
563
  │ 📖 Source: {exact citation from context}
564
  └─────────────────────────────────────────────┘
565
 
566
- ABSOLUTE RULES — violations are unacceptable:
567
- • Use ONLY content from the Islamic Context block below. Zero outside knowledge.
568
- • Copy Arabic text and translations VERBATIM from the context. Never paraphrase,
569
- complete, or reconstruct a Hadith or verse from memory.
570
- If a specific Hadith or verse the user asks about is NOT present in the context
571
- block respond ONLY with:
572
- "هذا الحديث/الآية غير موجود في قاعدة البيانات المتاحة. يُرجى التحقق من مصادر موثوقة."
573
- (Arabic query) or:
574
- "This Hadith/verse is not in the available dataset. Please verify with a trusted source."
575
- (English query). Do NOT add anything else.
576
- • Never cite a reference that does not appear in the context block.
577
- • Never invent, guess, or infer content that is not explicitly in the context.
578
- • End every response with:
579
- - Arabic → "والله أعلم."
580
- - English → "And Allah knows best."
581
- """
582
-
583
- # ── FIX: few-shot now includes a "not found" refusal example ─────────────────
584
- _FEW_SHOT = """\
585
- === STRUCTURAL EXAMPLE A — evidence found (mimic structure, do not copy content) ===
586
- Question: What does Islam say about the importance of prayer?
587
-
588
- [Step 1 — Direct Answer]
589
- Prayer (Salah) is one of the Five Pillars and is described in the provided texts
590
- as the first act of worship a Muslim will be accountable for.
591
-
592
- [Step 2 — Supporting Evidence]
593
- ┌─────────────────────────────────────────────┐
594
- │ ❝ أَقِيمُوا الصَّلَاةَ ❞
595
- │ 📝 Translation: Establish prayer.
596
- │ 📖 Source: Surah Al-Baqarah 2:43
597
- └─────────────────────────────────────────────┘
598
-
599
- [Step 3 — Conclusion]
600
- The evidence shows prayer is central to the Muslim's covenant with Allah.
601
- And Allah knows best.
602
-
603
- === STRUCTURAL EXAMPLE B — evidence NOT found (mandatory refusal path) ===
604
- Question: ما أحاديث الصبر الواردة في السنة؟
605
- (No matching Hadith appears in the Islamic Context block)
606
-
607
- هذا الحديث/الآية غير موجود في قاعدة البيانات المتاحة. يُرجى التحقق من مصادر موثوقة.
608
- والله أعلم.
609
- === END EXAMPLES ===\
610
  """
611
 
612
  _SYSTEM_TEMPLATE = """\
@@ -620,9 +741,7 @@ _SYSTEM_TEMPLATE = """\
620
  === OUTPUT FORMAT ===
621
  {fmt}
622
 
623
- {few_shot}
624
-
625
- === ISLAMIC CONTEXT (your ONLY source of truth) ===
626
  {context}
627
  === END CONTEXT ===
628
  """
@@ -635,12 +754,16 @@ def build_messages(
635
  intent: str,
636
  analysis: Optional[dict] = None,
637
  ) -> List[dict]:
 
638
  if analysis:
 
 
 
 
639
  analysis_block = (
640
  f"\n[ANALYSIS RESULT]\n"
641
- f"The keyword «{analysis['keyword']}» "
642
- f"(root form: «{analysis.get('kw_stemmed', '')}») "
643
- f"appears {analysis['total_count']} times in the dataset.\n"
644
  )
645
  context = analysis_block + context
646
 
@@ -649,7 +772,6 @@ def build_messages(
649
  lang_instruction=language_instruction(lang),
650
  task=_TASK_INSTRUCTIONS.get(intent, _TASK_INSTRUCTIONS["general"]),
651
  fmt=_FORMAT_RULES,
652
- few_shot=_FEW_SHOT,
653
  context=context,
654
  )
655
 
@@ -664,28 +786,59 @@ def build_messages(
664
  ]
665
 
666
 
667
- # ═══════════════════════════════════════════════════════════════════════
668
- # SAFE "NOT FOUND" FALLBACK ANSWER
669
- # ═══════════════════════════════════════════════════════════════════════
670
  def _not_found_answer(lang: str) -> str:
671
- """
672
- Returned instead of calling the LLM when retrieval confidence is too low.
673
- Prevents hallucination on queries where the dataset has no relevant content.
674
- """
675
  if lang == "arabic":
676
  return (
677
- "لم أجد في قاعدة البيانات المتاحة ما يكفي للإجابة على هذا السؤال بدقة.\n"
678
- "يُرجى الرجوع إلى ��صادر إسلامية موثوقة للتحقق من المعلومات.\n"
679
  "والله أعلم."
680
  )
681
  return (
682
  "The available dataset does not contain sufficient information to answer "
683
- "this question accurately.\n"
684
- "Please refer to trusted Islamic sources to verify.\n"
685
  "And Allah knows best."
686
  )
687
 
688
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
689
  # ═══════════════════════════════════════════════════════════════════════
690
  # APP STATE
691
  # ═══════════════════════════════════════════════════════════════════════
@@ -693,6 +846,7 @@ class AppState:
693
  embed_model: Optional[SentenceTransformer] = None
694
  faiss_index: Optional[faiss.Index] = None
695
  dataset: Optional[list] = None
 
696
  ready: bool = False
697
 
698
 
@@ -701,6 +855,7 @@ state = AppState()
701
 
702
  @asynccontextmanager
703
  async def lifespan(app: FastAPI):
 
704
  logger.info("⏳ Loading embed model: %s", cfg.EMBED_MODEL)
705
  state.embed_model = SentenceTransformer(cfg.EMBED_MODEL)
706
 
@@ -711,20 +866,19 @@ async def lifespan(app: FastAPI):
711
  with open(cfg.METADATA_FILE, "r", encoding="utf-8") as f:
712
  state.dataset = json.load(f)
713
 
714
- primary = cfg.LLM_MODEL
715
- try:
716
- client = ollama.Client(host=cfg.OLLAMA_HOST)
717
- client.chat(model=primary, messages=[{"role": "user", "content": "ping"}])
718
- logger.info("✅ Primary Ollama model reachable: %s", primary)
719
- except Exception as exc:
720
- logger.warning(
721
- "Primary model %s not reachable (%s). Will use fallback chain.", primary, exc
722
- )
723
 
724
  state.ready = True
725
  logger.info(
726
- "✅ QModel v3.1 ready | dataset=%d | faiss=%d | confidence_threshold=%.2f",
727
- len(state.dataset), state.faiss_index.ntotal, cfg.CONFIDENCE_THRESHOLD,
 
 
 
728
  )
729
  yield
730
  state.ready = False
@@ -735,9 +889,9 @@ async def lifespan(app: FastAPI):
735
  # FASTAPI APP
736
  # ═══════════════════════════════════════════════════════════════════════
737
  app = FastAPI(
738
- title="QModel v3.1 — Islamic RAG API",
739
- description="High-fidelity Retrieval-Augmented Generation over Qur'an & Sunnah",
740
- version="3.1.0",
741
  lifespan=lifespan,
742
  )
743
 
@@ -758,47 +912,108 @@ class ChatMessage(BaseModel):
758
  content: str = Field(..., min_length=1, max_length=4000)
759
 
760
 
761
- class ChatCompletionRequest(BaseModel):
762
- model: str = "QModel"
763
- messages: List[ChatMessage]
764
- temperature: Optional[float] = Field(cfg.TEMPERATURE, ge=0.0, le=2.0)
765
- max_tokens: Optional[int] = Field(cfg.MAX_TOKENS, ge=1, le=8192)
766
- stream: Optional[bool] = False
767
- top_k: Optional[int] = Field(cfg.TOP_K_RETURN, ge=1, le=20)
768
-
769
- @validator("messages")
770
- def has_user_message(cls, v):
771
- if not any(m.role == "user" for m in v):
772
- raise ValueError("At least one user message is required")
773
- return v
774
-
775
-
776
  class AnalysisResult(BaseModel):
777
  keyword: str
778
  kw_stemmed: str
779
  total_count: int
 
780
  examples: List[dict]
781
 
782
 
 
 
 
 
 
 
 
 
 
783
  class AskResponse(BaseModel):
784
- question: str
785
- answer: str
786
- language: str
787
- intent: str
788
- analysis: Optional[AnalysisResult] = None
789
- sources: List[dict]
790
- top_score: float # FIX: expose top retrieval score for caller transparency
791
- latency_ms: int
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
792
 
793
 
794
  # ═══════════════════════════════════════════════════════════════════════
795
- # CORE ASYNC RAG PIPELINE
796
  # ═══════════════════════════════════════════════════════════════════════
797
- async def run_rag_pipeline(question: str, top_k: int = cfg.TOP_K_RETURN) -> dict:
 
 
 
 
 
 
798
  t0 = time.perf_counter()
799
 
800
  # 1. Query rewriting
801
- rewrite = await rewrite_query(question)
802
  intent = rewrite.get("intent", "general")
803
 
804
  # 2. Intent detection + hybrid search — concurrently
@@ -807,7 +1022,7 @@ async def run_rag_pipeline(question: str, top_k: int = cfg.TOP_K_RETURN) -> dict
807
  hybrid_search(
808
  question, rewrite,
809
  state.embed_model, state.faiss_index, state.dataset,
810
- top_k,
811
  ),
812
  )
813
  analysis_kw, results = await asyncio.gather(kw_task, search_task)
@@ -827,14 +1042,10 @@ async def run_rag_pipeline(question: str, top_k: int = cfg.TOP_K_RETURN) -> dict
827
  intent, top_score, cfg.CONFIDENCE_THRESHOLD,
828
  )
829
 
830
- # ── FIX: confidence gate ───────────────────────────────────────────
831
- # If the best retrieved result is below the threshold, skip the LLM
832
- # entirely and return a safe "not in dataset" answer.
833
- # This is the primary defence against hallucination on Hadith queries
834
- # where the dataset has no matching content.
835
  if top_score < cfg.CONFIDENCE_THRESHOLD:
836
  logger.warning(
837
- "Low confidence (%.3f < %.2f) — returning safe fallback, skipping LLM",
838
  top_score, cfg.CONFIDENCE_THRESHOLD,
839
  )
840
  return {
@@ -847,24 +1058,19 @@ async def run_rag_pipeline(question: str, top_k: int = cfg.TOP_K_RETURN) -> dict
847
  "latency_ms": int((time.perf_counter() - t0) * 1000),
848
  }
849
 
850
- # 5. Build context + prompt
851
- context = build_context(results, intent)
852
  messages = build_messages(context, question, lang, intent, analysis)
853
 
854
- # 6. LLM call (sync client → threadpool)
855
- loop = asyncio.get_event_loop()
856
  try:
857
- answer = await loop.run_in_executor(
858
- None,
859
- lambda: chat_with_fallback(
860
- messages,
861
- max_tokens=cfg.MAX_TOKENS,
862
- temperature=cfg.TEMPERATURE,
863
- ),
864
  )
865
- except RuntimeError as exc:
866
- logger.error("All LLM models failed: %s", exc)
867
- raise HTTPException(status_code=502, detail=str(exc))
868
 
869
  latency = int((time.perf_counter() - t0) * 1000)
870
  logger.info(
@@ -896,38 +1102,219 @@ def _check_ready():
896
  # ═══════════════════════════════════════════════════════════════════════
897
  @app.get("/health", tags=["ops"])
898
  def health():
 
899
  return {
900
  "status": "ok" if state.ready else "initialising",
901
- "version": "3.1.0",
 
902
  "dataset_size": len(state.dataset) if state.dataset else 0,
903
  "faiss_total": state.faiss_index.ntotal if state.faiss_index else 0,
904
  "confidence_threshold": cfg.CONFIDENCE_THRESHOLD,
905
- "hadith_boost": cfg.HADITH_BOOST,
906
  }
907
 
908
 
909
- @app.get("/v1/models", tags=["models"])
910
  def list_models():
911
- return {
912
- "object": "list",
913
- "data": [{
914
- "id": "QModel",
915
- "object": "model",
916
- "created": int(time.time()),
917
- "owned_by": "elgendy",
918
- "description": "Islamic RAG over Qur'an & Sunnah (v3.1)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
919
  }],
920
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
921
 
922
 
923
  @app.get("/debug/scores", tags=["ops"])
924
  async def debug_scores(
925
- q: str = Query(..., min_length=1, max_length=1000),
926
  top_k: int = Query(10, ge=1, le=20),
927
  ):
928
- """Returns raw retrieval scores without calling the LLM. Use to calibrate CONFIDENCE_THRESHOLD."""
929
  _check_ready()
930
- rewrite = await rewrite_query(q)
931
  results = await hybrid_search(q, rewrite, state.embed_model, state.faiss_index, state.dataset, top_k)
932
  return {
933
  "intent": rewrite.get("intent"),
@@ -937,54 +1324,16 @@ async def debug_scores(
937
  "rank": i + 1,
938
  "source": r.get("source") or r.get("reference"),
939
  "type": r.get("type"),
 
940
  "_dense": round(r.get("_dense", 0), 4),
941
  "_sparse": round(r.get("_sparse", 0), 4),
942
  "_score": round(r.get("_score", 0), 4),
943
- "snippet": r.get("english", "")[:80],
944
  }
945
  for i, r in enumerate(results)
946
  ],
947
  }
948
 
949
 
950
- @app.get("/ask", response_model=AskResponse, tags=["inference"])
951
- async def ask(
952
- q: str = Query(..., min_length=1, max_length=1000, description="Your Islamic question"),
953
- top_k: int = Query(cfg.TOP_K_RETURN, ge=1, le=20, description="Sources to retrieve"),
954
- ):
955
- _check_ready()
956
- result = await run_rag_pipeline(q, top_k=top_k)
957
- return AskResponse(question=q, **result)
958
-
959
-
960
- @app.post("/v1/chat/completions", tags=["inference"])
961
- async def chat_completions(req: ChatCompletionRequest):
962
- _check_ready()
963
- user_msgs = [m.content for m in req.messages if m.role == "user"]
964
- question = user_msgs[-1]
965
- result = await run_rag_pipeline(question, top_k=req.top_k or cfg.TOP_K_RETURN)
966
-
967
- return {
968
- "id": f"chatcmpl-{int(time.time())}",
969
- "object": "chat.completion",
970
- "created": int(time.time()),
971
- "model": req.model,
972
- "choices": [{
973
- "index": 0,
974
- "message": {"role": "assistant", "content": result["answer"]},
975
- "finish_reason": "stop",
976
- }],
977
- "usage": {
978
- "prompt_tokens": -1,
979
- "completion_tokens": -1,
980
- "total_tokens": -1,
981
- },
982
- "x_metadata": {
983
- "language": result["language"],
984
- "intent": result["intent"],
985
- "top_score": result["top_score"],
986
- "latency_ms": result["latency_ms"],
987
- "sources_count": len(result["sources"]),
988
- "analysis": result["analysis"],
989
- },
990
- }
 
1
  """
2
+ QModel v4 — Islamic RAG API
3
+ ===========================
4
+ Specialized Quran & Hadith system with dual LLM backend support.
5
+
6
+ Features:
7
+ Dual backend: Hugging Face (transformers) + Ollama
8
+ Grade filtering: Return only Sahih/Hasan Hadiths
9
+ Source filtering: Quran-only or Hadith-only queries
10
+ Hadith verification: Quick auth check endpoint
11
+ Word frequency: Enhanced with Surah grouping
12
+ No hallucinations: Confidence gating + few-shot anti-hallucination
13
+ • Arabic & English: Full bilingual support with proper normalization
14
+
15
+ Configuration via .env:
16
+ LLM_BACKEND=hf|ollama (default: ollama)
17
+ HF_MODEL_NAME=<hf-model-id> (e.g. gpt2, default: Qwen/Qwen2-7B-Instruct)
18
+ OLLAMA_HOST=<url> (e.g. http://localhost:11434, default: http://localhost:11434)
19
+ OLLAMA_MODEL=<model> (e.g. llama2, default: llama2)
20
+ EMBED_MODEL=intfloat/multilingual-e5-large (embedding model)
21
  """
22
 
23
  from __future__ import annotations
 
31
  import time
32
  from collections import Counter, OrderedDict
33
  from contextlib import asynccontextmanager
34
+ from typing import Dict, List, Literal, Optional
35
 
36
  import faiss
37
  import numpy as np
38
  from dotenv import load_dotenv
39
  from fastapi import FastAPI, HTTPException, Query
40
  from fastapi.middleware.cors import CORSMiddleware
41
+ from fastapi.responses import StreamingResponse
42
  from pydantic import BaseModel, Field, validator
43
  from sentence_transformers import SentenceTransformer
44
 
 
55
 
56
 
57
  # ═══════════════════════════════════════════════════════════════════════
58
+ # CONFIG & LLM FACTORY
59
  # ═══════════════════════════════════════════════════════════════════════
60
class Config:
    """Centralized configuration with dual LLM backend support.

    Every value is read from the environment exactly once, when the class
    body executes at import time; the second argument of each ``os.getenv``
    call is the fallback used when the variable is absent.
    """

    # Which LLM provider to use: "hf" (transformers) or "ollama".
    LLM_BACKEND: str = os.getenv("LLM_BACKEND", "ollama")

    # Hugging Face backend settings.
    HF_MODEL_NAME: str = os.getenv("HF_MODEL_NAME", "Qwen/Qwen2-7B-Instruct")
    HF_DEVICE: str = os.getenv("HF_DEVICE", "auto")
    HF_MAX_NEW_TOKENS: int = int(os.getenv("HF_MAX_NEW_TOKENS", "2048"))

    # Ollama backend settings.
    OLLAMA_HOST: str = os.getenv("OLLAMA_HOST", "http://localhost:11434")
    OLLAMA_MODEL: str = os.getenv("OLLAMA_MODEL", "llama2")

    # Sentence-embedding model shared by both backends.
    EMBED_MODEL: str = os.getenv("EMBED_MODEL", "intfloat/multilingual-e5-large")

    # On-disk FAISS index and its accompanying metadata file.
    FAISS_INDEX: str = os.getenv("FAISS_INDEX", "QModel.index")
    METADATA_FILE: str = os.getenv("METADATA_FILE", "metadata.json")

    # Retrieval: candidate-pool size vs. final result count.
    TOP_K_SEARCH: int = int(os.getenv("TOP_K_SEARCH", "20"))
    TOP_K_RETURN: int = int(os.getenv("TOP_K_RETURN", "5"))

    # Generation parameters forwarded to the LLM.
    TEMPERATURE: float = float(os.getenv("TEMPERATURE", "0.2"))
    MAX_TOKENS: int = int(os.getenv("MAX_TOKENS", "2048"))

    # TTL-cache sizing (max entries / seconds-to-live).
    CACHE_SIZE: int = int(os.getenv("CACHE_SIZE", "512"))
    CACHE_TTL: int = int(os.getenv("CACHE_TTL", "3600"))

    # Score fusion: weight of the dense (FAISS) score; the remainder goes
    # to the sparse (BM25) score. HADITH_BOOST is an additive bonus.
    RERANK_ALPHA: float = float(os.getenv("RERANK_ALPHA", "0.6"))
    HADITH_BOOST: float = float(os.getenv("HADITH_BOOST", "0.08"))

    # Retrieval confidence below this threshold skips the LLM entirely.
    CONFIDENCE_THRESHOLD: float = float(os.getenv("CONFIDENCE_THRESHOLD", "0.30"))

    # CORS allowed origins.
    ALLOWED_ORIGINS: str = os.getenv("ALLOWED_ORIGINS", "*")

    # Maximum example occurrences returned by the frequency analysis.
    MAX_EXAMPLES: int = int(os.getenv("MAX_EXAMPLES", "3"))


cfg = Config()
108
 
109
+
110
+ # ═══════════════════════════════════════════════════════════════════════
111
+ # LLM ABSTRACTION LAYER
112
+ # ═══════════════════════════════════════════════════════════════════════
113
class LLMProvider:
    """Minimal interface that every LLM backend must implement.

    Concrete providers (Ollama, Hugging Face) override :meth:`chat`; the
    base implementation only documents the contract and always raises.
    """

    async def chat(
        self,
        messages: List[dict],
        temperature: float,
        max_tokens: int,
    ) -> str:
        """Send a chat transcript to the model and return its reply text."""
        raise NotImplementedError
120
+
121
+
122
class OllamaProvider(LLMProvider):
    """Ollama-based LLM provider.

    Wraps the synchronous ``ollama`` client and runs each request in the
    default executor so the event loop is never blocked.
    """

    def __init__(self, host: str, model: str):
        """Create a client bound to *host* for the given *model* name.

        Raises:
            ImportError: if the ``ollama`` package is not installed.
        """
        self.host = host
        self.model = model
        try:
            import ollama
            self.client = ollama.Client(host=host)
        except ImportError as exc:
            # FIX: chain the original error so the install hint does not
            # hide the real import failure.
            raise ImportError("Install ollama: pip install ollama") from exc

    async def chat(
        self, messages: List[dict], temperature: float, max_tokens: int
    ) -> str:
        """Run one chat completion and return the stripped reply text.

        Raises:
            Exception: whatever the underlying client raises (logged first).
        """
        # FIX: asyncio.get_event_loop() is deprecated inside coroutines
        # (3.10+); get_running_loop() is the correct call here since chat()
        # is always awaited from a running loop.
        loop = asyncio.get_running_loop()
        try:
            result = await loop.run_in_executor(
                None,
                lambda: self.client.chat(
                    model=self.model,
                    messages=messages,
                    options={"temperature": temperature, "num_predict": max_tokens},
                ),
            )
            return result["message"]["content"].strip()
        except Exception as exc:
            logger.error("Ollama chat failed: %s", exc)
            raise
151
+
152
+
153
class HuggingFaceProvider(LLMProvider):
    """Hugging Face transformers-based LLM provider.

    Loads the model once at construction time and serves generations
    through a text-generation pipeline run in the default executor.
    """

    def __init__(self, model_name: str, device: str):
        """Load *model_name*, placing weights according to *device*.

        Args:
            model_name: HF hub model id (e.g. "Qwen/Qwen2-7B-Instruct").
            device: value passed to ``device_map`` ("auto", "cuda", "cpu").

        Raises:
            ImportError: if transformers/torch are not installed.
        """
        self.model_name = model_name
        self.device = device
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map=device,
                torch_dtype="auto",
            )
            # FIX: the model is already placed by `device_map` above.
            # Passing an explicit `device` to the pipeline as well makes
            # transformers try to move an accelerate-dispatched model,
            # which fails; let the pipeline use the model as loaded.
            self.pipeline = TextGenerationPipeline(
                model=self.model,
                tokenizer=self.tokenizer,
            )
        except ImportError as exc:
            # Chain the original error so the install hint does not hide it.
            raise ImportError("Install transformers: pip install transformers torch") from exc

    async def chat(
        self, messages: List[dict], temperature: float, max_tokens: int
    ) -> str:
        """Generate a completion for the chat transcript.

        Returns the generated text with the prompt prefix removed.
        """
        # Flatten the chat transcript into a single prompt string.
        prompt = self._format_messages(messages)

        # FIX: get_event_loop() is deprecated inside coroutines (3.10+).
        loop = asyncio.get_running_loop()
        do_sample = temperature > 0
        gen_kwargs = {"max_new_tokens": max_tokens, "do_sample": do_sample}
        if do_sample:
            # FIX: only forward temperature when sampling — transformers
            # warns about (and may reject) temperature with do_sample=False.
            gen_kwargs["temperature"] = temperature
        try:
            result = await loop.run_in_executor(
                None,
                lambda: self.pipeline(prompt, **gen_kwargs),
            )
            # The pipeline echoes the prompt; strip it from the output.
            generated = result[0]["generated_text"]
            return generated[len(prompt):].strip()
        except Exception as exc:
            logger.error("HF chat failed: %s", exc)
            raise

    def _format_messages(self, messages: List[dict]) -> str:
        """Render OpenAI-style messages as a plain chat-transcript prompt."""
        prompt = ""
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if role == "system":
                prompt += f"{content}\n\n"
            elif role == "user":
                prompt += f"User: {content}\n"
            elif role == "assistant":
                prompt += f"Assistant: {content}\n"
        # Leave a trailing cue so the model continues as the assistant.
        prompt += "Assistant: "
        return prompt
215
+
216
+
217
def get_llm_provider() -> LLMProvider:
    """Factory: build the LLM provider selected by ``cfg.LLM_BACKEND``.

    Returns:
        A ready-to-use :class:`LLMProvider` instance.

    Raises:
        ValueError: if ``LLM_BACKEND`` names an unknown backend.
    """
    # FIX: tolerate case/whitespace variants ("Ollama", " hf ") that come
    # from hand-edited .env files instead of rejecting them outright.
    backend = cfg.LLM_BACKEND.strip().lower()
    if backend == "ollama":
        logger.info("Using Ollama backend: %s @ %s", cfg.OLLAMA_MODEL, cfg.OLLAMA_HOST)
        return OllamaProvider(cfg.OLLAMA_HOST, cfg.OLLAMA_MODEL)
    if backend == "hf":
        logger.info("Using HuggingFace backend: %s on %s", cfg.HF_MODEL_NAME, cfg.HF_DEVICE)
        return HuggingFaceProvider(cfg.HF_MODEL_NAME, cfg.HF_DEVICE)
    raise ValueError(f"Unknown LLM_BACKEND: {cfg.LLM_BACKEND}")
227
 
228
 
229
  # ═══════════════════════════════════════════════════════════════════════
 
267
  rewrite_cache = TTLCache(maxsize=cfg.CACHE_SIZE, ttl=cfg.CACHE_TTL * 6)
268
 
269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  # ═══════════════════════════════════════════════════════════════════════
271
  # ARABIC NLP — normalisation + light stemming
272
  # ═══════════════════════════════════════════════════════════════════════
273
+ _DIACRITICS = re.compile(r"[\u064B-\u0655\u0656-\u0658\u0670\u0671\u06D6-\u06ED]")
274
  _ALEF_VARS = re.compile(r"[أإآٱ]")
275
  _WAW_HAMZA = re.compile(r"ؤ")
276
  _YA_HAMZA = re.compile(r"ئ")
 
285
  "قران": "قرآن",
286
  "القران": "القرآن",
287
  "اللہ": "الله",
 
 
288
  }
289
 
290
 
291
  def normalize_arabic(text: str, *, aggressive: bool = False) -> str:
292
+ """Normalize Arabic text: diacritics, hamza, ta marbuta, etc."""
293
  text = _DIACRITICS.sub("", text)
294
  text = _TATWEEL.sub("", text)
295
  text = _ALEF_VARS.sub("ا", text)
 
314
 
315
 
316
def light_stem(word: str) -> str:
    """Strip common Arabic prefixes then suffixes from *word*.

    If stripping leaves fewer than two characters the result is considered
    over-stemmed and the original word is returned unchanged.
    """
    stripped = _AR_SUFFIXES.sub("", _AR_PREFIXES.sub("", word))
    if len(stripped) < 2:
        # Over-stemmed to (almost) nothing — keep the input as-is.
        return word
    return stripped
321
 
322
 
323
def tokenize_ar(text: str) -> List[str]:
    """Normalize, lowercase, whitespace-split, and light-stem *text*."""
    normalized = normalize_arabic(text, aggressive=True).lower()
    return [light_stem(token) for token in filter(None, normalized.split())]
327
 
 
334
  )
335
 
336
 
337
+ def detect_language(text: str) -> Literal["arabic", "english", "mixed"]:
338
+ """Detect if text is Arabic, English, or mixed."""
339
  ar = len(_ARABIC_SCRIPT.findall(text))
340
  en = len(re.findall(r"[a-zA-Z]", text))
341
  tot = ar + en or 1
 
348
 
349
 
350
  def language_instruction(lang: str) -> str:
351
+ """Generate language-specific instruction for LLM."""
352
  return {
353
  "arabic": (
354
  "يجب أن تكون الإجابة كاملةً باللغة العربية الفصحى تماماً. "
 
374
  "ar_query": "<query in clear Arabic فصحى, ≤25 words>",
375
  "en_query": "<query in clear English, ≤25 words>",
376
  "keywords": ["<3-7 key Arabic or English terms from the question>"],
377
+ "intent": "<one of: fatwa | tafsir | hadith | count | auth | general>"
378
  }
379
 
380
+ Intent Detection Rules (CRITICAL):
381
+ - 'count' intent = asking for number/frequency (كم مرة, how many times, count occurrences)
382
+ - 'auth' intent = asking about authenticity (صحيح؟, هل صحيح, is it authentic, verify hadith grade)
383
+ - 'hadith' intent = asking about specific hadith meaning/text (not authenticity)
384
+ - 'tafsir' intent = asking about Quranic verses or Islamic ruling (fatwa)
385
+ - 'general' intent = other questions
386
+
387
+ Examples:
388
+ - "كم مرة ذُكرت كلمة مريم" → intent: count
389
+ - "هل حديث إنما الأعمال بالنيات صحيح" → intent: auth (asking if authentic!)
390
+ - "ما معنى حديث إنما الأعمال" → intent: hadith
391
+ - "ما حكم الربا في الإسلام" → intent: fatwa
392
  """
393
 
394
 
395
+ async def rewrite_query(raw: str, llm: LLMProvider) -> Dict:
396
+ """Rewrite query for better retrieval."""
397
  cached = await rewrite_cache.get(raw)
398
  if cached:
399
  return cached
 
405
  "intent": "general",
406
  }
407
  try:
408
+ text = await llm.chat(
409
  messages=[
410
  {"role": "system", "content": _REWRITE_SYSTEM},
411
  {"role": "user", "content": raw},
 
418
  for k in ("ar_query", "en_query", "keywords", "intent"):
419
  result.setdefault(k, fallback[k])
420
  await rewrite_cache.set(result, raw)
421
+ logger.info("Rewrite: intent=%s ar=%s", result["intent"], result["ar_query"][:60])
 
 
422
  return result
423
  except Exception as exc:
424
  logger.warning("Query rewrite failed (%s) — using fallback", exc)
 
426
 
427
 
428
  # ═══════════════════════════════════════════════════════════════════════
429
+ # INTENT DETECTION (frequency / count queries / hadith auth)
430
  # ═══════════════════════════════════════════════════════════════════════
431
  _COUNT_EN = re.compile(
432
  r"\b(how many|count|number of|frequency|occurrences? of|how often|"
 
437
  r"(كم مرة|كم عدد|كم تكرر|عدد مرات|تكرار|كم ذُكر|كم وردت?)"
438
  )
439
 
440
# Authenticity-question detectors (English / Arabic): match wording that asks
# whether a hadith is sahih/hasan/weak or requests its grade.
# NOTE(review): not referenced in this chunk — presumably consumed by intent
# detection elsewhere in the file; confirm.
_AUTH_EN = re.compile(
    r"\b(authentic|is.*authentic|authenticity|sahih|hasan|weak|daif|verify)\b",
    re.I,
)
_AUTH_AR = re.compile(
    r"(صحيح|حسن|ضعيف|درجة|صحة|تصحيح|هل.*صحيح|هل.*ضعيف)"
)
447
 
448
 
449
async def detect_analysis_intent(query: str, rewrite: Dict) -> Optional[str]:
    """Return the keyword to count when the query asks for word frequency.

    The LLM rewrite's verdict (intent == "count") is trusted first; otherwise
    the count-phrase regexes are used as a heuristic, taking the first token
    after the matched phrase. Returns None for non-frequency questions.
    """
    # Rewriter verdict takes priority over the regex fallback.
    if rewrite.get("intent") == "count":
        keywords = rewrite.get("keywords", [])
        if keywords:
            return keywords[0]
        return None

    matched = _COUNT_EN.search(query) or _COUNT_AR.search(query)
    if matched is None:
        return None

    # Heuristic: first token following the count phrase ("how many ...").
    for pattern in (_COUNT_EN, _COUNT_AR):
        hit = pattern.search(query)
        if hit is None:
            continue
        remainder = query[hit.end():].strip().split()
        if remainder:
            return remainder[0]
    return None
466
 
467
 
 
469
  # OCCURRENCE ANALYSIS (exact + stemmed matching)
470
  # ═══════════════════════════════════════════════════════════════════════
471
  async def count_occurrences(keyword: str, dataset: list) -> dict:
472
+ """Count keyword occurrences with Surah grouping."""
473
  cached = await analysis_cache.get(keyword)
474
  if cached:
475
  return cached
 
477
  kw_norm = normalize_arabic(keyword, aggressive=True).lower()
478
  kw_stem = light_stem(kw_norm)
479
  count = 0
480
+ by_surah: Dict[int, Dict] = {}
481
  examples: list = []
482
 
483
  for item in dataset:
484
+ if item.get("type") != "quran":
485
+ continue
486
+
487
+ ar_norm = normalize_arabic(item.get("arabic", ""), aggressive=True).lower()
488
  combined = f"{ar_norm} {item.get('english', '')}".lower()
489
  exact = combined.count(kw_norm)
490
  stemmed = combined.count(kw_stem) - exact if kw_stem != kw_norm else 0
491
  occ = exact + stemmed
492
+
493
  if occ > 0:
494
  count += occ
495
+ surah_num = item.get("surah_number", 0)
496
+ if surah_num not in by_surah:
497
+ by_surah[surah_num] = {
498
+ "name": item.get("surah_name_en", f"Surah {surah_num}"),
499
+ "count": 0,
500
+ }
501
+ by_surah[surah_num]["count"] += occ
502
+
503
  if len(examples) < cfg.MAX_EXAMPLES:
504
  examples.append({
505
+ "reference": item.get("source", ""),
506
+ "arabic": item.get("arabic", ""),
507
  "english": item.get("english", ""),
 
508
  })
509
 
510
  result = {
511
  "keyword": keyword,
512
  "kw_stemmed": kw_stem,
513
  "total_count": count,
514
+ "by_surah": dict(sorted(by_surah.items())),
515
  "examples": examples,
516
  }
517
  await analysis_cache.set(result, keyword)
 
519
 
520
 
521
  # ═══════════════════════════════════════════════════════════════════════
522
+ # HYBRID SEARCH — dense FAISS + BM25 re-ranking + filtering
523
  # ═══════════════════════════════════════════════════════════════════════
524
  def _bm25_score(
525
  query_terms: List[str],
 
528
  k1: float = 1.5,
529
  b: float = 0.75,
530
  ) -> float:
531
+ """BM25 term-frequency scoring."""
532
  doc_tokens = tokenize_ar(doc_text)
533
  dl = len(doc_tokens)
534
  tf = Counter(doc_tokens)
 
546
  index: faiss.Index,
547
  dataset: list,
548
  top_n: int = cfg.TOP_K_RETURN,
549
+ source_type: Optional[Literal["quran", "hadith"]] = None,
550
+ grade_filter: Optional[str] = None,
551
  ) -> list:
552
+ """Hybrid search: dense + sparse with optional filtering."""
553
+ cache_key = (raw_query, top_n, source_type, grade_filter)
554
+ cached = await search_cache.get(*cache_key)
555
  if cached:
556
  return cached
557
 
 
568
 
569
  distances, indices = index.search(fused.reshape(1, -1), cfg.TOP_K_SEARCH)
570
 
571
+ # ─ 2. De-duplicate candidates & apply filters ─────────────────────
572
  seen: set = set()
573
  candidates = []
574
  for dist, idx in zip(distances[0], indices[0]):
575
  item_idx = int(idx) // 2
576
  if item_idx not in seen and 0 <= item_idx < len(dataset):
577
  seen.add(item_idx)
578
+ item = dataset[item_idx]
579
+
580
+ # Source type filter
581
+ if source_type and item.get("type") != source_type:
582
+ continue
583
+
584
+ # Grade filter (Hadith only)
585
+ if grade_filter and item.get("type") == "hadith":
586
+ item_grade = item.get("grade", "").lower()
587
+ if grade_filter.lower() not in item_grade:
588
+ continue
589
+
590
+ candidates.append({**item, "_dense": float(dist)})
591
+
592
+ if not candidates:
593
+ return []
594
 
595
  # ── 3. BM25 sparse scoring ─────────────────────────────────────────
596
  query_terms = [
 
605
  doc = c.get("arabic", "") + " " + c.get("english", "")
606
  c["_sparse"] = _bm25_score(query_terms, doc, avg_dl)
607
 
608
+ # ── 3.5. Phrase matching boost for exact snippets ───────────────────
609
+ query_norm = normalize_arabic(raw_query, aggressive=False).lower()
610
+ for c in candidates:
611
+ # For hadiths: if query contains specific text, boost exact match
612
+ if c.get("type") == "hadith":
613
+ ar_norm = normalize_arabic(c.get("arabic", ""), aggressive=False).lower()
614
+ # Check if any significant phrase (3+ words) from query appears in hadith
615
+ query_fragments = query_norm.split()
616
+ for i in range(len(query_fragments) - 2):
617
+ phrase = " ".join(query_fragments[i:i+3])
618
+ if len(phrase) > 5 and phrase in ar_norm: # phrase is 5+ chars
619
+ c["_sparse"] += 2.0 # boost exact phrase match
620
+ break
621
+
622
  # ── 4. Score fusion ────────────────────────────────────────────────
623
  α = cfg.RERANK_ALPHA
 
624
  intent = rewrite.get("intent", "general")
625
 
626
+ # For hadith authenticity queries, rely more on semantic search
627
+ if intent == "auth":
628
+ α = 0.75 # 75% dense, 25% sparse (vs default 60/40)
629
+
630
+ max_sparse = max((c["_sparse"] for c in candidates), default=1.0) or 1.0
631
+
632
  for c in candidates:
633
  base_score = α * c["_dense"] + (1 - α) * c["_sparse"] / max_sparse
 
 
 
634
  if intent == "hadith" and c.get("type") == "hadith":
635
  base_score += cfg.HADITH_BOOST
636
  c["_score"] = base_score
 
638
  candidates.sort(key=lambda x: x["_score"], reverse=True)
639
  results = candidates[:top_n]
640
 
641
+ await search_cache.set(results, *cache_key)
642
  return results
643
 
644
 
645
+ def build_context(results: list) -> str:
646
+ """Format search results into context block for LLM."""
647
  lines = []
648
  for i, r in enumerate(results, 1):
649
  source = r.get("source") or r.get("reference") or "Unknown Source"
650
+ item_type = "Quranic Verse" if r.get("type") == "quran" else "Hadith"
651
+ grade_str = f" [Grade: {r.get('grade')}]" if r.get("grade") else ""
652
+
 
 
653
  lines.append(
654
+ f"[{i}] 📌 {item_type}{grade_str} | {source} | score: {r.get('_score', 0):.3f}\n"
655
  f" Arabic : {r.get('arabic', '')}\n"
656
  f" English: {r.get('english', '')}"
657
  )
 
659
 
660
 
661
  # ═══════════════════════════════════════════════════════════════════════
662
+ # PROMPT ENGINEERING
663
  # ═════════════════════════════════════════════��═════════════════════════
664
# Scholar persona injected at the top of every system prompt.
_PERSONA = (
    "You are Sheikh QModel, a meticulous Islamic scholar with expertise "
    "in Tafsir (Quranic exegesis), Hadith sciences, Fiqh, and Arabic. "
    "You respond with scholarly rigor and modern clarity."
)
669
 
670
# Per-intent task instructions appended to the system prompt; keys mirror the
# intents produced by the query rewriter (tafsir/hadith/auth/fatwa/count/general).
_TASK_INSTRUCTIONS: Dict[str, str] = {
    "tafsir": (
        "The user asks about a Quranic verse. Steps:\n"
        "1. Identify the verse(s) from context.\n"
        "2. Provide Tafsir: linguistic analysis and deeper meaning.\n"
        "3. Draw connections to related verses.\n"
        "4. Answer the user's question directly."
    ),
    "hadith": (
        "The user asks about a Hadith. Steps:\n"
        "1. Quote the text EXACTLY from the context below.\n"
        "2. Explain the meaning and implications.\n"
        "3. Note any related Hadiths.\n"
        "CRITICAL: If the Hadith is NOT in context, say so clearly."
    ),
    "auth": (
        "The user asks about Hadith authenticity. YOU MUST:\n"
        "1. Check if the Hadith is in the context below.\n"
        "2. If FOUND, state the grade (Sahih, Hasan, Da'if, etc.) confidently.\n"
        "3. If found in Sahih Bukhari or Sahih Muslim, assert it is AUTHENTIC (Sahih).\n"
        "4. Provide the Hadith text from context and explain its authenticity basis.\n"
        "5. If NOT found after careful search, clearly state it's absent from the dataset.\n"
        "CRITICAL: Use the context provided. Do not rely on your training data."
    ),
    "fatwa": (
        "The user seeks a religious ruling. Steps:\n"
        "1. Gather evidence from Quran + Sunnah in context.\n"
        "2. Reason step-by-step to a conclusion.\n"
        "3. If insufficient, state so explicitly."
    ),
    "count": (
        "The user asks for word frequency. Steps:\n"
        "1. State the ANALYSIS RESULT prominently.\n"
        "2. List example occurrences with Surah names.\n"
        "3. Comment on significance."
    ),
    "general": (
        "The user has a general Islamic question. Steps:\n"
        "1. Give a direct answer first.\n"
        "2. Support with evidence from context.\n"
        "3. Conclude with a summary."
    ),
}
  }
713
 
 
714
  _FORMAT_RULES = """\
715
+ For EVERY supporting evidence, use this exact format:
716
 
717
  ┌─────────────────────────────────────────────┐
718
  │ ❝ {Arabic text} ❞
 
720
  │ 📖 Source: {exact citation from context}
721
  └─────────────────────────────────────────────┘
722
 
723
+ ABSOLUTE RULES:
724
+ • Use ONLY content from the Islamic Context block. Zero outside knowledge.
725
+ • Copy Arabic text and translations VERBATIM from context. Never paraphrase.
726
+ If a specific Hadith/verse is NOT in context → respond with:
727
+ "هذا الحديث/الآية غير موجود في قاعدة البيانات." (Arabic)
728
+ or "This Hadith/verse is not in the available dataset." (English)
729
+ Never invent or guess content.
730
+ • End with: "والله أعلم." (Arabic) or "And Allah knows best." (English)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731
  """
732
 
733
  _SYSTEM_TEMPLATE = """\
 
741
  === OUTPUT FORMAT ===
742
  {fmt}
743
 
744
+ === ISLAMIC CONTEXT ===
 
 
745
  {context}
746
  === END CONTEXT ===
747
  """
 
754
  intent: str,
755
  analysis: Optional[dict] = None,
756
  ) -> List[dict]:
757
+ """Build system and user messages for LLM."""
758
  if analysis:
759
+ by_surah_str = "\n ".join([
760
+ f"Surah {s}: {data['name']} ({data['count']} times)"
761
+ for s, data in analysis["by_surah"].items()
762
+ ])
763
  analysis_block = (
764
  f"\n[ANALYSIS RESULT]\n"
765
+ f"The keyword «{analysis['keyword']}» appears {analysis['total_count']} times.\n"
766
+ f" {by_surah_str}\n"
 
767
  )
768
  context = analysis_block + context
769
 
 
772
  lang_instruction=language_instruction(lang),
773
  task=_TASK_INSTRUCTIONS.get(intent, _TASK_INSTRUCTIONS["general"]),
774
  fmt=_FORMAT_RULES,
 
775
  context=context,
776
  )
777
 
 
786
  ]
787
 
788
 
 
 
 
789
  def _not_found_answer(lang: str) -> str:
790
+ """Safe fallback when confidence is too low."""
 
 
 
791
  if lang == "arabic":
792
  return (
793
+ "لم أجد في قاعدة البيانات ما يكفي للإجابة على هذا السؤال بدقة.\n"
794
+ "يُرجى الرجوع إلى مصادر إسلامية موثوقة.\n"
795
  "والله أعلم."
796
  )
797
  return (
798
  "The available dataset does not contain sufficient information to answer "
799
+ "this question accurately.\nPlease refer to trusted Islamic sources.\n"
 
800
  "And Allah knows best."
801
  )
802
 
803
 
804
+ # ═══════════════════════════════════════════════════════════════════════
805
+ # HADITH GRADE INFERENCE
806
+ # ═══════════════════════════════════════════════════════════════════════
807
def infer_hadith_grade(item: dict) -> dict:
    """Infer a hadith's grade from its collection name when missing.

    Mutates and returns *item*: for a hadith with no explicit ``grade``, the
    grade is derived from the collection/reference text (e.g. anything from
    Bukhari or Muslim is graded "Sahih"). Non-hadith items and hadiths that
    already carry a grade are returned unchanged.

    Fix: tolerates explicit ``None`` values in "collection"/"reference"
    (previously ``None.lower()`` raised AttributeError).
    """
    if item.get("type") != "hadith" or item.get("grade"):
        return item

    # `or ""` guards against keys present with an explicit None value.
    collection = (item.get("collection") or "").lower()
    reference = (item.get("reference") or "").lower()
    combined = f"{collection} {reference}"

    # Ordered rules: the first marker found in `combined` decides the grade,
    # preserving the precedence of the original elif chain.
    rules = (
        # Sahih collections (highest authenticity)
        ("Sahih", ("sahih al-bukhari", "sahih bukhari", "bukhari")),
        ("Sahih", ("sahih muslim", "sahih al-muslim")),
        ("Sahih", ("sunan an-nasai", "sunan an-nasa", "nasa'i", "nasa")),
        # Hasan collections
        ("Hasan", ("jami at-tirmidhi", "tirmidhi", "at-tirmidhi")),
        ("Hasan", ("sunan abu dawood", "abu dawood", "abo daud", "abou daoude")),
        ("Hasan", ("sunan ibn majah", "ibn majah", "ibn maja")),
        ("Hasan", ("muwatta malik", "muwatta", "malik")),
        # Collections added by dataset enrichment
        ("Hasan/Sahih", ("musnad ahmad", "ahmad", "ahmed")),
        ("Hasan", ("sunan al-darimi", "darimi", "al-darimi")),
    )
    for grade, markers in rules:
        if any(marker in combined for marker in markers):
            item["grade"] = grade
            break

    return item
840
+
841
+
842
  # ═══════════════════════════════════════════════════════════════════════
843
  # APP STATE
844
  # ═══════════════════════════════════════════════════════════════════════
 
846
  embed_model: Optional[SentenceTransformer] = None
847
  faiss_index: Optional[faiss.Index] = None
848
  dataset: Optional[list] = None
849
+ llm: Optional[LLMProvider] = None
850
  ready: bool = False
851
 
852
 
 
855
 
856
  @asynccontextmanager
857
  async def lifespan(app: FastAPI):
858
+ """Initialize state on startup."""
859
  logger.info("⏳ Loading embed model: %s", cfg.EMBED_MODEL)
860
  state.embed_model = SentenceTransformer(cfg.EMBED_MODEL)
861
 
 
866
  with open(cfg.METADATA_FILE, "r", encoding="utf-8") as f:
867
  state.dataset = json.load(f)
868
 
869
+ # Infer hadith grades from collection names
870
+ state.dataset = [infer_hadith_grade(item) for item in state.dataset]
871
+
872
+ logger.info("⏳ Initializing LLM provider: %s", cfg.LLM_BACKEND)
873
+ state.llm = get_llm_provider()
 
 
 
 
874
 
875
  state.ready = True
876
  logger.info(
877
+ "✅ QModel v4 ready | backend=%s | dataset=%d | faiss=%d | threshold=%.2f",
878
+ cfg.LLM_BACKEND,
879
+ len(state.dataset) if state.dataset else 0,
880
+ state.faiss_index.ntotal if state.faiss_index else 0,
881
+ cfg.CONFIDENCE_THRESHOLD,
882
  )
883
  yield
884
  state.ready = False
 
889
  # FASTAPI APP
890
  # ═══════════════════════════════════════════════════════════════════════
891
# Application instance; heavy resources (embed model, FAISS index, dataset,
# LLM provider) are loaded by the `lifespan` handler, not at import time.
app = FastAPI(
    title="QModel v4 — Islamic RAG API",
    description="Specialized Quran & Hadith system with dual LLM backend",
    version="4.0.0",
    lifespan=lifespan,
)
897
 
 
912
  content: str = Field(..., min_length=1, max_length=4000)
913
 
914
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
915
class AnalysisResult(BaseModel):
    """Word-frequency analysis result (see count_occurrences)."""

    keyword: str                # raw keyword that was counted
    kw_stemmed: str             # light-stemmed form also matched
    total_count: int            # total occurrences across the Quran items
    by_surah: Dict[int, Dict]   # surah number -> {"name": ..., "count": ...}
    examples: List[dict]        # sample matches, capped at cfg.MAX_EXAMPLES
921
 
922
 
923
class SourceItem(BaseModel):
    """One retrieved Quran/Hadith source attached to an answer."""

    source: str
    type: str                       # "quran" | "hadith"
    grade: Optional[str] = None     # hadith authenticity grade, if known
    arabic: str
    english: str
    # NOTE(review): Pydantic treats underscore-prefixed names as private
    # attributes, not model fields — `_score` is likely dropped from the
    # serialized response. Confirm and consider renaming to `score`.
    _score: float
930
+
931
+
932
class AskResponse(BaseModel):
    """Response schema for the /ask endpoint."""

    question: str
    answer: str
    language: str                               # answer language ("arabic"/"english")
    intent: str                                 # classified intent (fatwa/tafsir/...)
    analysis: Optional[AnalysisResult] = None   # present for count queries only
    sources: List[SourceItem]
    top_score: float                            # best retrieval score (confidence proxy)
    latency_ms: int
941
+
942
+
943
class HadithVerifyResponse(BaseModel):
    """Response schema for /hadith/verify."""

    query: str
    found: bool                         # True when a matching hadith was retrieved
    collection: Optional[str] = None    # populated only when found
    grade: Optional[str] = None
    reference: Optional[str] = None
    arabic: Optional[str] = None
    english: Optional[str] = None
    latency_ms: int
952
+
953
+
954
+ # ═══════════════════════════════════════════════════════════════════════
955
+ # OPENAI-COMPATIBLE SCHEMAS (for Open-WebUI integration)
956
+ # ═══════════════════════════════════════════════════════════════════════
957
class ChatCompletionMessage(BaseModel):
    """Single chat message (OpenAI-compatible schema)."""

    role: str = Field(..., description="Message role: system, user, or assistant")
    content: str = Field(..., description="Message content")
960
+
961
+
962
class ChatCompletionRequest(BaseModel):
    """OpenAI-compatible chat completion request body.

    Defaults for temperature/max_tokens are read from cfg at import time.
    """

    model: str = Field(default="QModel", description="Model name")
    messages: List[ChatCompletionMessage] = Field(..., description="Messages for the model")
    temperature: Optional[float] = Field(default=cfg.TEMPERATURE, ge=0.0, le=2.0)
    top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
    max_tokens: Optional[int] = Field(default=cfg.MAX_TOKENS, ge=1, le=8000)
    top_k: Optional[int] = Field(default=5, ge=1, le=20, description="Islamic sources to retrieve")
    stream: Optional[bool] = Field(default=False, description="Enable streaming responses")
970
+
971
+
972
class ChatCompletionChoice(BaseModel):
    """One completion choice (index + assistant message)."""

    index: int
    message: ChatCompletionMessage
    finish_reason: str = "stop"
976
+
977
+
978
class ChatCompletionResponse(BaseModel):
    """OpenAI-compatible chat completion response envelope."""

    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[ChatCompletionChoice]
    usage: dict                         # token counts (set to -1: not tracked)
    x_metadata: Optional[dict] = None   # QModel-specific metadata (intent, scores, sources)
986
+
987
+
988
class ModelInfo(BaseModel):
    """Single model entry for the OpenAI-compatible /v1/models listing."""

    id: str
    object: str = "model"
    created: int
    owned_by: str = "elgendy"
    permission: List[dict] = Field(default_factory=list)
    root: Optional[str] = None
    parent: Optional[str] = None
996
+
997
+
998
class ModelsListResponse(BaseModel):
    """Envelope for the /v1/models listing (OpenAI-compatible)."""

    object: str = "list"
    data: List[ModelInfo]
1001
 
1002
 
1003
  # ═══════════════════════════════════════════════════════════════════════
1004
+ # CORE RAG PIPELINE
1005
  # ═══════════════════════════════════════════════════════════════════════
1006
+ async def run_rag_pipeline(
1007
+ question: str,
1008
+ top_k: int = cfg.TOP_K_RETURN,
1009
+ source_type: Optional[Literal["quran", "hadith"]] = None,
1010
+ grade_filter: Optional[str] = None,
1011
+ ) -> dict:
1012
+ """Core RAG pipeline: rewrite → search → verify → generate."""
1013
  t0 = time.perf_counter()
1014
 
1015
  # 1. Query rewriting
1016
+ rewrite = await rewrite_query(question, state.llm)
1017
  intent = rewrite.get("intent", "general")
1018
 
1019
  # 2. Intent detection + hybrid search — concurrently
 
1022
  hybrid_search(
1023
  question, rewrite,
1024
  state.embed_model, state.faiss_index, state.dataset,
1025
+ top_k, source_type, grade_filter,
1026
  ),
1027
  )
1028
  analysis_kw, results = await asyncio.gather(kw_task, search_task)
 
1042
  intent, top_score, cfg.CONFIDENCE_THRESHOLD,
1043
  )
1044
 
1045
+ # 5. Confidence gate
 
 
 
 
1046
  if top_score < cfg.CONFIDENCE_THRESHOLD:
1047
  logger.warning(
1048
+ "Low confidence (%.3f < %.2f) — returning safe fallback",
1049
  top_score, cfg.CONFIDENCE_THRESHOLD,
1050
  )
1051
  return {
 
1058
  "latency_ms": int((time.perf_counter() - t0) * 1000),
1059
  }
1060
 
1061
+ # 6. Build context + prompt + LLM call
1062
+ context = build_context(results)
1063
  messages = build_messages(context, question, lang, intent, analysis)
1064
 
 
 
1065
  try:
1066
+ answer = await state.llm.chat(
1067
+ messages,
1068
+ max_tokens=cfg.MAX_TOKENS,
1069
+ temperature=cfg.TEMPERATURE,
 
 
 
1070
  )
1071
+ except Exception as exc:
1072
+ logger.error("LLM call failed: %s", exc)
1073
+ raise HTTPException(status_code=502, detail="LLM service unavailable")
1074
 
1075
  latency = int((time.perf_counter() - t0) * 1000)
1076
  logger.info(
 
1102
  # ═══════════════════════════════════════════════════════════════════════
1103
  @app.get("/health", tags=["ops"])
1104
  def health():
1105
+ """Health check endpoint."""
1106
  return {
1107
  "status": "ok" if state.ready else "initialising",
1108
+ "version": "4.0.0",
1109
+ "llm_backend": cfg.LLM_BACKEND,
1110
  "dataset_size": len(state.dataset) if state.dataset else 0,
1111
  "faiss_total": state.faiss_index.ntotal if state.faiss_index else 0,
1112
  "confidence_threshold": cfg.CONFIDENCE_THRESHOLD,
 
1113
  }
1114
 
1115
 
1116
+ @app.get("/v1/models", response_model=ModelsListResponse, tags=["models"])
1117
  def list_models():
1118
+ """List available models (OpenAI-compatible)."""
1119
+ return ModelsListResponse(
1120
+ data=[
1121
+ ModelInfo(
1122
+ id="QModel",
1123
+ created=int(time.time()),
1124
+ owned_by="elgendy",
1125
+ ),
1126
+ ModelInfo(
1127
+ id="qmodel", # Lowercase variant for compatibility
1128
+ created=int(time.time()),
1129
+ owned_by="elgendy",
1130
+ ),
1131
+ ]
1132
+ )
1133
+
1134
+
1135
+ @app.post("/v1/chat/completions", response_model=ChatCompletionResponse, tags=["inference"])
1136
+ async def chat_completions(request: ChatCompletionRequest):
1137
+ """OpenAI-compatible chat completions endpoint (for Open-WebUI integration)."""
1138
+ _check_ready()
1139
+
1140
+ # Extract user message (last message with role="user")
1141
+ user_messages = [m.content for m in request.messages if m.role == "user"]
1142
+ if not user_messages:
1143
+ raise HTTPException(status_code=400, detail="No user message in request")
1144
+
1145
+ question = user_messages[-1]
1146
+ top_k = request.top_k or cfg.TOP_K_RETURN
1147
+ temperature = request.temperature or cfg.TEMPERATURE
1148
+ max_tokens = request.max_tokens or cfg.MAX_TOKENS
1149
+
1150
+ try:
1151
+ result = await run_rag_pipeline(question, top_k=top_k)
1152
+ except HTTPException:
1153
+ raise
1154
+ except Exception as exc:
1155
+ logger.error("Pipeline error: %s", exc)
1156
+ raise HTTPException(status_code=500, detail=str(exc))
1157
+
1158
+ # Handle streaming if requested
1159
+ if request.stream:
1160
+ return StreamingResponse(
1161
+ _stream_response(result, request.model),
1162
+ media_type="text/event-stream",
1163
+ )
1164
+
1165
+ # Format response in OpenAI schema
1166
+ return ChatCompletionResponse(
1167
+ id=f"qmodel-{int(time.time() * 1000)}",
1168
+ created=int(time.time()),
1169
+ model=request.model,
1170
+ choices=[
1171
+ ChatCompletionChoice(
1172
+ index=0,
1173
+ message=ChatCompletionMessage(
1174
+ role="assistant",
1175
+ content=result["answer"],
1176
+ ),
1177
+ )
1178
+ ],
1179
+ usage={
1180
+ "prompt_tokens": -1,
1181
+ "completion_tokens": -1,
1182
+ "total_tokens": -1,
1183
+ },
1184
+ x_metadata={
1185
+ "language": result["language"],
1186
+ "intent": result["intent"],
1187
+ "top_score": round(result["top_score"], 4),
1188
+ "latency_ms": result["latency_ms"],
1189
+ "sources_count": len(result["sources"]),
1190
+ "sources": [
1191
+ {
1192
+ "source": s.get("source") or s.get("reference", ""),
1193
+ "type": s.get("type", ""),
1194
+ "grade": s.get("grade"),
1195
+ "score": round(s.get("_score", 0), 4),
1196
+ }
1197
+ for s in result.get("sources", [])[:5]
1198
+ ],
1199
+ "analysis": result.get("analysis"),
1200
+ },
1201
+ )
1202
+
1203
+
1204
+ async def _stream_response(result: dict, model: str):
1205
+ """Stream response chunks in OpenAI format."""
1206
+ import json
1207
+
1208
+ # Send answer in chunks
1209
+ answer = result.get("answer", "")
1210
+ for line in answer.split("\n"):
1211
+ chunk = {
1212
+ "id": f"qmodel-{int(time.time() * 1000)}",
1213
+ "object": "chat.completion.chunk",
1214
+ "created": int(time.time()),
1215
+ "model": model,
1216
+ "choices": [{
1217
+ "index": 0,
1218
+ "delta": {"content": line + "\n"},
1219
+ "finish_reason": None,
1220
+ }],
1221
+ }
1222
+ yield f"data: {json.dumps(chunk)}\n\n"
1223
+
1224
+ # Send final chunk
1225
+ final_chunk = {
1226
+ "id": f"qmodel-{int(time.time() * 1000)}",
1227
+ "object": "chat.completion.chunk",
1228
+ "created": int(time.time()),
1229
+ "model": model,
1230
+ "choices": [{
1231
+ "index": 0,
1232
+ "delta": {},
1233
+ "finish_reason": "stop",
1234
  }],
1235
  }
1236
+ yield f"data: {json.dumps(final_chunk)}\n\n"
1237
+ yield "data: [DONE]\n\n"
1238
+
1239
+
1240
+ @app.get("/ask", response_model=AskResponse, tags=["inference"])
1241
+ async def ask(
1242
+ q: str = Query(..., min_length=1, max_length=1000, description="Your Islamic question"),
1243
+ top_k: int = Query(cfg.TOP_K_RETURN, ge=1, le=20, description="Number of sources"),
1244
+ source_type: Optional[str] = Query(None, description="Filter: quran|hadith"),
1245
+ grade_filter: Optional[str] = Query(None, description="Filter Hadith: sahih|hasan|,all"),
1246
+ ):
1247
+ """Main inference endpoint."""
1248
+ _check_ready()
1249
+ result = await run_rag_pipeline(q, top_k, source_type, grade_filter)
1250
+
1251
+ sources = [
1252
+ SourceItem(
1253
+ source=r.get("source") or r.get("reference") or "Unknown",
1254
+ type=r.get("type", "unknown"),
1255
+ grade=r.get("grade"),
1256
+ arabic=r.get("arabic", ""),
1257
+ english=r.get("english", ""),
1258
+ _score=r.get("_score", 0.0),
1259
+ )
1260
+ for r in result["sources"]
1261
+ ]
1262
+
1263
+ return AskResponse(
1264
+ question=q,
1265
+ answer=result["answer"],
1266
+ language=result["language"],
1267
+ intent=result["intent"],
1268
+ analysis=result["analysis"],
1269
+ sources=sources,
1270
+ top_score=result["top_score"],
1271
+ latency_ms=result["latency_ms"],
1272
+ )
1273
+
1274
+
1275
+ @app.get("/hadith/verify", response_model=HadithVerifyResponse, tags=["hadith"])
1276
+ async def verify_hadith(
1277
+ q: str = Query(..., description="First few words or query of Hadith"),
1278
+ collection: Optional[str] = Query(None, description="Filter: bukhari|muslim|all"),
1279
+ ):
1280
+ """Verify if a Hadith is in authenticated collections."""
1281
+ _check_ready()
1282
+ t0 = time.perf_counter()
1283
+
1284
+ results = await hybrid_search(
1285
+ q, {"ar_query": q, "en_query": q, "keywords": q.split(), "intent": "hadith"},
1286
+ state.embed_model, state.faiss_index, state.dataset,
1287
+ top_n=5, source_type="hadith", grade_filter="sahih",
1288
+ )
1289
+
1290
+ if results:
1291
+ r = results[0]
1292
+ return HadithVerifyResponse(
1293
+ query=q,
1294
+ found=True,
1295
+ collection=r.get("collection"),
1296
+ grade=r.get("grade"),
1297
+ reference=r.get("reference"),
1298
+ arabic=r.get("arabic"),
1299
+ english=r.get("english"),
1300
+ latency_ms=int((time.perf_counter() - t0) * 1000),
1301
+ )
1302
+
1303
+ return HadithVerifyResponse(
1304
+ query=q,
1305
+ found=False,
1306
+ latency_ms=int((time.perf_counter() - t0) * 1000),
1307
+ )
1308
 
1309
 
1310
  @app.get("/debug/scores", tags=["ops"])
1311
  async def debug_scores(
1312
+ q: str = Query(..., min_length=1, max_length=1000),
1313
  top_k: int = Query(10, ge=1, le=20),
1314
  ):
1315
+ """Debug: inspect raw retrieval scores without LLM."""
1316
  _check_ready()
1317
+ rewrite = await rewrite_query(q, state.llm)
1318
  results = await hybrid_search(q, rewrite, state.embed_model, state.faiss_index, state.dataset, top_k)
1319
  return {
1320
  "intent": rewrite.get("intent"),
 
1324
  "rank": i + 1,
1325
  "source": r.get("source") or r.get("reference"),
1326
  "type": r.get("type"),
1327
+ "grade": r.get("grade"),
1328
  "_dense": round(r.get("_dense", 0), 4),
1329
  "_sparse": round(r.get("_sparse", 0), 4),
1330
  "_score": round(r.get("_score", 0), 4),
 
1331
  }
1332
  for i, r in enumerate(results)
1333
  ],
1334
  }
1335
 
1336
 
1337
+ if __name__ == "__main__":
1338
+ import uvicorn
1339
+ uvicorn.run(app, host="0.0.0.0", port=8000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,9 +1,21 @@
1
- sentence-transformers
2
- faiss-cpu
3
- fastapi
4
- uvicorn
5
- numpy
6
- accelerate
7
- torch
8
- ollama
9
- python-dotenv
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web framework
2
+ fastapi==0.104.1
3
+ uvicorn[standard]==0.24.0
4
+ pydantic==2.4.2
5
+
6
+ # Core: Embeddings & Search
7
+ sentence-transformers==2.2.2
8
+ faiss-cpu==1.7.4
9
+ numpy==1.24.3
10
+
11
+ # Optional: HuggingFace backend
12
+ transformers==4.34.1
13
+ torch==2.1.1
14
+ accelerate==0.24.1
15
+
16
+ # Optional: Ollama backend
17
+ ollama==0.0.48
18
+
19
+ # Configuration & Data
20
+ python-dotenv==1.0.0
21
+ requests==2.31.0