aelgendy commited on
Commit
20edea9
·
1 Parent(s): 1e67e7f

Upload folder using huggingface_hub

Browse files
Files changed (14) hide show
  1. .env.example +96 -0
  2. .gitattributes +0 -2
  3. .gitignore +28 -2
  4. ARCHITECTURE.md +235 -0
  5. DOCKER.md +443 -0
  6. Dockerfile +31 -9
  7. OPEN_WEBUI.md +385 -0
  8. README.md +294 -16
  9. SETUP.md +590 -0
  10. build_index.py +74 -64
  11. docker-compose.yml +42 -4
  12. enrich_dataset.py +210 -0
  13. main.py +695 -346
  14. requirements.txt +21 -9
.env.example ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QModel v4 Configuration Template
2
+ # ==================================
3
+ # Copy this to .env and update values for your environment
4
+
5
+ # LLM Backend Selection
6
+ # Options: "hf" (HuggingFace) or "ollama"
7
+ LLM_BACKEND=ollama
8
+
9
+ # ─────────────────────────────────────────────────────────────────────
10
+ # OLLAMA BACKEND (if LLM_BACKEND=ollama)
11
+ # ─────────────────────────────────────────────────────────────────────
12
+ OLLAMA_HOST=http://localhost:11434
13
+ OLLAMA_MODEL=minimax-m2.7:cloud
14
+ # Available models: llama3.1, mistral, neural-chat, openhermes
15
+
16
+ # ─────────────────────────────────────────────────────────────────────
17
+ # HUGGINGFACE BACKEND (if LLM_BACKEND=hf)
18
+ # ─────────────────────────────────────────────────────────────────────
19
+ # HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
20
+ # HF_DEVICE=auto # Options: auto, cuda, cpu
21
+ # HF_MAX_NEW_TOKENS=2048
22
+ # Popular models:
23
+ # - Qwen/Qwen2-7B-Instruct (excellent Arabic)
24
+ # - mistralai/Mistral-7B-Instruct-v0.2
25
+ # - meta-llama/Llama-2-13b-chat-hf
26
+
27
+ # ─────────────────────────────────────────────────────────────────────
28
+ # EMBEDDING MODEL (shared by both backends)
29
+ # ─────────────────────────────────────────────────────────────────────
30
+ EMBED_MODEL=intfloat/multilingual-e5-large
31
+
32
+ # ─────────────────────────────────────────────────────────────────────
33
+ # DATA FILES
34
+ # ─────────────────────────────────────────────────────────────────────
35
+ FAISS_INDEX=QModel.index
36
+ METADATA_FILE=metadata.json
37
+
38
+ # ─────────────────────────────────────────────────────────────────────
39
+ # RETRIEVAL SETTINGS
40
+ # ─────────────────────────────────────────────────────────────────────
41
+ TOP_K_SEARCH=20 # Candidate pool size
42
+ TOP_K_RETURN=5 # Final results returned to user
43
+
44
+ # ─────────────────────────────────────────────────────────────────────
45
+ # GENERATION SETTINGS
46
+ # ─────────────────────────────────────────────────────────────────────
47
+ TEMPERATURE=0.2 # 0.0=deterministic, 1.0=creative
48
+ MAX_TOKENS=2048 # Max output length
49
+
50
+ # ─────────────────────────────────────────────────────────────────────
51
+ # SAFETY & QUALITY
52
+ # ─────────────────────────────────────────────────────────────────────
53
+ # Confidence threshold: Below this score, skip LLM and return "not found"
54
+ # Prevents hallucinations but may miss valid results
55
+ # Range: 0.0-1.0 (default 0.30)
56
+ # Tune up (0.50+) for stricter, tune down (0.20) for looser
57
+ CONFIDENCE_THRESHOLD=0.30
58
+
59
+ # Hadith boost: Score bonus when intent=hadith
60
+ # Prevents Quran verses from outranking relevant Hadiths
61
+ HADITH_BOOST=0.08
62
+
63
+ # ─────────────────────────────────────────────────────────────────────
64
+ # RANKING
65
+ # ─────────────────────────────────────────────────────────────────────
66
+ RERANK_ALPHA=0.6 # 60% dense (embedding), 40% sparse (BM25)
67
+
68
+ # ─────────────────────────────────────────────────────────────────────
69
+ # CACHING
70
+ # ─────────────────────────────────────────────────────────────────────
71
+ CACHE_SIZE=512 # Max cache entries
72
+ CACHE_TTL=3600 # Cache expiry in seconds
73
+
74
+ # ─────────────────────────────────────────────────────────────────────
75
+ # SECURITY
76
+ # ─────────────────────────────────────────────────────────────────────
77
+ ALLOWED_ORIGINS=* # CORS origins (restrict in production: origin1.com,origin2.com)
78
+
79
+ # ─────────────────────────────────────────────────────────────────────
80
+ # USAGE EXAMPLES
81
+ # ─────────────────────────────────────────────────────────────────────
82
+ #
83
+ # Development (Ollama):
84
+ # LLM_BACKEND=ollama
85
+ # OLLAMA_HOST=http://localhost:11434
86
+ # OLLAMA_MODEL=llama2
87
+ #
88
+ # Production (HuggingFace GPU):
89
+ # LLM_BACKEND=hf
90
+ # HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
91
+ # HF_DEVICE=cuda
92
+ #
93
+ # Production (HuggingFace CPU):
94
+ # LLM_BACKEND=hf
95
+ # HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
96
+ # HF_DEVICE=cpu
.gitattributes CHANGED
@@ -1,4 +1,2 @@
1
  # Auto detect text files and perform LF normalization
2
  * text=auto
3
- QModel.index filter=lfs diff=lfs merge=lfs -text
4
- metadata.json filter=lfs diff=lfs merge=lfs -text
 
1
  # Auto detect text files and perform LF normalization
2
  * text=auto
 
 
.gitignore CHANGED
@@ -173,12 +173,38 @@ cython_debug/
173
  # PyPI configuration file
174
  .pypirc
175
 
176
- # Cursor
177
- # Cursor is an AI-powered code editor.`.cursorignore` specifies files/directories to
178
  # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
179
  # refer to https://docs.cursor.com/context/ignore-files
180
  .cursorignore
181
  .cursorindexingignore
182
 
 
 
 
 
 
 
183
  .DS_Store
 
 
 
 
 
 
 
 
184
  data/
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  # PyPI configuration file
174
  .pypirc
175
 
176
+ # Cursor
177
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
178
  # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
179
  # refer to https://docs.cursor.com/context/ignore-files
180
  .cursorignore
181
  .cursorindexingignore
182
 
183
+ # IDE and Editor Settings
184
+ .vscode/
185
+ .idea/
186
+ *.swp
187
+ *.swo
188
+ *~
189
  .DS_Store
190
+ Thumbs.db
191
+
192
+ # Local Environment Files
193
+ .env
194
+ .env.local
195
+ .env*.local
196
+
197
+ # Development Artifacts
198
  data/
199
+ *.log
200
+ *.tmp
201
+ .cache/
202
+
203
+ # Editor/IDE specific
204
+ *.sublime-project
205
+ *.sublime-workspace
206
+ .vim/
207
+ .emacs.d/.DS_Store
208
+
209
+ QModel.index
210
+ metadata.json
ARCHITECTURE.md ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QModel v4 Architecture — Detailed System Design
2
+
3
+ > For a quick overview, see [README.md](README.md#architecture-overview)
4
+
5
+ ## System Vision
6
+ A RAG system specialized **exclusively** in authenticated Qur'an and Hadith. No hallucinations, no outside knowledge—only content from verified sources.
7
+
8
+ ## Core Capabilities
9
+
10
+ ### 1. **Quran Analysis**
11
+ - **Verse Lookup**: Find verses by topic, keyword, or Surah
12
+ - **Word Frequency**: Count word/phrase occurrences across all 114 Surahs
13
+ - **Topic Tafsir**: Retrieve and explain related Quranic verses
14
+ - **Bilingual**: Arabic (Uthmani) + English (Saheeh International)
15
+
16
+ ### 2. **Hadith Operations**
17
+ - **Authentication Status**: Verify if a Hadith is in an authenticated collection
18
+ - **Grade Display**: Show authenticity grade (Sahih, Hasan, Da'if, etc.)
19
+ - **Topic Search**: Find Hadiths related to topics across 7 major collections
20
+ - **Collection Navigation**: Filter by Bukhari, Muslim, Abu Dawud, Tirmidhi, Ibn Majah, Nasa'i, Malik
21
+
22
+ ### 3. **Safety First**
23
+ - **Confidence Gating**: Low-confidence queries return "not found" instead of LLM guess
24
+ - **Source Attribution**: Every answer cites exact verse/Hadith with reference
25
+ - **Grade Filtering**: Optional: only return Sahih-authenticated Hadiths
26
+ - **Verbatim Quotes**: Copy text directly from data, no paraphrasing
27
+
28
+ ---
29
+
30
+ ## Data Pipeline
31
+
32
+ The system follows a three-phase approach:
33
+
34
+ **Metadata Schema**:
35
+ ```json
36
+ {
37
+ "id": "surah:verse or hadith_prefix_number",
38
+ "arabic": "...",
39
+ "english": "...",
40
+ "source": "Surah Al-Baqarah 2:43 | Sahih al-Bukhari 1",
41
+ "type": "quran | hadith",
42
+
43
+ // Quran only
44
+ "surah_number": 2,
45
+ "surah_name_en": "Al-Baqarah",
46
+ "surah_name_ar": "البقرة",
47
+ "verse_number": 43,
48
+
49
+ // Hadith only
50
+ "collection": "Sahih al-Bukhari",
51
+ "grade": "Sahih",
52
+ "hadith_number": 1
53
+ }
54
+ ```
55
+
56
+ ### Phase 2: Indexing
57
+ ```
58
+ build_index.py
59
+ ├── Load Quran + Hadith JSON
60
+ ├── Encode all texts with multilingual-e5-large
61
+ │ ├── Dual embeddings: Arabic + English per item
62
+ │ └── Normalize before encoding
63
+ └── Build FAISS IndexFlatIP for dense retrieval
64
+ ```
65
+
66
+ ### Phase 3: Retrieval & Ranking
67
+
68
+ **Hybrid Search Algorithm**:
69
+ 1. Dense retrieval: FAISS semantic scoring
70
+ 2. Sparse retrieval: BM25 term-frequency ranking
71
+ 3. Fusion: 60% dense + 40% sparse
72
+ 4. Intent-aware boost: +0.08 to Hadith items when intent=hadith
73
+ 5. Type filter: Optional (quran_only / hadith_only / authenticated_only)
74
+
75
+ ---
76
+
77
+ ## Core Components
78
+
79
+ ### `fetch_data.py` — Data Acquisition
80
+ - Fetches complete Quran and 7 Hadith collections
81
+ - Handles network retries + CDN redirects
82
+ - Normalizes and validates data
83
+ - Exports `data/quran.json` and `data/hadith.json`
84
+
85
+ ### `build_index.py` — Index Construction
86
+ - Loads datasets and embeddings model
87
+ - Creates dual-language FAISS vectors
88
+ - Serializes to `QModel.index` + `metadata.json`
89
+
90
+ ### `main.py` — Inference Engine
91
+ **Three processing layers**:
92
+
93
+ 1. **Query Layer** (Rewriting & Intent Detection)
94
+ - `rewrite_query()` — dual-language normalization, spelling correction
95
+ - `detect_analysis_intent()` — detects word frequency queries
96
+ - `detect_language()` — routes to Arabic or English persona
97
+
98
+ 2. **Retrieval Layer** (Semantic Search)
99
+ - `hybrid_search()` — FAISS + BM25 fusion
100
+ - `count_occurrences()` — exact/stemmed word frequency across dataset
101
+ - Caching at query level for fast follow-ups
102
+
103
+ 3. **Generation Layer** (Safe LLM Call)
104
+ - `chat_with_fallback()` — Ollama with 3-model fallback chain
105
+ - `build_context()` — formats retrieved items with scores
106
+ - `build_messages()` — intent-aware prompts with few-shot examples
107
+ - Confidence gate: skips LLM if top_score < threshold
108
+
109
+ **Anti-Hallucination Measures**:
110
+ - Few-shot examples including "not found" refusal path
111
+ - Hardcoded format rules (box/citation format required)
112
+ - Verbatim copy rules (no reconstruction from memory)
113
+ - Confidence threshold gating (default: 0.30)
114
+
115
+ ---
116
+
117
+ ## API Endpoints
118
+
119
+ ### `GET /ask?q=<question>&top_k=5`
120
+ Returns structured Islamic answer with full lineage.
121
+
122
+ **Response**:
123
+ ```json
124
+ {
125
+ "question": "...",
126
+ "answer": "...",
127
+ "language": "arabic | english | mixed",
128
+ "intent": "tafsir | hadith | fatwa | count | general",
129
+ "analysis": {
130
+ "keyword": "محمد",
131
+ "total_count": 157,
132
+ "examples": [...]
133
+ },
134
+ "sources": [
135
+ {
136
+ "rank": 1,
137
+ "source": "Sahih al-Bukhari 1",
138
+ "type": "hadith",
139
+ "grade": "Sahih",
140
+ "_score": 0.876
141
+ }
142
+ ],
143
+ "top_score": 0.876,
144
+ "latency_ms": 342
145
+ }
146
+ ```
147
+
148
+ ### `GET /debug/scores?q=<question>&top_k=10`
149
+ Inspect raw retrieval scores without LLM call. Use to calibrate `CONFIDENCE_THRESHOLD`.
150
+
151
+ ### `POST /v1/chat/completions`
152
+ OpenAI-compatible endpoint for language model clients.
153
+
154
+ ---
155
+
156
+ ## Configuration
157
+
158
+ **`.env` priority**:
159
+ ```
160
+ OLLAMA_HOST # Ollama server URL
161
+ LLM_MODEL # Primary model (e.g. minimax-m2.7:cloud)
162
+ EMBED_MODEL # Embedding model (intfloat/multilingual-e5-large)
163
+ FAISS_INDEX # Path to QModel.index
164
+ METADATA_FILE # Path to metadata.json
165
+ CONFIDENCE_THRESHOLD # Min hybrid score for LLM call (default: 0.30)
166
+ HADITH_BOOST # Intent-aware boost for Hadith (default: 0.08)
167
+ TOP_K_SEARCH # Retrieval candidate pool (default: 20)
168
+ TOP_K_RETURN # Results returned to user (default: 5)
169
+ TEMPERATURE # LLM creativity (default: 0.2 for factual)
170
+ ```
171
+
172
+ ---
173
+
174
+ ## Deployment
175
+
176
+ ### Local Development
177
+ ```bash
178
+ python main.py
179
+ # API at http://localhost:8000
180
+ # Docs at http://localhost:8000/docs
181
+ ```
182
+
183
+ ### Docker
184
+ ```bash
185
+ docker-compose up
186
+ # Ollama on port 11434
187
+ # QModel on port 8000
188
+ ```
189
+
190
+ ---
191
+
192
+ ## Testing the System
193
+
194
+ ### 1. Word Frequency Query
195
+ ```
196
+ Q: "How many times is the word 'mercy' mentioned in the Quran?"
197
+ → Detects 'count' intent
198
+ → Calls count_occurrences()
199
+ → Returns: 114 occurrences with examples
200
+ ```
201
+
202
+ ### 2. Hadith Authenticity Check
203
+ ```
204
+ Q: "Is the Hadith 'Actions are judged by intentions' authentic?"
205
+ → Searches dataset
206
+ → Returns: "Sahih al-Bukhari 1 — Grade: Sahih"
207
+ → LLM elaborates on significance
208
+ ```
209
+
210
+ ### 3. Topic-Based Aya Retrieval
211
+ ```
212
+ Q: "What does the Quran say about patience?"
213
+ → Retrieves top 5 verses about patience
214
+ → Returns: Verses with Tafsir and interconnections
215
+ ```
216
+
217
+ ### 4. Confidence Gate in Action
218
+ ```
219
+ Q: "Who was Muhammad's 7th wife?" (not in dataset)
220
+ → Retrieval score: 0.15 (below 0.30 threshold)
221
+ → Returns: "Not in available dataset"
222
+ → LLM not called (prevents hallucination)
223
+ ```
224
+
225
+ ---
226
+
227
+ ## Roadmap: v4 Enhancements
228
+
229
+ - [ ] Grade-based filtering: `?grade=sahih` to return only authenticated Hadiths
230
+ - [ ] Chain of narrators: Display Isnad with full narrator details
231
+ - [ ] Synonym expansion: Better topic matching (e.g., "mercy" → "rahma, compassion")
232
+ - [ ] Multi-Surah topics: Topics spanning multiple Surahs
233
+ - [ ] Batch processing: Handle multiple questions in one request
234
+ - [ ] Streaming responses: SSE for long-form answers
235
+ - [ ] Islamic calendar integration: Hijri date references
DOCKER.md ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QModel Docker Guide
2
+
3
+ Complete guide for running QModel in Docker with both backend options.
4
+
5
+ ## Quick Start
6
+
7
+ ### Option 1: Docker Compose (Recommended)
8
+
9
+ ```bash
10
+ # 1. Copy example config
11
+ cp .env.example .env
12
+
13
+ # 2. Edit .env and choose your backend (see below)
14
+ nano .env
15
+
16
+ # 3. Run with compose
17
+ docker-compose up
18
+ ```
19
+
20
+ API available at: `http://localhost:8000`
21
+
22
+ ### Option 2: Docker CLI
23
+
24
+ ```bash
25
+ # Build image
26
+ docker build -t qmodel .
27
+
28
+ # Run with Ollama backend
29
+ docker run -p 8000:8000 \
30
+ --env-file .env \
31
+ --add-host host.docker.internal:host-gateway \
32
+ qmodel
33
+
34
+ # Or run with HuggingFace backend
35
+ docker run -p 8000:8000 \
36
+ --env-file .env \
37
+ --env HF_TOKEN=your_token_here \
38
+ qmodel
39
+ ```
40
+
41
+ ---
42
+
43
+ ## Backend Configuration
44
+
45
+ Configure which backend to use via `.env` file:
46
+
47
+ ### Backend 1: Ollama (Local)
48
+
49
+ **Best for**: Development, testing, Docker Desktop
50
+
51
+ ```bash
52
+ # .env
53
+ LLM_BACKEND=ollama
54
+ OLLAMA_HOST=http://host.docker.internal:11434
55
+ OLLAMA_MODEL=llama2
56
+ ```
57
+
58
+ **Prerequisites**:
59
+ - Ollama installed on host machine
60
+ - Running: `ollama serve`
61
+ - Model pulled: `ollama pull llama2`
62
+
63
+ **Why**:
64
+ - ✅ Fast setup
65
+ - ✅ No GPU required
66
+ - ✅ Works on Docker Desktop (Mac/Windows)
67
+ - ❌ Requires host Ollama service
68
+
69
+ ### Backend 2: HuggingFace (Remote)
70
+
71
+ **Best for**: Production, GPU servers, containerized environments
72
+
73
+ ```bash
74
+ # .env
75
+ LLM_BACKEND=hf
76
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
77
+ HF_DEVICE=auto
78
+ ```
79
+
80
+ **Prerequisites**:
81
+ - GPU (recommended) OR significant RAM
82
+ - HuggingFace token (for gated models)
83
+
84
+ **Passing HF Token**:
85
+ ```bash
86
+ # Via docker-compose
87
+ export HF_TOKEN=your_token_here
88
+ docker-compose up
89
+
90
+ # Via docker run
91
+ docker run -p 8000:8000 \
92
+ --env-file .env \
93
+ --env HF_TOKEN=your_token_here \
94
+ qmodel
95
+ ```
96
+
97
+ ---
98
+
99
+ ## Docker Compose Configuration
100
+
101
+ The `docker-compose.yml` includes:
102
+
103
+ | Setting | Value | Description |
104
+ |---------|-------|-------------|
105
+ | **Image** | Builds from `Dockerfile` | Python 3.11 + dependencies |
106
+ | **Port** | `8000:8000` | API port mapping |
107
+ | **Env File** | `.env` | Configuration source |
108
+ | **HF Token** | From `.env` or `${HF_TOKEN}` | For HuggingFace auth |
109
+ | **Ollama Host** | `host.docker.internal:11434` | Connect to host Ollama |
110
+ | **Volumes** | `.:/app` | Code changes sync (dev mode) |
111
+ | **HF Cache** | `/root/.cache/huggingface` | Persistent model cache |
112
+ | **Networks** | `qmodel-network` | Internal network |
113
+ | **Health Check** | `/health` endpoint | Auto-restart on failure |
114
+
115
+ ### For Production
116
+
117
+ Modify `docker-compose.yml`:
118
+ ```yaml
119
+ services:
120
+ qmodel:
121
+ # ... (same as above)
122
+ volumes:
123
+ # Remove live code volume
124
+ - huggingface_cache:/root/.cache/huggingface
125
+ restart: on-failure:5
126
+ ```
127
+
128
+ ---
129
+
130
+ ## Examples
131
+
132
+ ### Development with Ollama
133
+
134
+ ```bash
135
+ # Terminal 1: Start Ollama
136
+ ollama serve
137
+
138
+ # Terminal 2: Run QModel
139
+ cat > .env << EOF
140
+ LLM_BACKEND=ollama
141
+ OLLAMA_HOST=http://host.docker.internal:11434
142
+ OLLAMA_MODEL=llama2
143
+ TEMPERATURE=0.2
144
+ CONFIDENCE_THRESHOLD=0.30
145
+ EOF
146
+
147
+ docker-compose up
148
+ ```
149
+
150
+ Access: `http://localhost:8000`
151
+
152
+ ### Production with HuggingFace
153
+
154
+ ```bash
155
+ # Create .env for production
156
+ cat > .env << EOF
157
+ LLM_BACKEND=hf
158
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
159
+ HF_DEVICE=auto
160
+ TEMPERATURE=0.1
161
+ CONFIDENCE_THRESHOLD=0.35
162
+ ALLOWED_ORIGINS=yourdomain.com
163
+ EOF
164
+
165
+ # Export HF token
166
+ export HF_TOKEN=hf_xxxxxxxxxxxxx
167
+
168
+ # Run
169
+ docker-compose up -d
170
+ docker-compose logs -f
171
+ ```
172
+
173
+ ### Detached Mode
174
+
175
+ ```bash
176
+ # Run in background
177
+ docker-compose up -d
178
+
179
+ # View logs
180
+ docker-compose logs -f
181
+
182
+ # Check status
183
+ docker-compose ps
184
+
185
+ # Stop
186
+ docker-compose down
187
+ ```
188
+
189
+ ---
190
+
191
+ ## Troubleshooting
192
+
193
+ ### "Cannot connect to Ollama"
194
+
195
+ **Symptom**: `ConnectionRefusedError` when using Ollama backend
196
+
197
+ **Solution**:
198
+ ```bash
199
+ # Ensure Ollama is running on host
200
+ ollama serve
201
+
202
+ # Verify in Docker container
203
+ docker run --add-host host.docker.internal:host-gateway qmodel \
204
+ python -c "import requests; print(requests.get('http://host.docker.internal:11434/api/tags').json())"
205
+ ```
206
+
207
+ ### "HuggingFace model not found"
208
+
209
+ **Symptom**: `OSError: ... not found`
210
+
211
+ **Solution**:
212
+ ```bash
213
+ # Check HF token is set
214
+ echo $HF_TOKEN
215
+
216
+ # If not set, export it
217
+ export HF_TOKEN=hf_xxxxxxxxxxxxx
218
+ docker-compose up
219
+ ```
220
+
221
+ ### "Out of memory"
222
+
223
+ **Symptom**: Container exits with no error message
224
+
225
+ **Solution**:
226
+ - Use smaller model: `HF_MODEL_NAME=mistralai/Mistral-7B-Instruct-v0.2`
227
+ - Use Ollama with `neural-chat` model
228
+ - Increase Docker memory limits:
229
+
230
+ ```bash
231
+ # Edit docker-compose.yml
232
+ services:
233
+ qmodel:
234
+ deploy:
235
+ resources:
236
+ limits:
237
+ memory: 16G
238
+ ```
239
+
240
+ ### "Port already in use"
241
+
242
+ **Symptom**: `Address already in use`
243
+
244
+ **Solution**:
245
+ ```bash
246
+ # Change port in docker-compose.yml
247
+ ports:
248
+ - "8001:8000"
249
+
250
+ # Or kill existing container
251
+ docker-compose down
252
+ docker system prune
253
+ ```
254
+
255
+ ---
256
+
257
+ ## Building Custom Images
258
+
259
+ ### Build for Specific Backend
260
+
261
+ No code changes needed - just use `.env` to configure.
262
+
263
+ ### Build with Custom Requirements
264
+
265
+ ```bash
266
+ # Edit requirements.txt, then rebuild
267
+ docker build -t qmodel:custom .
268
+ ```
269
+
270
+ ### Push to Registry
271
+
272
+ ```bash
273
+ # Tag for registry
274
+ docker tag qmodel myregistry/qmodel:v4.1
275
+
276
+ # Push
277
+ docker push myregistry/qmodel:v4.1
278
+
279
+ # Run from registry
280
+ docker run -p 8000:8000 \
281
+ --env-file .env \
282
+ myregistry/qmodel:v4.1
283
+ ```
284
+
285
+ ---
286
+
287
+ ## Performance Tips
288
+
289
+ ### Docker Compose with GPU (Linux)
290
+
291
+ ```yaml
292
+ services:
293
+ qmodel:
294
+ deploy:
295
+ resources:
296
+ reservations:
297
+ devices:
298
+ - driver: nvidia
299
+ count: 1
300
+ capabilities: [gpu]
301
+ ```
302
+
303
+ Then set in `.env`:
304
+ ```bash
305
+ HF_DEVICE=cuda
306
+ ```
307
+
308
+ ### Reduce Memory Usage
309
+
310
+ ```bash
311
+ # In .env
312
+ HF_MODEL_NAME=gpt2 # Tiny model
313
+ OLLAMA_MODEL=orca-mini # Smaller Ollama model
314
+ TOP_K_SEARCH=10 # Fewer candidates
315
+ ```
316
+
317
+ ### Cache Management
318
+
319
+ ```bash
320
+ # Clear HuggingFace cache
321
+ docker-compose down
322
+ docker volume rm qmodel_huggingface_cache
323
+
324
+ # Or cleanup all
325
+ docker system prune -a
326
+ ```
327
+
328
+ ---
329
+
330
+ ## Docker Networking
331
+
332
+ ### Access QModel from Host
333
+
334
+ ```bash
335
+ # Default (works)
336
+ curl http://localhost:8000/health
337
+ ```
338
+
339
+ ### Custom Network
340
+
341
+ ```bash
342
+ # Create network
343
+ docker network create qmodel-net
344
+
345
+ # Run with network
346
+ docker-compose -f docker-compose.yml up
347
+ ```
348
+
349
+ ### Multiple Containers
350
+
351
+ ```yaml
352
+ # docker-compose.yml
353
+ services:
354
+ qmodel:
355
+ networks:
356
+ - custom-network
357
+ other-service:
358
+ networks:
359
+ - custom-network
360
+
361
+ networks:
362
+ custom-network:
363
+ driver: bridge
364
+ ```
365
+
366
+ ---
367
+
368
+ ## CI/CD Integration
369
+
370
+ ### GitHub Actions Example
371
+
372
+ ```yaml
373
+ name: Deploy QModel
374
+
375
+ on: [push]
376
+
377
+ jobs:
378
+ deploy:
379
+ runs-on: ubuntu-latest
380
+ steps:
381
+ - uses: actions/checkout@v2
382
+
383
+ - name: Build Docker image
384
+ run: docker build -t qmodel .
385
+
386
+ - name: Run tests
387
+ run: |
388
+ docker run -p 8000:8000 qmodel &
389
+ sleep 30
390
+ curl http://localhost:8000/health
391
+
392
+ - name: Push to registry
393
+ run: |
394
+ echo ${{ secrets.REGISTRY_TOKEN }} | docker login -u ${{ secrets.REGISTRY_USER }} --password-stdin
395
+ docker tag qmodel myregistry/qmodel:${{ github.sha }}
396
+ docker push myregistry/qmodel:${{ github.sha }}
397
+ ```
398
+
399
+ ---
400
+
401
+ ## Security Considerations
402
+
403
+ ### Secrets Management
404
+
405
+ ```bash
406
+ # Don't commit .env with real tokens
407
+ echo ".env" >> .gitignore
408
+
409
+ # Use Docker secrets (Swarm mode)
410
+ docker secret create hf_token -
411
+ # Then use in compose:
412
+ # HF_TOKEN=${HF_TOKEN_FILE}
413
+ ```
414
+
415
+ ### CORS Configuration
416
+
417
+ ```bash
418
+ # In .env (restrict in production)
419
+ ALLOWED_ORIGINS=yourdomain.com,api.yourdomain.com
420
+ ```
421
+
422
+ ### Network Isolation
423
+
424
+ ```yaml
425
+ # docker-compose.yml
426
+ services:
427
+ qmodel:
428
+ networks:
429
+ - internal
430
+
431
+ networks:
432
+ internal:
433
+ internal: true
434
+ ```
435
+
436
+ ---
437
+
438
+ ## Reference
439
+
440
+ - **Dockerfile**: Multi-stage build, health checks, proper layer caching
441
+ - **docker-compose.yml**: Service definition, volumes, networking, health checks
442
+ - **Environment**: Fully configurable via `.env`
443
+ - **Backends**: Ollama (local) or HuggingFace (remote) via `LLM_BACKEND` variable
Dockerfile CHANGED
@@ -1,29 +1,51 @@
1
- # Use an official Python runtime as a parent image
 
 
 
 
 
 
 
2
  FROM python:3.11-slim
3
 
4
- # Set environment variables
5
- ENV PYTHONDONTWRITEBYTECODE 1
6
- ENV PYTHONUNBUFFERED 1
 
 
 
 
 
 
7
 
8
- # Set the working directory in the container
9
  WORKDIR /app
10
 
11
  # Install system dependencies
 
 
 
12
  RUN apt-get update && apt-get install -y --no-install-recommends \
13
  build-essential \
14
  libopenblas-dev \
15
  libomp-dev \
 
16
  && rm -rf /var/lib/apt/lists/*
17
 
18
- # Install Python dependencies
19
  COPY requirements.txt .
20
  RUN pip install --no-cache-dir -r requirements.txt
21
 
22
- # Copy the rest of the application code
23
  COPY . .
24
 
25
- # Expose the port the app runs on
26
  EXPOSE 8000
27
 
28
- # Command to run the application
 
 
 
 
 
29
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
 
1
+ # QModel v4 - Islamic RAG API
2
+ # =============================
3
+ # Dockerfile for QModel API
4
+ # Supports both Ollama and HuggingFace backends via .env configuration
5
+ #
6
+ # Build: docker build -t qmodel .
7
+ # Run: docker run -p 8000:8000 --env-file .env qmodel
8
+
9
  FROM python:3.11-slim
10
 
11
+ # Metadata
12
+ LABEL maintainer="QModel Team"
13
+ LABEL description="QModel v4 - Quran & Hadith RAG API"
14
+ LABEL version="4.1"
15
+
16
+ # Environment variables
17
+ ENV PYTHONDONTWRITEBYTECODE=1 \
18
+ PYTHONUNBUFFERED=1 \
19
+ PIP_NO_CACHE_DIR=1
20
 
21
+ # Set working directory
22
  WORKDIR /app
23
 
24
  # Install system dependencies
25
+ # - build-essential: For compiling Python packages
26
+ # - libopenblas-dev: For numerical operations (FAISS, numpy)
27
+ # - libomp-dev: For OpenMP (FAISS parallelization)
28
  RUN apt-get update && apt-get install -y --no-install-recommends \
29
  build-essential \
30
  libopenblas-dev \
31
  libomp-dev \
32
+ curl \
33
  && rm -rf /var/lib/apt/lists/*
34
 
35
+ # Copy requirements and install Python dependencies
36
  COPY requirements.txt .
37
  RUN pip install --no-cache-dir -r requirements.txt
38
 
39
+ # Copy application code
40
  COPY . .
41
 
42
+ # Expose port for API
43
  EXPOSE 8000
44
 
45
+ # Health check
46
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
47
+ CMD curl -f http://localhost:8000/health || exit 1
48
+
49
+ # Start application
50
+ # Configure via .env: LLM_BACKEND=ollama or LLM_BACKEND=hf
51
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
OPEN_WEBUI.md ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Using QModel v4 with Open-WebUI
2
+
3
+ QModel v4 is fully compatible with **Open-WebUI** thanks to its OpenAI-compatible API endpoints. This guide shows you how to integrate them.
4
+
5
+ ## Prerequisites
6
+
7
+ 1. **QModel running** on your local machine or server
8
+ ```bash
9
+ python main.py
10
+ # Runs on http://localhost:8000
11
+ ```
12
+
13
+ 2. **Open-WebUI installed** (Docker recommended)
14
+ ```bash
15
+ docker run -d -p 3000:8080 --name open-webui ghcr.io/open-webui/open-webui:latest
16
+ # Runs on http://localhost:3000
17
+ ```
18
+
19
+ ---
20
+
21
+ ## Integration Steps
22
+
23
+ ### Step 1: Add QModel as a Custom OpenAI-Compatible Model
24
+
25
+ In Open-WebUI:
26
+
27
+ 1. **Settings** → **Models** → **Manage Models**
28
+ 2. Click **"Connect to OpenAI-compatible API"**
29
+ 3. Enter:
30
+ - **API Base URL**: `http://localhost:8000/v1`
31
+ - **Model Name**: `QModel` (or `qmodel`)
32
+ - **API Key**: Leave blank (no auth required)
33
+
34
+ 4. Click **"Save & Test"**
35
+ 5. You should see: ✅ **Model connected successfully**
36
+
37
+ ### Step 2: Start Using QModel
38
+
39
+ 1. Open a **New Chat** in Open-WebUI
40
+ 2. Select **QModel** from the model dropdown
41
+ 3. Type your Islamic question:
42
+ ```
43
+ What does the Quran say about mercy?
44
+ ```
45
+
46
+ 4. Press Enter and get an Islamic-grounded RAG response with sources!
47
+
48
+ ---
49
+
50
+ ## API Endpoints (OpenAI-Compatible)
51
+
52
+ ### POST `/v1/chat/completions`
53
+ Standard OpenAI chat completions endpoint.
54
+
55
+ **Request:**
56
+ ```json
57
+ {
58
+ "model": "QModel",
59
+ "messages": [
60
+ {"role": "user", "content": "What does Islam say about patience?"}
61
+ ],
62
+ "temperature": 0.2,
63
+ "max_tokens": 2048,
64
+ "top_k": 5,
65
+ "stream": false
66
+ }
67
+ ```
68
+
69
+ **Response:**
70
+ ```json
71
+ {
72
+ "id": "qmodel-1234567890",
73
+ "object": "chat.completion",
74
+ "created": 1234567890,
75
+ "model": "QModel",
76
+ "choices": [
77
+ {
78
+ "index": 0,
79
+ "message": {
80
+ "role": "assistant",
81
+ "content": "Islam emphasizes patience as a core virtue..."
82
+ },
83
+ "finish_reason": "stop"
84
+ }
85
+ ],
86
+ "x_metadata": {
87
+ "language": "english",
88
+ "intent": "general",
89
+ "top_score": 0.876,
90
+ "latency_ms": 342,
91
+ "sources": [
92
+ {
93
+ "source": "Surah Al-Imran 3:200",
94
+ "type": "quran",
95
+ "grade": null,
96
+ "score": 0.876
97
+ }
98
+ ]
99
+ }
100
+ }
101
+ ```
102
+
103
+ ### GET `/v1/models`
104
+ List available models.
105
+
106
+ **Response:**
107
+ ```json
108
+ {
109
+ "object": "list",
110
+ "data": [
111
+ {
112
+ "id": "QModel",
113
+ "object": "model",
114
+ "created": 1234567890,
115
+ "owned_by": "elgendy"
116
+ }
117
+ ]
118
+ }
119
+ ```
120
+
121
+ ---
122
+
123
+ ## Advanced Query Parameters (Open-WebUI Compatible)
124
+
125
+ When using Open-WebUI, you can include special parameters:
126
+
127
+ ### Islamic-Specific Parameters
128
+
129
+ **URL Query String:**
130
+ ```
131
+ /v1/chat/completions?source_type=hadith&grade_filter=sahih&top_k=5
132
+ ```
133
+
134
+ **Supported Parameters:**
135
+ - `source_type`: `quran` | `hadith` | (both, default)
136
+ - `grade_filter`: `sahih` | `hasan` | (all, default)
137
+ - `top_k`: 1-20 (number of sources to retrieve)
138
+
139
+ ### Example Requests via curl
140
+
141
+ ```bash
142
+ # 1. Basic query (both Quran + Hadith)
143
+ curl -X POST http://localhost:8000/v1/chat/completions \
144
+ -H "Content-Type: application/json" \
145
+ -d '{
146
+ "model": "QModel",
147
+ "messages": [{"role": "user", "content": "What does Islam say about mercy?"}]
148
+ }'
149
+
150
+ # 2. Quran-only query
151
+ curl -X POST "http://localhost:8000/v1/chat/completions?source_type=quran" \
152
+ -H "Content-Type: application/json" \
153
+ -d '{
154
+ "model": "QModel",
155
+ "messages": [{"role": "user", "content": "What does the Quran say about patience?"}]
156
+ }'
157
+
158
+ # 3. Authenticated Hadiths only (Sahih grade)
159
+ curl -X POST "http://localhost:8000/v1/chat/completions?source_type=hadith&grade_filter=sahih" \
160
+ -H "Content-Type: application/json" \
161
+ -d '{
162
+ "model": "QModel",
163
+ "messages": [{"role": "user", "content": "Hadiths about prayer"}]
164
+ }'
165
+
166
+ # 4. Streaming response
167
+ curl -X POST http://localhost:8000/v1/chat/completions \
168
+ -H "Content-Type: application/json" \
169
+ -d '{
170
+ "model": "QModel",
171
+ "messages": [{"role": "user", "content": "Tell me about Zakat"}],
172
+ "stream": true
173
+ }'
174
+ ```
175
+
176
+ ---
177
+
178
+ ## Open-WebUI Features Supported
179
+
180
+ | Feature | Status | Notes |
181
+ |---------|--------|-------|
182
+ | **Chat** | ✅ Full support | Normal Q&A |
183
+ | **Streaming** | ✅ Supported | Set `stream: true` in request |
184
+ | **Context** | ✅ Multi-turn | Open-WebUI handles conversation history |
185
+ | **Temperature** | ✅ Configurable | Via Open-WebUI settings |
186
+ | **Token Limits** | ✅ Supported | Via `max_tokens` parameter |
187
+ | **Model List** | ✅ Available | Via `/v1/models` endpoint |
188
+ | **Source Attribution** | ✅ In metadata | Via `x_metadata.sources` |
189
+
190
+ ---
191
+
192
+ ## Custom System Prompts in Open-WebUI
193
+
194
+ To customize QModel for specific Islamic tasks, create a custom chatbot in Open-WebUI:
195
+
196
+ 1. **Home** → **+ New Chatbot**
197
+ 2. Configure:
198
+ - **Name**: "Islamic Scholar" (or your choice)
199
+ - **Model**: QModel
200
+ - **System Prompt**:
201
+ ```
202
+ You are an expert Islamic scholar specializing in Qur'an and Hadith.
203
+ Always cite sources exactly as provided.
204
+ Only answer from the provided Islamic context—never use outside knowledge.
205
+ If information is not in the dataset, say so clearly.
206
+ ```
207
+ - **Top K Sources**: 5
208
+ - **Temperature**: 0.1 (for consistency)
209
+
210
+ 3. **Save** and start chatting!
211
+
212
+ ---
213
+
214
+ ## Troubleshooting
215
+
216
+ ### Issue: "Failed to connect to QModel"
217
+
218
+ **Solutions:**
219
+ 1. Check QModel is running: `curl http://localhost:8000/health`
220
+ 2. Verify API Base URL is correct: `http://localhost:8000/v1`
221
+ 3. Check firewall: Port 8000 must be accessible
222
+ 4. Check logs: `python main.py` to see startup messages
223
+
224
+ ### Issue: "No sources in response"
225
+
226
+ **Solutions:**
227
+ 1. Check `/debug/scores` endpoint directly:
228
+ ```bash
229
+ curl "http://localhost:8000/debug/scores?q=patience&top_k=10"
230
+ ```
231
+ 2. Adjust `CONFIDENCE_THRESHOLD` in `.env` if retrievals are low-quality
232
+ 3. Try synonyms: "mercy" instead of "compassion"
233
+
234
+ ### Issue: "Assistant returns 'Not found'"
235
+
236
+ **This is expected behavior!** QModel has safety checks:
237
+ 1. If retrieval score is too low (< 0.30), it returns "not found"
238
+ 2. This prevents hallucinations
239
+ 3. Try more specific queries or adjust `CONFIDENCE_THRESHOLD`
240
+
241
+ ---
242
+
243
+ ## Configuration for Open-WebUI
244
+
245
+ ### Recommended Settings
246
+
247
+ For best results with Open-WebUI:
248
+
249
+ ```env
250
+ # More conservative (fewer hallucinations)
251
+ CONFIDENCE_THRESHOLD=0.40
252
+ TEMPERATURE=0.1
253
+ HADITH_BOOST=0.08
254
+
255
+ # More liberal (more answers, higher hallucination risk)
256
+ CONFIDENCE_THRESHOLD=0.20
257
+ TEMPERATURE=0.3
258
+ HADITH_BOOST=0.05
259
+ ```
260
+
261
+ ### Docker Compose Integration
262
+
263
+ To run both QModel and Open-WebUI together:
264
+
265
+ ```yaml
266
+ version: '3.8'
267
+ services:
268
+ qmodel:
269
+ build: .
270
+ ports:
271
+ - "8000:8000"
272
+ environment:
273
+ - LLM_BACKEND=ollama
274
+ - OLLAMA_HOST=http://ollama:11434
275
+ depends_on:
276
+ - ollama
277
+
278
+ ollama:
279
+ image: ollama/ollama:latest
280
+ ports:
281
+ - "11434:11434"
282
+
283
+ web-ui:
284
+ image: ghcr.io/open-webui/open-webui:latest
285
+ ports:
286
+ - "3000:8080"
287
+ depends_on:
288
+ - qmodel
289
+ ```
290
+
291
+ Run: `docker-compose up`
292
+
293
+ ---
294
+
295
+ ## Using QModel in Open-WebUI Workflows
296
+
297
+ ### Example 1: Islamic Q&A Chatbot
298
+
299
+ 1. Create chatbot with system prompt about Islamic knowledge
300
+ 2. Select QModel as backend
301
+ 3. Set temperature to 0.1 for consistency
302
+ 4. Enable web search toggle (optional, for cross-verification)
303
+
304
+ ### Example 2: Hadith Research Tool
305
+
306
+ 1. Create chatbot: "Hadith Researcher"
307
+ 2. System prompt:
308
+ ```
309
+ You are a Hadith researcher. For each query:
310
+ 1. Search authenticated Hadiths only (Sahih grade)
311
+ 2. Display the full text with authenticity grade
312
+ 3. Explain the Hadith's significance
313
+ 4. Always cite the collection and number
314
+ ```
315
+ 3. Enable grade filtering: `grade_filter=sahih`
316
+
317
+ ### Example 3: Qur'anic Study Assistant
318
+
319
+ 1. Create chatbot: "Qur'an Tafsir"
320
+ 2. Set `source_type=quran` in parameters
321
+ 3. System prompt focusing on Qur'anic interpretation
322
+ 4. Enable multi-turn for deeper exploration
323
+
324
+ ---
325
+
326
+ ## API Testing
327
+
328
+ ### Test with Open-WebUI's Developer Tools
329
+
330
+ 1. Open Open-WebUI console (F12)
331
+ 2. Go to **Network** tab
332
+ 3. Send a message to QModel
333
+ 4. Inspect the request/response to `/v1/chat/completions`
334
+
335
+ ### Test with cURL
336
+
337
+ ```bash
338
+ # 1. Health check
339
+ curl http://localhost:8000/health | jq
340
+
341
+ # 2. List models
342
+ curl http://localhost:8000/v1/models | jq
343
+
344
+ # 3. Simple chat
345
+ curl -X POST http://localhost:8000/v1/chat/completions \
346
+ -H "Content-Type: application/json" \
347
+ -d '{"model":"QModel","messages":[{"role":"user","content":"Assalam alaikum"}]}' | jq
348
+ ```
349
+
350
+ ---
351
+
352
+ ## Performance Tips
353
+
354
+ ### For Optimal Open-WebUI Experience
355
+
356
+ 1. **Use Ollama locally** for responsive chat (400-800ms per query)
357
+ 2. **Set `max_tokens=1024`** to avoid long waits
358
+ 3. **Use temperature=0.1** for reliable, consistent answers
359
+ 4. **Increase `CACHE_TTL`** for frequently asked questions
360
+ 5. **Reduce `TOP_K_SEARCH`** if queries are slow (default 20)
361
+
362
+ ---
363
+
364
+ ## Security Notes
365
+
366
+ ### For Production Deployments
367
+
368
+ 1. **Restrict CORS**: Set `ALLOWED_ORIGINS=your-domain.com` in `.env`
369
+ 2. **Use HTTPS**: Proxy through nginx with TLS
370
+ 3. **Rate limit**: Add rate limiting middleware (not in v4, but recommended)
371
+ 4. **Authentication**: Consider adding API key validation layer
372
+ 5. **Network**: Don't expose QModel directly to the internet without auth
373
+
374
+ ---
375
+
376
+ ## Support
377
+
378
+ - 📖 Full setup guide: See `SETUP.md`
379
+ - 🔍 Debugging: Use `/debug/scores` to inspect retrievals
380
+ - 💬 Questions about Open-WebUI: See https://docs.openwebui.com
381
+ - 🕌 Islamic knowledge: See `ARCHITECTURE.md` for system details
382
+
383
+ ---
384
+
385
+ **Happy chatting with QModel + Open-WebUI! 🕌**
README.md CHANGED
@@ -1,17 +1,295 @@
 
 
 
 
 
 
 
 
 
1
  ---
2
- license: mit
3
- language:
4
- - ar
5
- - en
6
- base_model:
7
- - Qwen/Qwen2.5-72B-Instruct
8
- pipeline_tag: question-answering
9
- tags:
10
- - queean
11
- - hadith
12
- - islam
13
- - abdullah
14
- - elgendy
15
- metrics:
16
- - accuracy
17
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QModel v4 — Islamic RAG System
2
+ **Specialized Qur'an & Hadith Knowledge System with Dual LLM Support**
3
+
4
+ > A production-ready Retrieval-Augmented Generation system specialized exclusively in authenticated Islamic knowledge. No hallucinations, no outside knowledge—only content from verified sources.
5
+
6
+ ![Version](https://img.shields.io/badge/version-4.0.0-blue)
7
+ ![Backend](https://img.shields.io/badge/backend-ollama%20%7C%20huggingface-green)
8
+ ![Status](https://img.shields.io/badge/status-production--ready-success)
9
+
10
  ---
11
+
12
+ ## Features
13
+
14
+ ### 📖 Qur'an Capabilities
15
+ - **Verse Lookup**: Find verses by topic or keyword
16
+ - **Word Frequency**: Count occurrences with Surah breakdown
17
+ - **Bilingual**: Full Arabic + English translation support
18
+ - **Tafsir Integration**: AI-powered contextual interpretation
19
+
20
+ ### 📚 Hadith Capabilities
21
+ - **Authenticity Verification**: Check if Hadith is in authenticated collections
22
+ - **Grade Display**: Show Sahih/Hasan/Da'if authenticity levels
23
+ - **Topic Search**: Find relevant Hadiths across 9 major collections
24
+ - **Collection Navigation**: Filter by Bukhari, Muslim, Abu Dawud, etc.
25
+
26
+ ### 🛡️ Safety Features
27
+ - **Confidence Gating**: Low-confidence queries return "not found" instead of guesses
28
+ - **Source Attribution**: Every answer cites exact verse/Hadith reference
29
+ - **Verbatim Quotes**: Text copied directly from data, never paraphrased
30
+ - **Anti-Hallucination**: Hardened prompts with few-shot "not found" examples
31
+
32
+ ### 🚀 Integration
33
+ - **OpenAI-Compatible API**: Use with Open-WebUI, Langchain, or any OpenAI client
34
+ - **OpenAI Schema**: Full support for `/v1/chat/completions` and `/v1/models`
35
+ - **Streaming Responses**: SSE streaming for long-form answers
36
+
37
+ ### ⚙️ Technical
38
+ - **Dual LLM Backend**: Ollama (dev) + HuggingFace (prod)
39
+ - **Hybrid Search**: Dense (FAISS) + Sparse (BM25) scoring
40
+ - **Async API**: FastAPI with async/await throughout
41
+ - **Caching**: TTL-based LRU cache for frequent queries
42
+ - **Scale**: 6,236 Quranic verses + 41,390 Hadiths indexed
43
+
44
+ ---
45
+
46
+ ## Quick Start (5 minutes)
47
+
48
+ ```bash
49
+ # 1. Install
50
+ git clone https://github.com/elgendy/QModel.git && cd QModel
51
+ python3 -m venv .venv && source .venv/bin/activate
52
+ pip install -r requirements.txt
53
+
54
+ # 2. Configure (choose one)
55
+ # For local development - Ollama:
56
+ export LLM_BACKEND=ollama
57
+ export OLLAMA_MODEL=llama2
58
+ # Make sure Ollama is running: ollama serve
59
+
60
+ # OR for production - HuggingFace:
61
+ export LLM_BACKEND=hf
62
+ export HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
63
+
64
+ # 3. Run
65
+ python main.py
66
+
67
+ # 4. Query
68
+ curl "http://localhost:8000/ask?q=What%20does%20Islam%20say%20about%20mercy?"
69
+ ```
70
+
71
+ API docs: http://localhost:8000/docs
72
+
73
+ ---
74
+
75
+ ## Example Queries
76
+
77
+ ```bash
78
+ # Basic question
79
+ curl "http://localhost:8000/ask?q=What%20does%20Islam%20say%20about%20mercy?"
80
+
81
+ # Word frequency
82
+ curl "http://localhost:8000/ask?q=How%20many%20times%20is%20mercy%20mentioned?"
83
+
84
+ # Authentic Hadiths only
85
+ curl "http://localhost:8000/ask?q=prayer&source_type=hadith&grade_filter=sahih"
86
+
87
+ # Verify Hadith
88
+ curl "http://localhost:8000/hadith/verify?q=Actions%20are%20judged%20by%20intentions"
89
+ ```
90
+
91
+ ---
92
+
93
+ ## Documentation
94
+
95
+ | Document | Purpose |
96
+ |----------|---------|
97
+ | **[SETUP.md](SETUP.md)** | Installation, configuration (both backends), API endpoints, examples |
98
+ | **[DOCKER.md](DOCKER.md)** | Docker deployment, production setup, troubleshooting |
99
+ | **[ARCHITECTURE.md](ARCHITECTURE.md)** | System design, data pipeline, core components |
100
+ | **[OPEN_WEBUI.md](OPEN_WEBUI.md)** | Integration with Open-WebUI chat interface |
101
+
102
+ ---
103
+
104
+ ## Key Decisions
105
+
106
+ ### Backend Selection
107
+ - **Ollama** — Fast setup, no GPU, great for development, `LLM_BACKEND=ollama`
108
+ - **HuggingFace** — Production-grade, better quality, GPU recommended, `LLM_BACKEND=hf`
109
+
110
+ Both are equally supported via the same `.env` configuration. Just set `LLM_BACKEND` and restart.
111
+
112
+ ### Data
113
+ - **47,626 documents**: 6,236 Quranic verses + 41,390 hadiths from 9 canonical collections
114
+ - **Pre-built**: `metadata.json` and `QModel.index` included, ready to use
115
+ - **Dual-language**: Arabic and English support
116
+
117
+ ---
118
+
119
+ ## Open-WebUI Integration
120
+
121
+ QModel integrates seamlessly with Open-WebUI for a chat interface:
122
+
123
+ ```bash
124
+ # Start QModel
125
+ python main.py
126
+
127
+ # Start Open-WebUI (Docker)
128
+ docker run -p 3000:8080 ghcr.io/open-webui/open-webui:latest
129
+
130
+ # In Open-WebUI: Settings → Models → Add OpenAI-compatible
131
+ # API Base: http://localhost:8000/v1
132
+ # Model: QModel
133
+ ```
134
+
135
+ See [OPEN_WEBUI.md](OPEN_WEBUI.md) for detailed integration guide.
136
+
137
+ ---
138
+
139
+ ## API Reference (Quick)
140
+
141
+ ### Main Query
142
+ ```
143
+ GET /ask?q=<question>&top_k=5&source_type=<quran|hadith>&grade_filter=<sahih|hasan>
144
+ ```
145
+
146
+ **Response includes:**
147
+ - AI-generated answer
148
+ - Listed sources with scores
149
+ - Language detection (Arabic/English)
150
+ - Query intent classification
151
+
152
+ ### Other Endpoints
153
+ - `GET /debug/scores?q=<question>&top_k=10` — Inspect raw retrieval scores
154
+ - `GET /hadith/verify?q=<hadith_text>` — Check hadith authenticity
155
+ - `POST /v1/chat/completions` — OpenAI-compatible endpoint
156
+ - `GET /health` — Health check
157
+
158
+ See [SETUP.md](SETUP.md) for full endpoint documentation.
159
+
160
+ ---
161
+
162
+ ## Configuration
163
+
164
+ All configuration via environment variables (no code changes needed):
165
+
166
+ ```bash
167
+ # Backend (required)
168
+ LLM_BACKEND=ollama # or: hf
169
+
170
+ # Ollama settings
171
+ OLLAMA_HOST=http://localhost:11434
172
+ OLLAMA_MODEL=llama2 # or: mistral, neural-chat
173
+
174
+ # HuggingFace settings
175
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
176
+ HF_DEVICE=auto # auto, cuda, or cpu
177
+
178
+ # Quality tuning
179
+ TEMPERATURE=0.2 # 0=deterministic, 1=creative
180
+ CONFIDENCE_THRESHOLD=0.30 # Min score for LLM call
181
+ TOP_K_RETURN=5 # Results per query
182
+ ```
183
+
184
+ See [SETUP.md](SETUP.md) for comprehensive configuration reference.
185
+
186
+ ---
187
+
188
+ ## Performance
189
+
190
+ | Operation | Time | Backend |
191
+ |-----------|------|---------|
192
+ | Query (cached) | ~50ms | Both |
193
+ | Query (Ollama) | 400-800ms | Ollama |
194
+ | Query (HF GPU) | 500-1500ms | CUDA |
195
+ | Query (HF CPU) | 2-5s | CPU |
196
+
197
+ ---
198
+
199
+ ## Deployment
200
+
201
+ ### Local Development
202
+ ```bash
203
+ python main.py
204
+ ```
205
+
206
+ ### Docker (with Ollama backend)
207
+ ```bash
208
+ docker-compose up
209
+ ```
210
+
211
+ ### Docker (with HuggingFace backend)
212
+ Set `LLM_BACKEND=hf` in `.env` then `docker-compose up`
213
+
214
+ See [DOCKER.md](DOCKER.md) for production deployment, troubleshooting, and advanced configuration.
215
+
216
+ ---
217
+
218
+ ## Data Sources
219
+
220
+ - **Qur'an**: [risan/quran-json](https://github.com/risan/quran-json) — 114 Surahs, 6,236 verses
221
+ - **Hadith**: [AhmedBaset/hadith-json](https://github.com/AhmedBaset/hadith-json) — 9 canonical collections, 41,390 hadiths
222
+
223
+ ---
224
+
225
+ ## Architecture Overview
226
+
227
+ ```
228
+ User Query
229
+     ↓
230
+ Query Rewriting & Intent Detection
231
+     ↓
232
+ Hybrid Search (FAISS dense + BM25 sparse)
233
+     ↓
234
+ Filtering & Ranking
235
+     ↓
236
+ Confidence Gate (skip LLM if low-scoring)
237
+     ↓
238
+ LLM Generation (Ollama or HuggingFace)
239
+     ↓
240
+ Formatted Response with Sources
241
+ ```
242
+
243
+ See [ARCHITECTURE.md](ARCHITECTURE.md) for detailed system design.
244
+
245
+ ---
246
+
247
+ ## Troubleshooting
248
+
249
+ | Issue | Solution |
250
+ |-------|----------|
251
+ | "Service is initialising" | Wait 60-90s for embeddings model to load |
252
+ | Low retrieval scores | Check `/debug/scores`, try synonyms, lower threshold |
253
+ | "Model not found" (HF) | Run `huggingface-cli login` |
254
+ | Out of memory | Use smaller model or CPU backend |
255
+ | No results | Verify data files exist: `metadata.json` and `QModel.index` |
256
+
257
+ See [SETUP.md](SETUP.md) and [DOCKER.md](DOCKER.md) for more detailed troubleshooting.
258
+
259
+ ---
260
+
261
+ ## What's New in v4
262
+
263
+ ✨ **Dual LLM Backend** — Ollama (dev) + HuggingFace (prod)
264
+ ✨ **Grade Filtering** — Return only Sahih/Hasan authenticated Hadiths
265
+ ✨ **Source Filtering** — Quran-only or Hadith-only queries
266
+ ✨ **Hadith Verification** — `/hadith/verify` endpoint
267
+ ✨ **Enhanced Frequency** — Word counts by Surah
268
+ ✨ **OpenAI Compatible** — Use with any OpenAI client
269
+ ✨ **Production Ready** — Structured logging, error handling, async throughout
270
+
271
+ ---
272
+
273
+ ## Next Steps
274
+
275
+ 1. **Get Started**: See [SETUP.md](SETUP.md)
276
+ 2. **Integrate with Open-WebUI**: See [OPEN_WEBUI.md](OPEN_WEBUI.md)
277
+ 3. **Deploy with Docker**: See [DOCKER.md](DOCKER.md)
278
+ 4. **Understand Architecture**: See [ARCHITECTURE.md](ARCHITECTURE.md)
279
+
280
+ ---
281
+
282
+ ## License
283
+
284
+ This project uses open-source data from:
285
+ - [Qur'an JSON](https://github.com/risan/quran-json) — Open source
286
+ - [Hadith API](https://github.com/AhmedBaset/hadith-json) — Open source
287
+
288
+ See individual repositories for license details.
289
+
290
+ ---
291
+
292
+ **Made with ❤️ for Islamic scholarship.**
293
+
294
+ Version 4.0.0 | March 2025 | Production-Ready
295
+
SETUP.md ADDED
@@ -0,0 +1,590 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QModel v4 Setup & Deployment Guide
2
+
3
+ ## Quick Start
4
+
5
+ ### 1. Prerequisites
6
+ - Python 3.10+
7
+ - 16 GB RAM minimum (for embeddings + LLM)
8
+ - GPU recommended for HuggingFace backend
9
+ - Ollama installed (for local development) OR internet access (for HuggingFace)
10
+
11
+ ### 2. Installation
12
+
13
+ ```bash
14
+ # Clone and enter project
15
+ cd /Users/elgendy/Projects/QModel
16
+
17
+ # Create virtual environment
18
+ python3 -m venv .venv
19
+ source .venv/bin/activate
20
+
21
+ # Install dependencies
22
+ pip install -r requirements.txt
23
+ ```
24
+
25
+ ### 3. Data & Index
26
+
27
+ The project includes pre-built data files:
28
+ - `metadata.json` — 47,626 documents (6,236 Quran verses + 41,390 hadiths from 9 canonical collections)
29
+ - `QModel.index` — FAISS search index (pre-generated)
30
+
31
+ If you need to rebuild the index after dataset changes:
32
+ ```bash
33
+ python build_index.py
34
+ ```
35
+
36
+ ---
37
+
38
+ ## Backend Configuration
39
+
40
+ QModel supports two LLM backends. Choose based on your environment:
41
+
42
+ | Backend | Pros | Cons | When to Use |
43
+ |---------|------|------|------------|
44
+ | **Ollama** (local) | Fast setup, no GPU needed, only a one-time local model pull, free | Smaller models, limited customization | Development, testing, resource-constrained |
45
+ | **HuggingFace** (remote) | Larger models, better quality, full control | Requires GPU or significant RAM, slower downloads | Production, high-quality responses |
46
+
47
+ ### LLM Backend Selection
48
+
49
+ **Option 1: Local Ollama (Development)**
50
+
51
+ For development, testing, and when you already have Ollama running locally:
52
+
53
+ ```bash
54
+ LLM_BACKEND=ollama
55
+ OLLAMA_HOST=http://localhost:11434
56
+ OLLAMA_MODEL=llama2 # or: mistral, neural-chat, orca-mini
57
+ ```
58
+
59
+ **Available Ollama Models:**
60
+ - `llama2` — Fast, good quality (default, recommended)
61
+ - `mistral` — Better Arabic support
62
+ - `neural-chat` — Good balance
63
+ - `openchat` — Good instruction following
64
+ - `orca-mini` — Lightweight
65
+
66
+ **Option 2: Remote HuggingFace (Production)**
67
+
68
+ For production deployments with better quality and control:
69
+
70
+ ```bash
71
+ LLM_BACKEND=hf
72
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct # Excellent Arabic support
73
+ HF_DEVICE=auto # auto | cuda | cpu
74
+ HF_MAX_NEW_TOKENS=2048
75
+ ```
76
+
77
+ **Recommended HuggingFace Models:**
78
+ - `Qwen/Qwen2-7B-Instruct` — Excellent Arabic, strong reasoning (default)
79
+ - `mistralai/Mistral-7B-Instruct-v0.2` — Very capable, fast
80
+ - `meta-llama/Llama-2-13b-chat-hf` — Larger, needs HF token
81
+
82
+ **Device Options:**
83
+ - `auto` — Auto-detect (GPU if available, else CPU)
84
+ - `cuda` — Force GPU (requires NVIDIA GPU)
85
+ - `cpu` — Force CPU (slower, but works everywhere)
86
+
87
+ ### Complete Environment Variables Reference
88
+
89
+ #### Backend Selection
90
+ | Variable | Default | Options | Example |
91
+ |----------|---------|---------|---------|
92
+ | `LLM_BACKEND` | `hf` | `ollama`, `hf` | `ollama` |
93
+
94
+ #### Ollama Backend
95
+ | Variable | Default | Description | Example |
96
+ |----------|---------|-------------|---------|
97
+ | `OLLAMA_HOST` | `http://localhost:11434` | Ollama server URL | `http://localhost:11434` |
98
+ | `OLLAMA_MODEL` | `llama2` | Model name | `mistral` |
99
+
100
+ #### HuggingFace Backend
101
+ | Variable | Default | Description | Example |
102
+ |----------|---------|-------------|---------|
103
+ | `HF_MODEL_NAME` | `Qwen/Qwen2-7B-Instruct` | Model ID | `Qwen/Qwen2-7B-Instruct` |
104
+ | `HF_DEVICE` | `auto` | Device to use | `cuda` |
105
+ | `HF_MAX_NEW_TOKENS` | `2048` | Max output length | `2048` |
106
+
107
+ #### Embedding & Data
108
+ | Variable | Default | Description |
109
+ |----------|---------|-------------|
110
+ | `EMBED_MODEL` | `intfloat/multilingual-e5-large` | Embedding model (keep default) |
111
+ | `FAISS_INDEX` | `QModel.index` | Index file path |
112
+ | `METADATA_FILE` | `metadata.json` | Dataset file |
113
+
114
+ #### Retrieval & Ranking
115
+ | Variable | Default | Range | Purpose |
116
+ |----------|---------|-------|---------|
117
+ | `TOP_K_SEARCH` | `20` | 5-100 | Candidate pool (⬆️ = slower but more coverage) |
118
+ | `TOP_K_RETURN` | `5` | 1-20 | Results shown to user |
119
+ | `RERANK_ALPHA` | `0.6` | 0.0-1.0 | Dense (0.6) vs Sparse (0.4) weighting |
120
+
121
+ #### Generation
122
+ | Variable | Default | Range | Purpose |
123
+ |----------|---------|-------|---------|
124
+ | `TEMPERATURE` | `0.2` | 0.0-1.0 | 0.0=deterministic, 1.0=creative (use 0.1-0.2 for religious) |
125
+ | `MAX_TOKENS` | `2048` | 512-4096 | Max response length |
126
+
127
+ #### Safety & Quality
128
+ | Variable | Default | Range | Purpose |
129
+ |----------|---------|-------|---------|
130
+ | `CONFIDENCE_THRESHOLD` | `0.30` | 0.0-1.0 | Min score to call LLM (⬆️ = fewer hallucinations) |
131
+ | `HADITH_BOOST` | `0.08` | 0.0-1.0 | Score boost for hadith on hadith queries |
132
+
133
+ #### Other Settings
134
+ | Variable | Default | Description |
135
+ |----------|---------|-------------|
136
+ | `CACHE_SIZE` | `512` | Query response cache entries |
137
+ | `CACHE_TTL` | `3600` | Cache expiry in seconds |
138
+ | `ALLOWED_ORIGINS` | `*` | CORS origins (use specific domains in production) |
139
+ | `MAX_EXAMPLES` | `3` | Few-shot examples in system prompt |
140
+
141
+ ### Configuration Examples
142
+
143
+ **Development (Ollama) - Recommended for getting started**
144
+ ```bash
145
+ LLM_BACKEND=ollama
146
+ OLLAMA_HOST=http://localhost:11434
147
+ OLLAMA_MODEL=llama2
148
+
149
+ EMBED_MODEL=intfloat/multilingual-e5-large
150
+ FAISS_INDEX=QModel.index
151
+ METADATA_FILE=metadata.json
152
+
153
+ TOP_K_SEARCH=20
154
+ TOP_K_RETURN=5
155
+ TEMPERATURE=0.2
156
+ CONFIDENCE_THRESHOLD=0.30
157
+ ALLOWED_ORIGINS=*
158
+ ```
159
+
160
+ **Production (HuggingFace + GPU) - Best quality, uses GPU**
161
+ ```bash
162
+ LLM_BACKEND=hf
163
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
164
+ HF_DEVICE=cuda
165
+
166
+ EMBED_MODEL=intfloat/multilingual-e5-large
167
+ FAISS_INDEX=QModel.index
168
+ METADATA_FILE=metadata.json
169
+
170
+ TOP_K_SEARCH=30 # More candidates for better quality
171
+ TOP_K_RETURN=5
172
+ TEMPERATURE=0.1 # More deterministic
173
+ CONFIDENCE_THRESHOLD=0.35
174
+ ALLOWED_ORIGINS=yourdomain.com,api.yourdomain.com
175
+ ```
176
+
177
+ **Production (HuggingFace + CPU) - CPU-only, slower but no GPU required**
178
+ ```bash
179
+ LLM_BACKEND=hf
180
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct
181
+ HF_DEVICE=cpu
182
+
183
+ TEMPERATURE=0.1
184
+ MAX_TOKENS=1024 # Reduce for faster responses
185
+ CONFIDENCE_THRESHOLD=0.35
186
+ ```
187
+
188
+ ### Tuning Tips
189
+
190
+ **For Better Results:**
191
+ - Increase `TOP_K_SEARCH` (costs slightly more compute)
192
+ - Lower `CONFIDENCE_THRESHOLD` (may get some hallucinations)
193
+ - Use larger model with more parameters
194
+ - Set `TEMPERATURE=0.1` for most consistent answers
195
+
196
+ **For Faster Performance:**
197
+ - Lower `TOP_K_SEARCH` and `TOP_K_RETURN`
198
+ - Use Ollama backend (faster inference)
199
+ - Reduce `MAX_TOKENS`
200
+ - Set `HF_DEVICE` explicitly (`cuda` if a GPU is available) instead of `auto` to skip device auto-detection
201
+
202
+ **For More Accurate/Conservative Answers:**
203
+ - Increase `CONFIDENCE_THRESHOLD` (skip borderline queries)
204
+ - Lower `TEMPERATURE` (more deterministic)
205
+ - Use larger model (7B+ parameters)
206
+
207
+ **For CPU-Only (No GPU Available):**
208
+ - Use Ollama backend with `neural-chat` model
209
+ - Set `HF_DEVICE=cpu` if using HF
210
+ - Reduce `MAX_TOKENS` to 1024
211
+
212
+ ---
213
+
214
+ ## Running QModel
215
+
216
+ ### Step-by-Step: Starting the API
217
+
218
+ 1. **Create `.env` file**:
219
+ ```bash
220
+ cp .env.example .env
221
+ # Edit .env and choose your backend (see Configuration section above)
222
+ ```
223
+
224
+ 2. **Start the backend service**:
225
+
226
+ **If using Ollama:**
227
+ ```bash
228
+ # Terminal 1: Start Ollama daemon
229
+ ollama serve
230
+
231
+ # Terminal 2: Pull a model (first time only)
232
+ ollama pull llama2 # or: mistral, neural-chat
233
+ ```
234
+
235
+ **If using HuggingFace:**
236
+ - No separate service needed, models download automatically
237
+
238
+ 3. **Start QModel API**:
239
+ ```bash
240
+ python main.py
241
+ ```
242
+
243
+ API available at `http://localhost:8000`
244
+
245
+ View interactive docs: `http://localhost:8000/docs`
246
+
247
+ ### Docker Option
248
+
249
+ ```bash
250
+ # Configure your backend in .env (see Configuration section)
251
+ cp .env.example .env
252
+ nano .env # Choose LLM_BACKEND=ollama or hf
253
+
254
+ # Run with Docker Compose
255
+ docker-compose up
256
+ ```
257
+
258
+ For full Docker documentation (including production deployment, troubleshooting, and multi-container setup), see **[DOCKER.md](DOCKER.md)**.
259
+
260
+ ---
261
+
262
+ ## API Endpoints
263
+
264
+ ### Main Query Endpoint
265
+
266
+ ```bash
267
+ GET /ask?q=<question>&top_k=5&source_type=<filter>&grade_filter=<filter>
268
+ ```
269
+
270
+ **Parameters:**
271
+ - `q` (required): Your Islamic question
272
+ - `top_k`: Number of sources to retrieve (1-20, default: 5)
273
+ - `source_type`: Filter by source type
274
+ - `quran` — Quranic verses only
275
+ - `hadith` — Hadiths only
276
+ - `null` (default) — Both
277
+ - `grade_filter`: Filter Hadith by authenticity grade
278
+ - `sahih` — Only Sahih-graded Hadiths
279
+ - `hasan` — Sahih + Hasan
280
+ - `null` (default) — All grades
281
+
282
+ **Example Requests:**
283
+
284
+ ```bash
285
+ # General question
286
+ curl "http://localhost:8000/ask?q=What%20does%20Islam%20say%20about%20mercy?"
287
+
288
+ # Quran-only with word frequency
289
+ curl "http://localhost:8000/ask?q=How%20many%20times%20is%20mercy%20mentioned?&source_type=quran"
290
+
291
+ # Authentic Hadiths only
292
+ curl "http://localhost:8000/ask?q=Hadiths%20about%20prayer&source_type=hadith&grade_filter=sahih"
293
+ ```
294
+
295
+ **Response:**
296
+ ```json
297
+ {
298
+ "question": "What does Islam say about mercy?",
299
+ "answer": "Islam emphasizes mercy as a core value...",
300
+ "language": "english",
301
+ "intent": "general",
302
+ "analysis": null,
303
+ "sources": [
304
+ {
305
+ "source": "Surah Al-Baqarah 2:178",
306
+ "type": "quran",
307
+ "grade": null,
308
+ "arabic": "...",
309
+ "english": "...",
310
+ "_score": 0.876
311
+ }
312
+ ],
313
+ "top_score": 0.876,
314
+ "latency_ms": 342
315
+ }
316
+ ```
317
+
318
+ ---
319
+
320
+ ### Hadith Verification Endpoint
321
+
322
+ ```bash
323
+ GET /hadith/verify?q=<hadith_text>&collection=<filter>
324
+ ```
325
+
326
+ **Purpose:** Quick authenticity check for a Hadith
327
+
328
+ **Example:**
329
+ ```bash
330
+ curl "http://localhost:8000/hadith/verify?q=Actions%20are%20judged%20by%20intentions"
331
+ ```
332
+
333
+ **Response:**
334
+ ```json
335
+ {
336
+ "query": "Actions are judged by intentions",
337
+ "found": true,
338
+ "collection": "Sahih al-Bukhari",
339
+ "grade": "Sahih",
340
+ "reference": "Sahih al-Bukhari 1",
341
+ "arabic": "إنما الأعمال بالنيات",
342
+ "english": "Verily, actions are judged by intentions...",
343
+ "latency_ms": 156
344
+ }
345
+ ```
346
+
347
+ ---
348
+
349
+ ### Debug Endpoint
350
+
351
+ ```bash
352
+ GET /debug/scores?q=<question>&top_k=10
353
+ ```
354
+
355
+ **Purpose:** Inspect raw retrieval scores without LLM call. Use to calibrate `CONFIDENCE_THRESHOLD`.
356
+
357
+ **Example:**
358
+ ```bash
359
+ curl "http://localhost:8000/debug/scores?q=patience&top_k=10"
360
+ ```
361
+
362
+ **Response:**
363
+ ```json
364
+ {
365
+ "intent": "general",
366
+ "threshold": 0.3,
367
+ "results": [
368
+ {
369
+ "rank": 1,
370
+ "source": "Surah Al-Baqarah 2:45",
371
+ "type": "quran",
372
+ "grade": null,
373
+ "_dense": 0.8234,
374
+ "_sparse": 0.5421,
375
+ "_score": 0.7234
376
+ }
377
+ ]
378
+ }
379
+ ```
380
+
381
+ Use this to fine-tune `CONFIDENCE_THRESHOLD`. If queries you expect to work have `_score < threshold`, lower the threshold.
382
+
383
+ ---
384
+
385
+ ### Health & Metadata
386
+
387
+ ```bash
388
+ # Health check
389
+ curl http://localhost:8000/health
390
+
391
+ # List available models
392
+ curl http://localhost:8000/v1/models
393
+
394
+ # Interactive API docs
395
+ http://localhost:8000/docs
396
+ ```
397
+
398
+ ---
399
+
400
+ ## Query Examples
401
+
402
+ ### 1. Word Frequency Analysis
403
+
404
+ **Question:** "How many times is the word 'mercy' mentioned in the Quran?"
405
+
406
+ **System detects:** `intent=count`
407
+
408
+ **Response includes:**
409
+ ```json
410
+ {
411
+ "analysis": {
412
+ "keyword": "mercy",
413
+ "total_count": 87,
414
+ "by_surah": {
415
+ "2": {"name": "Al-Baqarah", "count": 12},
416
+ "7": {"name": "Al-A'raf", "count": 8},
417
+ ...
418
+ }
419
+ }
420
+ }
421
+ ```
422
+
423
+ ---
424
+
425
+ ### 2. Topic-Based Aya Retrieval
426
+
427
+ **Question:** "What does the Quran say about patience?"
428
+
429
+ **System detects:** `intent=tafsir`
430
+
431
+ **Response:**
432
+ - Retrieves top 5 verses about patience
433
+ - LLM explains each with Tafsir
434
+ - Shows interconnections between verses
435
+
436
+ ---
437
+
438
+ ### 3. Hadith Authentication
439
+
440
+ **Question:** "Is the Hadith 'Actions are judged by intentions' authentic?"
441
+
442
+ **System detects:** `intent=auth`
443
+
444
+ **LLM response:**
445
+ - "Yes, this is found in Sahih al-Bukhari 1"
446
+ - "Grade: Sahih (authentic)"
447
+ - "Explanation: This Hadith establishes the principle of intention..."
448
+
449
+ ---
450
+
451
+ ### 4. Bilingual Support
452
+
453
+ **Arabic Question:** "ما أهمية الصبر في الإسلام؟"
454
+
455
+ **System detects:** Language = arabic
456
+
457
+ **Response:** Full Arabic response with proper vocalization
458
+
459
+ ---
460
+
461
+ ## Tuning & Optimization
462
+
463
+ ### Confidence Threshold
464
+
465
+ The `CONFIDENCE_THRESHOLD` (default 0.30) controls when to call the LLM:
466
+
467
+ - **Too high (e.g., 0.70)**: Many queries rejected as "not found" (safer but less helpful)
468
+ - **Too low (e.g., 0.10)**: LLM called on weak matches (more hallucinations)
469
+ - **Sweet spot (0.30-0.50)**: Most queries get through, but low-quality matches rejected
470
+
471
+ **To calibrate:**
472
+ 1. Run `/debug/scores` on representative queries
473
+ 2. Check what `_score` values are returned
474
+ 3. Adjust `CONFIDENCE_THRESHOLD` in `.env`
475
+ 4. Restart service
476
+
477
+ ---
478
+
479
+ ### Temperature
480
+
481
+ - **0.0**: Deterministic (best for factual Islamic answers)
482
+ - **0.2**: Slightly creative (default)
483
+ - **0.5+**: More creative (not recommended for religious content)
484
+
485
+ ---
486
+
487
+ ### Model Selection
488
+
489
+ #### For Development (Ollama)
490
+ - **llama3.1** — Fastest, good quality, easy setup
491
+ - **mistral** — Better Arabic, slightly slower
492
+ - **neural-chat** — Good balance
493
+
494
+ ```bash
495
+ ollama pull llama3.1
496
+ OLLAMA_MODEL=llama3.1 python main.py
497
+ ```
498
+
499
+ #### For Production (HuggingFace)
500
+ - **Qwen/Qwen2-7B-Instruct** — Strong Arabic, 7B params
501
+ - **mistralai/Mistral-7B-Instruct-v0.2** — Very capable
502
+ - **meta-llama/Llama-2-13b-chat-hf** — Larger, better quality (requires HF token)
503
+
504
+ ```bash
505
+ HF_MODEL_NAME=Qwen/Qwen2-7B-Instruct python main.py
506
+ ```
507
+
508
+ ---
509
+
510
+ ## Troubleshooting
511
+
512
+ ### Issue: "Service is still initialising"
513
+
514
+ **Solution:** Wait 60-90 seconds for embedding model to load. Check logs:
515
+ ```bash
516
+ tail -f <logfile>
517
+ ```
518
+
519
+ ### Issue: Low retrieval scores
520
+
521
+ **Cause:** Query wording doesn't match the dataset's language or phrasing well
522
+
523
+ **Solution:**
524
+ 1. Check `/debug/scores` output
525
+ 2. Ensure query is in Arabic or clear English
526
+ 3. Try synonyms (e.g., "mercy" vs "compassion")
527
+ 4. Lower `CONFIDENCE_THRESHOLD` in `.env`
528
+
529
+ ### Issue: LLM model not found (HF backend)
530
+
531
+ **Solution:**
532
+ ```bash
533
+ huggingface-cli login
534
+ export HF_TOKEN=<your_token>
535
+ ```
536
+
537
+ ### Issue: Out of memory
538
+
539
+ **Solution:**
540
+ - Use `OLLAMA_MODEL=neural-chat` (smaller)
541
+ - Set `HF_DEVICE=cpu` (slower but uses RAM instead of VRAM)
542
+ - Reduce `TOP_K_SEARCH` in `.env`
543
+
544
+ ---
545
+
546
+ ## Production Checklist
547
+
548
+ - [ ] Test with at least 10 representative queries
549
+ - [ ] Verify `/debug/scores` on low-confidence queries
550
+ - [ ] Adjust `CONFIDENCE_THRESHOLD` to acceptable false-positive rate
551
+ - [ ] Set `ALLOWED_ORIGINS` to your domain only (security)
552
+ - [ ] Use production-grade LLM model (Qwen 7B+ or Mistral)
553
+ - [ ] Set `TEMPERATURE=0.1` for maximum consistency
554
+ - [ ] Monitor first 100 queries for quality
555
+ - [ ] Enable access logging and error tracking
556
+
557
+ ---
558
+
559
+ ## Architecture Files
560
+
561
+ - **main.py** — Core API + RAG pipeline (LLM backend abstraction, retrieval, generation)
562
+ - **build_index.py** — FAISS index generation from metadata
563
+ - **enrich_dataset.py** — Dataset enrichment script (fetch hadith collections, deduplicate)
564
+ - **metadata.json** — Combined dataset: 6,236 Quran verses + 41,390 hadiths
565
+ - **QModel.index** — FAISS vector index (pre-built, ready to use)
566
+ - **ARCHITECTURE.md** — Detailed system design
567
+ - **requirements.txt** — Python dependencies
568
+
569
+ ---
570
+
571
+ ## Next Steps
572
+
573
+ After setup, consider:
574
+ 1. Grade filtering: Try `?grade_filter=sahih` for authenticated-only results
575
+ 2. Source filtering: Use `?source_type=quran` vs `?source_type=hadith`
576
+ 3. Batch processing: Add endpoint for multiple questions
577
+ 4. Webhook integration: Stream answers as they generate
578
+ 5. Caching improvements: Persistent Redis cache for production
579
+
580
+ ---
581
+
582
+ ## Support
583
+
584
+ For issues:
585
+ 1. Check logs: `python main.py` (stdout)
586
+ 2. Test endpoints: http://localhost:8000/docs
587
+ 3. Review `/debug/scores` for retrieval quality
588
+ 4. Check `.env` configuration
589
+
590
+ Happy querying! 🕌
build_index.py CHANGED
@@ -1,69 +1,79 @@
 
 
 
 
 
 
1
  import json
2
- import time
3
  import numpy as np
 
4
  import faiss
5
  from sentence_transformers import SentenceTransformer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- # ── Config ─────────────────────────────────────────────────────────────────────
8
- EMBED_MODEL = "intfloat/multilingual-e5-large"
9
- BATCH_SIZE = 128 # Increase to 256 if you have ≥16 GB RAM and no GPU OOM
10
- SHOW_PROGRESS = True # tqdm progress bar per batch
11
-
12
- # ── Load model ─────────────────────────────────────────────────────────────────
13
- print(f"⏳ Loading model: {EMBED_MODEL}")
14
- t0 = time.perf_counter()
15
- model = SentenceTransformer(EMBED_MODEL)
16
- print(f"✅ Model loaded in {time.perf_counter()-t0:.1f}s")
17
-
18
- # ── Load data ──────────────────────────────────────────────────────────────────
19
- with open("data/quran.json", "r", encoding="utf-8") as f:
20
- quran = json.load(f)
21
- for item in quran:
22
- item["type"] = "quran"
23
-
24
- with open("data/hadith.json", "r", encoding="utf-8") as f:
25
- hadith = json.load(f)
26
- for item in hadith:
27
- item["type"] = "hadith"
28
-
29
- data = quran + hadith
30
- print(f"📊 Dataset: {len(quran):,} Quran verses + {len(hadith):,} Hadiths = {len(data):,} items")
31
-
32
- # ── Build text pairs ────────────────────────────────────────────────────────────
33
- # Each item → 2 texts (Arabic + English), indexed as item_idx * 2 and item_idx * 2 + 1
34
- texts = []
35
- for item in data:
36
- source = item.get("source") or item.get("reference") or ""
37
- texts.append(f"passage: {source} Arabic: {item['arabic']}")
38
- texts.append(f"passage: {source} English: {item['english']}")
39
-
40
- print(f"📝 Encoding {len(texts):,} texts (batch_size={BATCH_SIZE}) …")
41
- t1 = time.perf_counter()
42
-
43
- # ── Encode ───────────────────────────────────────────────────────────────��─────
44
- # show_progress_bar gives a tqdm bar so you can see throughput + ETA
45
- embeddings = model.encode(
46
- texts,
47
- batch_size=BATCH_SIZE,
48
- normalize_embeddings=True,
49
- show_progress_bar=SHOW_PROGRESS,
50
- convert_to_numpy=True,
51
- )
52
-
53
- elapsed = time.perf_counter() - t1
54
- rate = len(texts) / elapsed
55
- print(f"\n✅ Encoded {len(texts):,} texts in {elapsed:.0f}s ({rate:.0f} texts/sec)")
56
-
57
- # ── Build FAISS index ──────────────────────────────────────────────────────────
58
- print("🔨 Building FAISS index …")
59
- dim = embeddings.shape[1]
60
- index = faiss.IndexFlatIP(dim)
61
- index.add(embeddings.astype("float32")) # IP needs float32
62
- faiss.write_index(index, "QModel.index")
63
- print(f"✅ FAISS index saved (vectors: {index.ntotal:,}, dim: {dim})")
64
-
65
- # ── Save metadata ──────────────────────────────────────────────────────────────
66
- with open("metadata.json", "w", encoding="utf-8") as f:
67
- json.dump(data, f, ensure_ascii=False, indent=2)
68
- print("✅ metadata.json saved")
69
- print(f"\n🎉 Index built in {time.perf_counter()-t0:.0f}s total")
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Regenerate FAISS index with enriched metadata.
4
+ This script loads the enriched metadata and generates embeddings for all documents.
5
+ """
6
+
7
  import json
 
8
  import numpy as np
9
+ from pathlib import Path
10
  import faiss
11
  from sentence_transformers import SentenceTransformer
12
+ from tqdm import tqdm
13
+
14
def generate_embeddings(
    model_name: str = "intfloat/multilingual-e5-large",
    metadata_path: str = "metadata.json",
    index_path: str = "QModel.index",
) -> None:
    """Embed every document in *metadata_path* and write a FAISS index.

    Quran entries are embedded from surah name + English translation;
    hadith entries from collection + Arabic + English text.  Vectors are
    L2-normalised so the inner-product index behaves as cosine similarity.

    Args:
        model_name: SentenceTransformer model id to use for embeddings.
        metadata_path: Input JSON (as produced by enrich_dataset.py).
            Relative by default so the script runs on any machine
            (previously a hard-coded /Users/... path).
        index_path: Output FAISS index file.
    """
    metadata_file = Path(metadata_path)
    index_file = Path(index_path)

    # Load metadata
    print("Loading metadata...")
    with open(metadata_file, "r", encoding="utf-8") as f:
        documents = json.load(f)
    print(f"Total documents: {len(documents)}")

    # Load embedding model
    print(f"\nLoading embedding model: {model_name}")
    model = SentenceTransformer(model_name)
    embedding_dim = model.get_sentence_embedding_dimension()
    print(f"Embedding dimension: {embedding_dim}")

    # Prepare texts for embedding.  E5-family models expect a "passage: "
    # prefix on indexed documents (and "query: " at search time); the
    # previous index was built with this prefix, so keep it for retrieval
    # quality and consistency with main.py's query side.
    all_texts = []
    for doc in documents:
        if doc.get("type") == "quran":
            # For Quran: surah name + English meaning
            text = f"{doc.get('surah_name_en', '')} {doc.get('english', '')}"
        else:  # hadith
            # For Hadith: collection + Arabic + English for better matching
            text = f"{doc.get('collection', '')} {doc.get('arabic', '')} {doc.get('english', '')}"
        all_texts.append(f"passage: {text.strip()}")

    # Let sentence-transformers handle batching and progress reporting
    # instead of a manual batch loop.
    print(f"\nGenerating embeddings for {len(all_texts)} documents...")
    embeddings = model.encode(
        all_texts,
        batch_size=32,
        show_progress_bar=True,
        convert_to_numpy=True,
    ).astype(np.float32)
    print(f"Generated embeddings shape: {embeddings.shape}")

    # Build the FAISS index: inner product over L2-normalised vectors
    # is equivalent to cosine similarity.
    print("\nCreating FAISS index...")
    index = faiss.IndexFlatIP(embedding_dim)
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    # Save index
    print(f"Saving FAISS index to {index_file}")
    faiss.write_index(index, str(index_file))

    divider = "=" * 60
    print(f"\n{divider}")
    print("Index Generation Complete")
    print(divider)
    print(f"Documents indexed: {len(documents)}")
    print(f"Embeddings generated: {len(embeddings)}")
    print(f"Index file size: {index_file.stat().st_size / (1024*1024):.2f} MB")
    print(f"Index capacity: {index.ntotal}")
    print(divider)


if __name__ == "__main__":
    generate_embeddings()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docker-compose.yml CHANGED
@@ -1,16 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  services:
2
  qmodel:
3
  build: .
 
4
  ports:
5
  - "8000:8000"
6
  env_file:
7
  - .env
8
  environment:
9
- - HF_TOKEN=${HF_TOKEN}
10
- - OLLAMA_HOST=http://host.docker.internal:11434
 
 
11
  volumes:
 
12
  - .:/app
13
- # Restart policy
14
- restart: always
 
 
15
  extra_hosts:
 
16
  - "host.docker.internal:host-gateway"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # QModel Docker Compose Configuration
2
+ # ====================================
3
+ # Configure via .env file:
4
+ # LLM_BACKEND=ollama (default: local Ollama on host machine)
5
+ # LLM_BACKEND=hf (HuggingFace backend)
6
+ #
7
+ # Usage:
8
+ # docker-compose up # Uses backend from .env
9
+ # docker-compose up -d # Run in background
10
+ # docker-compose logs -f # View logs
11
+ # docker-compose down # Stop services
12
+
13
+ version: "3.8"
14
+
15
  services:
16
  qmodel:
17
  build: .
18
+ container_name: qmodel-api
19
  ports:
20
  - "8000:8000"
21
  env_file:
22
  - .env
23
  environment:
24
+ # Pass through HF token if using HuggingFace backend
25
+ - HF_TOKEN=${HF_TOKEN:-}
26
+ # Ollama host: use Docker host IP for local Ollama
27
+ - OLLAMA_HOST=${OLLAMA_HOST:-http://host.docker.internal:11434}
28
  volumes:
29
+ # Mount current directory for live code changes (development)
30
  - .:/app
31
+ # Cache HuggingFace models to avoid re-downloading
32
+ - huggingface_cache:/root/.cache/huggingface
33
+ # Restart automatically if container exits
34
+ restart: on-failure:3
35
  extra_hosts:
36
+ # Allow container to reach host.docker.internal on Mac/Windows
37
  - "host.docker.internal:host-gateway"
38
+ networks:
39
+ - qmodel-network
40
+ # Health check for orchestration
41
+ healthcheck:
42
+ test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
43
+ interval: 30s
44
+ timeout: 10s
45
+ retries: 3
46
+ start_period: 60s
47
+
48
+ networks:
49
+ qmodel-network:
50
+ driver: bridge
51
+
52
+ volumes:
53
+ # Persistent cache for HuggingFace models
54
+ huggingface_cache:
enrich_dataset.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to enrich the QModel dataset with hadith collections from GitHub.
4
+ Fetches Musnad Ahmad and other major hadith collections from:
5
+ https://github.com/AhmedBaset/hadith-json/tree/main/db/by_book/the_9_books
6
+ """
7
+
8
+ import json
9
+ import requests
10
+ from typing import Dict, List
11
+ from collections import defaultdict
12
+
13
+ # The 9 canonical hadith books
14
+ HADITH_BOOKS = {
15
+ "ahmed.json": {
16
+ "collection": "Musnad Ahmad",
17
+ "id_prefix": "ahmad",
18
+ "grade": "Hasan/Sahih",
19
+ "author": "Imam Ahmad ibn Hanbal"
20
+ },
21
+ "bukhari.json": {
22
+ "collection": "Sahih al-Bukhari",
23
+ "id_prefix": "bukhari",
24
+ "grade": "Sahih",
25
+ "author": "Muhammad al-Bukhari"
26
+ },
27
+ "muslim.json": {
28
+ "collection": "Sahih Muslim",
29
+ "id_prefix": "muslim",
30
+ "grade": "Sahih",
31
+ "author": "Muslim ibn al-Hajjaj"
32
+ },
33
+ "abudawud.json": {
34
+ "collection": "Sunan Abu Dawood",
35
+ "id_prefix": "abudawud",
36
+ "grade": "Hasan",
37
+ "author": "Abu Dawood Sulaiman"
38
+ },
39
+ "tirmidhi.json": {
40
+ "collection": "Jami' at-Tirmidhi",
41
+ "id_prefix": "tirmidhi",
42
+ "grade": "Hasan",
43
+ "author": "Al-Tirmidhi"
44
+ },
45
+ "ibnmajah.json": {
46
+ "collection": "Sunan Ibn Majah",
47
+ "id_prefix": "ibnmajah",
48
+ "grade": "Hasan",
49
+ "author": "Ibn Majah al-Qazwini"
50
+ },
51
+ "nasai.json": {
52
+ "collection": "Sunan an-Nasai",
53
+ "id_prefix": "nasai",
54
+ "grade": "Sahih",
55
+ "author": "Ahmad al-Nasai"
56
+ },
57
+ "malik.json": {
58
+ "collection": "Muwatta Malik",
59
+ "id_prefix": "malik",
60
+ "grade": "Sahih",
61
+ "author": "Malik ibn Anas"
62
+ },
63
+ "darimi.json": {
64
+ "collection": "Sunan al-Darimi",
65
+ "id_prefix": "darimi",
66
+ "grade": "Hasan",
67
+ "author": "Al-Darimi"
68
+ }
69
+ }
70
+
71
+ BASE_URL = "https://raw.githubusercontent.com/AhmedBaset/hadith-json/main/db/by_book/the_9_books"
72
+
73
+
74
def fetch_hadith_book(filename: str) -> Dict:
    """Download one hadith book JSON from the hadith-json GitHub repository.

    Args:
        filename: Book file name, e.g. "bukhari.json".

    Returns:
        The parsed JSON payload of the book.

    Raises:
        requests.HTTPError: If the download fails (non-2xx response).
    """
    # Fixed garbled "(unknown)" placeholders: interpolate the actual
    # filename into both the URL and the progress message.
    url = f"{BASE_URL}/{filename}"
    print(f"Fetching {filename}...")
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.json()
81
+
82
+
83
def transform_hadith(hadith: Dict, book_config: Dict, book_data: Dict) -> Dict:
    """Convert a hadith record from the GitHub JSON layout into our metadata schema."""

    # Resolve the Arabic chapter title, if the hadith points at one.
    chapter_name = ""
    if "chapterId" in hadith:
        wanted = hadith.get("chapterId")
        chapter_name = next(
            (
                ch.get("arabic", "")
                for ch in book_data.get("chapters", [])
                if ch.get("id") == wanted
            ),
            "",
        )

    # Human-readable citation, e.g. "Sahih al-Bukhari 1".
    hadith_num = hadith.get("idInBook", hadith.get("id", ""))
    reference = f"{book_config['collection']} {hadith_num}"

    # English text may be a {narrator, text} object or a plain string.
    eng = hadith.get("english")
    if isinstance(eng, dict):
        english = " ".join(eng[key] for key in ("narrator", "text") if eng.get(key))
    else:
        english = str(hadith.get("english", ""))

    return {
        "id": f"{book_config['id_prefix']}_{hadith_num}",
        "arabic": hadith.get("arabic", ""),
        "english": english,
        "reference": reference,
        "hadith_number": hadith_num,
        "collection": book_config["collection"],
        "chapter": chapter_name,
        "grade": "",  # Will be inferred by main.py's infer_hadith_grade()
        "type": "hadith",
        "author": book_config["author"],
    }
121
+
122
+
123
def load_existing_metadata(filepath: str) -> List[Dict]:
    """Read and parse the existing metadata JSON file at *filepath*."""
    print(f"Loading existing metadata from {filepath}...")
    with open(filepath, 'r', encoding='utf-8') as fh:
        return json.load(fh)
128
+
129
+
130
def save_enriched_metadata(filepath: str, data: List[Dict], stats: Dict) -> None:
    """Write the merged dataset to *filepath* and print a per-collection summary."""
    print(f"Saving enriched metadata to {filepath}...")
    with open(filepath, 'w', encoding='utf-8') as fh:
        json.dump(data, fh, ensure_ascii=False, indent=2)

    divider = "=" * 60
    print("\n" + divider)
    print("Dataset Enrichment Summary")
    print(divider)
    print(f"Total documents: {len(data)}")
    print(f"\nBreakdown by collection:")
    for name, count in sorted(stats.items()):
        print(f"  {name}: {count}")
    print(divider)
144
+
145
+
146
def main():
    """Main enrichment process: fetch each hadith book, merge new entries, save.

    Loads the current metadata, seeds per-collection statistics, downloads
    each of the 9 canonical books, deduplicates against existing IDs, and
    writes the merged dataset back to metadata.json.
    """
    # Relative path so the script works from the project root on any
    # machine (previously a hard-coded /Users/... path).
    metadata_path = "metadata.json"
    existing_data = load_existing_metadata(metadata_path)

    # Track which hadith IDs we already have, for deduplication.
    existing_ids = {item["id"] for item in existing_data if item.get("type") == "hadith"}
    print(f"Existing hadith entries: {len(existing_ids)}")

    new_hadiths = []
    stats = defaultdict(int)

    # Seed stats from what is already in the dataset.
    for item in existing_data:
        if item.get("type") == "quran":
            stats["Quran"] += 1
        elif item.get("type") == "hadith":
            stats[item.get("collection", "Unknown")] += 1

    # Fetch and process each hadith book; one failing book does not abort
    # the rest (best-effort, logged per file).
    for filename, book_config in HADITH_BOOKS.items():
        try:
            book_data = fetch_hadith_book(filename)
            hadiths = book_data.get("hadiths", [])

            skipped = 0
            added = 0
            for hadith in hadiths:
                transformed = transform_hadith(hadith, book_config, book_data)
                if transformed["id"] in existing_ids:
                    skipped += 1
                    continue
                new_hadiths.append(transformed)
                existing_ids.add(transformed["id"])
                added += 1

            stats[book_config["collection"]] += added
            # Fixed garbled "(unknown)" placeholder: report the actual file.
            print(f"  ✓ {filename}: {added} new hadiths added, {skipped} already exist")

        except Exception as e:
            print(f"  ✗ Error fetching {filename}: {e}")

    # Merge and persist.
    enriched_data = existing_data + new_hadiths
    print(f"\nTotal new hadiths added: {len(new_hadiths)}")
    print(f"Total documents after enrichment: {len(enriched_data)}")
    save_enriched_metadata(metadata_path, enriched_data, stats)


if __name__ == "__main__":
    main()
main.py CHANGED
@@ -1,15 +1,23 @@
1
  """
2
- QModel v3.1 — Islamic RAG API
3
- Fixes over v3:
4
- Confidence gate: blocks LLM call when top retrieval score is too low →
5
- returns a safe "not in dataset" answer instead of hallucinating
6
- • Hardened anti-hallucination prompt: explicit rule against reconstructing
7
- or completing any Hadith from memory; citation must match context verbatim
8
- Hadith type-boost: intent=hadith raises _score for Hadith items so they
9
- are not outranked by Quran verses on Hadith-specific queries
10
- top_score exposed in AskResponse and /v1/chat/completions x_metadata
11
- so callers can implement their own confidence thresholds
12
- Few-shot example updated to show a correct "not found" refusal path
 
 
 
 
 
 
 
 
13
  """
14
 
15
  from __future__ import annotations
@@ -23,14 +31,14 @@ import re
23
  import time
24
  from collections import Counter, OrderedDict
25
  from contextlib import asynccontextmanager
26
- from typing import Dict, List, Optional
27
 
28
  import faiss
29
  import numpy as np
30
  from dotenv import load_dotenv
31
  from fastapi import FastAPI, HTTPException, Query
32
  from fastapi.middleware.cors import CORSMiddleware
33
- import ollama
34
  from pydantic import BaseModel, Field, validator
35
  from sentence_transformers import SentenceTransformer
36
 
@@ -47,42 +55,175 @@ logger = logging.getLogger("qmodel")
47
 
48
 
49
  # ═══════════════════════════════════════════════════════════════════════
50
- # CONFIG
51
  # ═══════════════════════════════════════════════════════════════════════
52
  class Config:
 
 
 
 
 
 
 
 
 
 
 
53
  OLLAMA_HOST: str = os.getenv("OLLAMA_HOST", "http://localhost:11434")
54
- LLM_MODEL: str = os.getenv("LLM_MODEL", "minimax-m2.7:cloud")
55
- EMBED_MODEL: str = os.getenv("EMBED_MODEL", "intfloat/multilingual-e5-large")
56
- FAISS_INDEX: str = os.getenv("FAISS_INDEX", "QModel.index")
57
- METADATA_FILE: str = os.getenv("METADATA_FILE", "metadata.json")
58
- TOP_K_SEARCH: int = int(os.getenv("TOP_K_SEARCH", 20))
59
- TOP_K_RETURN: int = int(os.getenv("TOP_K_RETURN", 5))
60
- MAX_TOKENS: int = int(os.getenv("MAX_TOKENS", 2048))
61
- TEMPERATURE: float = float(os.getenv("TEMPERATURE", 0.2))
62
- CACHE_SIZE: int = int(os.getenv("CACHE_SIZE", 512))
63
- CACHE_TTL: int = int(os.getenv("CACHE_TTL", 3600))
64
- RERANK_ALPHA: float = float(os.getenv("RERANK_ALPHA", 0.6))
65
- ALLOWED_ORIGINS: str = os.getenv("ALLOWED_ORIGINS", "*")
66
- MAX_EXAMPLES: int = int(os.getenv("MAX_EXAMPLES", 3))
67
- # ── NEW: minimum hybrid score to allow an LLM answer ──────────────
68
- # Below this threshold the pipeline returns a safe "not found" reply
69
- # without calling the LLM at all, preventing hallucination.
70
- # Tune upward (e.g. 0.70) to be stricter; downward (0.50) to be looser.
71
- CONFIDENCE_THRESHOLD: float = float(os.getenv("CONFIDENCE_THRESHOLD", 0.30))
72
- # ── NEW: score bonus applied to Hadith items when intent == "hadith"
73
- # Prevents Quran verses from outranking relevant Hadiths on Sunnah queries.
 
 
 
74
  HADITH_BOOST: float = float(os.getenv("HADITH_BOOST", 0.08))
75
 
 
 
 
 
 
 
 
 
 
76
  cfg = Config()
77
 
78
- # ─────────────────────────────────────────────────────────────────────────────
79
- # MODEL FALLBACK CHAIN
80
- # ─────────────────────────────────────────────────────────────────────────────
81
- _FALLBACK_MODELS: List[str] = [
82
- "minimax-m2.7:cloud", # primary — 14 GB, best quality
83
- "gavtoken/minimax:latest", # cloud fallback — strong Arabic
84
- "llama3.1:latest", # local fallback — 4.9 GB
85
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
 
88
  # ═══════════════════════════════════════════════════════════════════════
@@ -126,43 +267,10 @@ analysis_cache = TTLCache(maxsize=cfg.CACHE_SIZE, ttl=cfg.CACHE_TTL)
126
  rewrite_cache = TTLCache(maxsize=cfg.CACHE_SIZE, ttl=cfg.CACHE_TTL * 6)
127
 
128
 
129
- # ═══════════════════════════════════════════════════════════════════════
130
- # RESILIENT LLM CALLER — auto-fallback across Ollama models
131
- # ═══════════════════════════════════════════════════════════════════════
132
- def chat_with_fallback(
133
- messages: List[dict],
134
- max_tokens: int = cfg.MAX_TOKENS,
135
- temperature: float = cfg.TEMPERATURE,
136
- ) -> str:
137
- primary = cfg.LLM_MODEL
138
- models = [primary] + [m for m in _FALLBACK_MODELS if m != primary]
139
-
140
- last_err: Exception = RuntimeError("No Ollama models available")
141
- for model in models:
142
- try:
143
- logger.info("LLM → %s (Ollama)", model)
144
- client = ollama.Client(host=cfg.OLLAMA_HOST)
145
- response = client.chat(
146
- model=model,
147
- messages=messages,
148
- options={"num_predict": max_tokens, "temperature": temperature},
149
- )
150
- content = response["message"]["content"].strip()
151
- if content:
152
- if model != primary:
153
- logger.warning("Fell back to: %s", model)
154
- return content
155
- except Exception as exc:
156
- logger.error("Skip %s — %s", model, exc)
157
- last_err = exc
158
-
159
- raise RuntimeError(f"All LLM models failed. Last error: {last_err}")
160
-
161
-
162
  # ═══════════════════════════════════════════════════════════════════════
163
  # ARABIC NLP — normalisation + light stemming
164
  # ═══════════════════════════════════════════════════════════════════════
165
- _DIACRITICS = re.compile(r"[\u064B-\u0652\u0670\u0671\u0653\u0654\u0655]")
166
  _ALEF_VARS = re.compile(r"[أإآٱ]")
167
  _WAW_HAMZA = re.compile(r"ؤ")
168
  _YA_HAMZA = re.compile(r"ئ")
@@ -177,12 +285,11 @@ _SPELLING_MAP: Dict[str, str] = {
177
  "قران": "قرآن",
178
  "القران": "القرآن",
179
  "اللہ": "الله",
180
- "الرّحمن": "الرحمن",
181
- "محمّد": "محمد",
182
  }
183
 
184
 
185
  def normalize_arabic(text: str, *, aggressive: bool = False) -> str:
 
186
  text = _DIACRITICS.sub("", text)
187
  text = _TATWEEL.sub("", text)
188
  text = _ALEF_VARS.sub("ا", text)
@@ -207,12 +314,14 @@ _AR_SUFFIXES = re.compile(
207
 
208
 
209
  def light_stem(word: str) -> str:
 
210
  w = _AR_PREFIXES.sub("", word)
211
  w = _AR_SUFFIXES.sub("", w)
212
  return w if len(w) >= 2 else word
213
 
214
 
215
  def tokenize_ar(text: str) -> List[str]:
 
216
  norm = normalize_arabic(text, aggressive=True).lower()
217
  return [light_stem(t) for t in norm.split() if t]
218
 
@@ -225,7 +334,8 @@ _ARABIC_SCRIPT = re.compile(
225
  )
226
 
227
 
228
- def detect_language(text: str) -> str:
 
229
  ar = len(_ARABIC_SCRIPT.findall(text))
230
  en = len(re.findall(r"[a-zA-Z]", text))
231
  tot = ar + en or 1
@@ -238,6 +348,7 @@ def detect_language(text: str) -> str:
238
 
239
 
240
  def language_instruction(lang: str) -> str:
 
241
  return {
242
  "arabic": (
243
  "يجب أن تكون الإجابة كاملةً باللغة العربية الفصحى تماماً. "
@@ -263,19 +374,26 @@ Reply ONLY with a valid JSON object — no markdown, no preamble:
263
  "ar_query": "<query in clear Arabic فصحى, ≤25 words>",
264
  "en_query": "<query in clear English, ≤25 words>",
265
  "keywords": ["<3-7 key Arabic or English terms from the question>"],
266
- "intent": "<one of: fatwa | tafsir | hadith | count | general>"
267
  }
268
 
269
- Rules:
270
- - Fix spelling errors (e.g. "quran" "Quran", ران" "قرآن").
271
- - Expand abbreviations (e.g. "pbuh" "peace be upon him / صلى الله عليه وسلم").
272
- - Do NOT answer the question only rephrase it for search.
273
- - 'count' intent = user wants the frequency or number of occurrences of a word/name.
274
- - 'hadith' intent = user quotes or asks about a specific Hadith text or its authenticity.
 
 
 
 
 
 
275
  """
276
 
277
 
278
- async def rewrite_query(raw: str) -> Dict:
 
279
  cached = await rewrite_cache.get(raw)
280
  if cached:
281
  return cached
@@ -287,7 +405,7 @@ async def rewrite_query(raw: str) -> Dict:
287
  "intent": "general",
288
  }
289
  try:
290
- text = chat_with_fallback(
291
  messages=[
292
  {"role": "system", "content": _REWRITE_SYSTEM},
293
  {"role": "user", "content": raw},
@@ -300,9 +418,7 @@ async def rewrite_query(raw: str) -> Dict:
300
  for k in ("ar_query", "en_query", "keywords", "intent"):
301
  result.setdefault(k, fallback[k])
302
  await rewrite_cache.set(result, raw)
303
- logger.info(
304
- "Rewrite: intent=%s ar=%s", result["intent"], result["ar_query"][:60]
305
- )
306
  return result
307
  except Exception as exc:
308
  logger.warning("Query rewrite failed (%s) — using fallback", exc)
@@ -310,7 +426,7 @@ async def rewrite_query(raw: str) -> Dict:
310
 
311
 
312
  # ═══════════════════════════════════════════════════════════════════════
313
- # INTENT DETECTION (frequency / count queries)
314
  # ═══════════════════════════════════════════════════════════════════════
315
  _COUNT_EN = re.compile(
316
  r"\b(how many|count|number of|frequency|occurrences? of|how often|"
@@ -321,15 +437,17 @@ _COUNT_AR = re.compile(
321
  r"(كم مرة|كم عدد|كم تكرر|عدد مرات|تكرار|كم ذُكر|كم وردت?)"
322
  )
323
 
324
- _INTENT_SYSTEM = """\
325
- You are an intent classifier for an Islamic Q&A system.
326
- Determine if the query asks for the COUNT or FREQUENCY of a specific word or name.
327
- Reply ONLY with valid JSON, no markdown:
328
- {"analysis": true, "keyword": "<exact Arabic or English word to count>"} or {"analysis": false}
329
- """
 
330
 
331
 
332
  async def detect_analysis_intent(query: str, rewrite: Dict) -> Optional[str]:
 
333
  if rewrite.get("intent") == "count":
334
  kws = rewrite.get("keywords", [])
335
  return kws[0] if kws else None
@@ -337,27 +455,13 @@ async def detect_analysis_intent(query: str, rewrite: Dict) -> Optional[str]:
337
  if not (_COUNT_EN.search(query) or _COUNT_AR.search(query)):
338
  return None
339
 
340
- try:
341
- raw = chat_with_fallback(
342
- messages=[
343
- {"role": "system", "content": _INTENT_SYSTEM},
344
- {"role": "user", "content": query},
345
- ],
346
- max_tokens=60,
347
- temperature=0.0,
348
- )
349
- raw = re.sub(r"```(?:json)?\n?|\n?```", "", raw).strip()
350
- res = json.loads(raw)
351
- if res.get("analysis"):
352
- return res.get("keyword")
353
- except Exception as exc:
354
- logger.warning("Intent detection failed (%s) — heuristic fallback", exc)
355
- for pat in (_COUNT_EN, _COUNT_AR):
356
- m = pat.search(query)
357
- if m:
358
- tail = query[m.end():].strip().split()
359
- if tail:
360
- return tail[-1]
361
  return None
362
 
363
 
@@ -365,6 +469,7 @@ async def detect_analysis_intent(query: str, rewrite: Dict) -> Optional[str]:
365
  # OCCURRENCE ANALYSIS (exact + stemmed matching)
366
  # ═══════════════════════════════════════════════════════════════════════
367
  async def count_occurrences(keyword: str, dataset: list) -> dict:
 
368
  cached = await analysis_cache.get(keyword)
369
  if cached:
370
  return cached
@@ -372,27 +477,41 @@ async def count_occurrences(keyword: str, dataset: list) -> dict:
372
  kw_norm = normalize_arabic(keyword, aggressive=True).lower()
373
  kw_stem = light_stem(kw_norm)
374
  count = 0
 
375
  examples: list = []
376
 
377
  for item in dataset:
378
- ar_norm = normalize_arabic(item.get("arabic", ""), aggressive=True)
 
 
 
379
  combined = f"{ar_norm} {item.get('english', '')}".lower()
380
  exact = combined.count(kw_norm)
381
  stemmed = combined.count(kw_stem) - exact if kw_stem != kw_norm else 0
382
  occ = exact + stemmed
 
383
  if occ > 0:
384
  count += occ
 
 
 
 
 
 
 
 
385
  if len(examples) < cfg.MAX_EXAMPLES:
386
  examples.append({
387
- "arabic": item.get("arabic", ""),
 
388
  "english": item.get("english", ""),
389
- "source": item.get("source") or item.get("reference", ""),
390
  })
391
 
392
  result = {
393
  "keyword": keyword,
394
  "kw_stemmed": kw_stem,
395
  "total_count": count,
 
396
  "examples": examples,
397
  }
398
  await analysis_cache.set(result, keyword)
@@ -400,7 +519,7 @@ async def count_occurrences(keyword: str, dataset: list) -> dict:
400
 
401
 
402
  # ═══════════════════════════════════════════════════════════════════════
403
- # HYBRID SEARCH — dense FAISS + BM25 re-ranking + hadith type-boost
404
  # ═══════════════════════════════════════════════════════════════════════
405
  def _bm25_score(
406
  query_terms: List[str],
@@ -409,6 +528,7 @@ def _bm25_score(
409
  k1: float = 1.5,
410
  b: float = 0.75,
411
  ) -> float:
 
412
  doc_tokens = tokenize_ar(doc_text)
413
  dl = len(doc_tokens)
414
  tf = Counter(doc_tokens)
@@ -426,8 +546,12 @@ async def hybrid_search(
426
  index: faiss.Index,
427
  dataset: list,
428
  top_n: int = cfg.TOP_K_RETURN,
 
 
429
  ) -> list:
430
- cached = await search_cache.get(raw_query, top_n)
 
 
431
  if cached:
432
  return cached
433
 
@@ -444,14 +568,29 @@ async def hybrid_search(
444
 
445
  distances, indices = index.search(fused.reshape(1, -1), cfg.TOP_K_SEARCH)
446
 
447
- # ��─ 2. De-duplicate candidates ─────────────────────────────────────
448
  seen: set = set()
449
  candidates = []
450
  for dist, idx in zip(distances[0], indices[0]):
451
  item_idx = int(idx) // 2
452
  if item_idx not in seen and 0 <= item_idx < len(dataset):
453
  seen.add(item_idx)
454
- candidates.append({**dataset[item_idx], "_dense": float(dist)})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
 
456
  # ── 3. BM25 sparse scoring ─────────────────────────────────────────
457
  query_terms = [
@@ -466,16 +605,32 @@ async def hybrid_search(
466
  doc = c.get("arabic", "") + " " + c.get("english", "")
467
  c["_sparse"] = _bm25_score(query_terms, doc, avg_dl)
468
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  # ── 4. Score fusion ────────────────────────────────────────────────
470
  α = cfg.RERANK_ALPHA
471
- max_sparse = max((c["_sparse"] for c in candidates), default=1.0) or 1.0
472
  intent = rewrite.get("intent", "general")
473
 
 
 
 
 
 
 
474
  for c in candidates:
475
  base_score = α * c["_dense"] + (1 - α) * c["_sparse"] / max_sparse
476
- # ── FIX: boost Hadith items when the query is about a Hadith ──
477
- # This prevents Quran verses from always outranking Hadiths on
478
- # Sunnah-specific queries purely due to embedding distance.
479
  if intent == "hadith" and c.get("type") == "hadith":
480
  base_score += cfg.HADITH_BOOST
481
  c["_score"] = base_score
@@ -483,21 +638,20 @@ async def hybrid_search(
483
  candidates.sort(key=lambda x: x["_score"], reverse=True)
484
  results = candidates[:top_n]
485
 
486
- await search_cache.set(results, raw_query, top_n)
487
  return results
488
 
489
 
490
- def build_context(results: list, intent: str = "general") -> str:
 
491
  lines = []
492
  for i, r in enumerate(results, 1):
493
  source = r.get("source") or r.get("reference") or "Unknown Source"
494
- item_type = (
495
- "Quranic Verse"
496
- if re.search(r"سورة|surah|quran", source, re.I)
497
- else "Hadith"
498
- )
499
  lines.append(
500
- f"[{i}] 📌 {item_type} | {source} | score: {r.get('_score', 0):.3f}\n"
501
  f" Arabic : {r.get('arabic', '')}\n"
502
  f" English: {r.get('english', '')}"
503
  )
@@ -505,57 +659,60 @@ def build_context(results: list, intent: str = "general") -> str:
505
 
506
 
507
  # ═══════════════════════════════════════════════════════════════════════
508
- # PROMPT ENGINEERING — intent-aware, chain-of-thought, few-shot
509
  # ═════════════════════════════════════════════��═════════════════════════
510
  _PERSONA = (
511
- "You are Sheikh QModel, a meticulous Islamic scholar-assistant with deep expertise "
512
- "in Tafsir (Quranic exegesis), Hadith sciences, Fiqh, and Arabic linguistics. "
513
- "You respond with the rigour of a classical scholar and the clarity of a modern educator."
514
  )
515
 
516
  _TASK_INSTRUCTIONS: Dict[str, str] = {
517
  "tafsir": (
518
- "The user asks about a Quranic verse or its interpretation. Steps:\n"
519
- "1. Identify the verse(s) from the context below.\n"
520
- "2. Provide in-depth Tafsir: linguistic analysis, occasion of revelation "
521
- "(Asbab al-Nuzul) if present.\n"
522
- "3. Draw connections to related verses in the context.\n"
523
- "4. Answer the user's specific question directly."
524
  ),
525
  "hadith": (
526
  "The user asks about a Hadith. Steps:\n"
527
- "1. Locate the relevant Hadith(s) ONLY from the context block below.\n"
528
- "2. Quote the Arabic text and English translation EXACTLY as they appear "
529
- "in the context do not alter, complete, or paraphrase the wording.\n"
530
- "3. Elaborate on meaning, legal and spiritual implications (Fiqh / Tarbiya).\n"
531
- "4. Note any related Hadiths present in the context.\n"
532
- "CRITICAL: If the specific Hadith the user mentions is NOT present verbatim "
533
- "in the context, say so clearly. Do NOT reconstruct it from memory."
 
 
 
 
 
 
534
  ),
535
  "fatwa": (
536
- "The user seeks a religious ruling or guidance. Steps:\n"
537
- "1. Gather ALL relevant evidence (Quran + Sunnah) from the context.\n"
538
- "2. Reason step-by-step from the evidence to a conclusion.\n"
539
- "3. If the context is insufficient for a clear ruling, state so explicitly. "
540
- "Do NOT speculate."
541
  ),
542
  "count": (
543
- "The user asks for the frequency or count of a word/name. Steps:\n"
544
- "1. State the ANALYSIS RESULT prominently at the top.\n"
545
- "2. List up to 3 example occurrences from the context with their sources.\n"
546
- "3. Briefly comment on the significance of this repetition."
547
  ),
548
  "general": (
549
  "The user has a general Islamic question. Steps:\n"
550
  "1. Give a direct answer first.\n"
551
- "2. Support with evidence from the context.\n"
552
  "3. Conclude with a summary."
553
  ),
554
  }
555
 
556
- # ── FIX: hardened anti-hallucination rules ────────────────────────────────────
557
  _FORMAT_RULES = """\
558
- For EVERY piece of supporting evidence, use this exact format:
559
 
560
  ┌─────────────────────────────────────────────┐
561
  │ ❝ {Arabic text} ❞
@@ -563,50 +720,14 @@ For EVERY piece of supporting evidence, use this exact format:
563
  │ 📖 Source: {exact citation from context}
564
  └─────────────────────────────────────────────┘
565
 
566
- ABSOLUTE RULES — violations are unacceptable:
567
- • Use ONLY content from the Islamic Context block below. Zero outside knowledge.
568
- • Copy Arabic text and translations VERBATIM from the context. Never paraphrase,
569
- complete, or reconstruct a Hadith or verse from memory.
570
- If a specific Hadith or verse the user asks about is NOT present in the context
571
- block respond ONLY with:
572
- "هذا الحديث/الآية غير موجود في قاعدة البيانات المتاحة. يُرجى التحقق من مصادر موثوقة."
573
- (Arabic query) or:
574
- "This Hadith/verse is not in the available dataset. Please verify with a trusted source."
575
- (English query). Do NOT add anything else.
576
- • Never cite a reference that does not appear in the context block.
577
- • Never invent, guess, or infer content that is not explicitly in the context.
578
- • End every response with:
579
- - Arabic → "والله أعلم."
580
- - English → "And Allah knows best."
581
- """
582
-
583
- # ── FIX: few-shot now includes a "not found" refusal example ─────────────────
584
- _FEW_SHOT = """\
585
- === STRUCTURAL EXAMPLE A — evidence found (mimic structure, do not copy content) ===
586
- Question: What does Islam say about the importance of prayer?
587
-
588
- [Step 1 — Direct Answer]
589
- Prayer (Salah) is one of the Five Pillars and is described in the provided texts
590
- as the first act of worship a Muslim will be accountable for.
591
-
592
- [Step 2 — Supporting Evidence]
593
- ┌─────────────────────────────────────────────┐
594
- │ ❝ أَقِيمُوا الصَّلَاةَ ❞
595
- │ 📝 Translation: Establish prayer.
596
- │ 📖 Source: Surah Al-Baqarah 2:43
597
- └─────────────────────────────────────────────┘
598
-
599
- [Step 3 — Conclusion]
600
- The evidence shows prayer is central to the Muslim's covenant with Allah.
601
- And Allah knows best.
602
-
603
- === STRUCTURAL EXAMPLE B — evidence NOT found (mandatory refusal path) ===
604
- Question: ما أحاديث الصبر الواردة في السنة؟
605
- (No matching Hadith appears in the Islamic Context block)
606
-
607
- هذا الحديث/الآية غير موجود في قاعدة البيانات المتاحة. يُرجى التحقق من مصادر موثوقة.
608
- والله أعلم.
609
- === END EXAMPLES ===\
610
  """
611
 
612
  _SYSTEM_TEMPLATE = """\
@@ -620,9 +741,7 @@ _SYSTEM_TEMPLATE = """\
620
  === OUTPUT FORMAT ===
621
  {fmt}
622
 
623
- {few_shot}
624
-
625
- === ISLAMIC CONTEXT (your ONLY source of truth) ===
626
  {context}
627
  === END CONTEXT ===
628
  """
@@ -635,12 +754,16 @@ def build_messages(
635
  intent: str,
636
  analysis: Optional[dict] = None,
637
  ) -> List[dict]:
 
638
  if analysis:
 
 
 
 
639
  analysis_block = (
640
  f"\n[ANALYSIS RESULT]\n"
641
- f"The keyword «{analysis['keyword']}» "
642
- f"(root form: «{analysis.get('kw_stemmed', '')}») "
643
- f"appears {analysis['total_count']} times in the dataset.\n"
644
  )
645
  context = analysis_block + context
646
 
@@ -649,7 +772,6 @@ def build_messages(
649
  lang_instruction=language_instruction(lang),
650
  task=_TASK_INSTRUCTIONS.get(intent, _TASK_INSTRUCTIONS["general"]),
651
  fmt=_FORMAT_RULES,
652
- few_shot=_FEW_SHOT,
653
  context=context,
654
  )
655
 
@@ -664,28 +786,59 @@ def build_messages(
664
  ]
665
 
666
 
667
- # ═══════════════════════════════════════════════════════════════════════
668
- # SAFE "NOT FOUND" FALLBACK ANSWER
669
- # ═══════════════════════════════════════════════════════════════════════
670
  def _not_found_answer(lang: str) -> str:
671
- """
672
- Returned instead of calling the LLM when retrieval confidence is too low.
673
- Prevents hallucination on queries where the dataset has no relevant content.
674
- """
675
  if lang == "arabic":
676
  return (
677
- "لم أجد في قاعدة البيانات المتاحة ما يكفي للإجابة على هذا السؤال بدقة.\n"
678
- "يُرجى الرجوع إلى ��صادر إسلامية موثوقة للتحقق من المعلومات.\n"
679
  "والله أعلم."
680
  )
681
  return (
682
  "The available dataset does not contain sufficient information to answer "
683
- "this question accurately.\n"
684
- "Please refer to trusted Islamic sources to verify.\n"
685
  "And Allah knows best."
686
  )
687
 
688
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
689
  # ═══════════════════════════════════════════════════════════════════════
690
  # APP STATE
691
  # ═══════════════════════════════════════════════════════════════════════
@@ -693,6 +846,7 @@ class AppState:
693
  embed_model: Optional[SentenceTransformer] = None
694
  faiss_index: Optional[faiss.Index] = None
695
  dataset: Optional[list] = None
 
696
  ready: bool = False
697
 
698
 
@@ -701,6 +855,7 @@ state = AppState()
701
 
702
  @asynccontextmanager
703
  async def lifespan(app: FastAPI):
 
704
  logger.info("⏳ Loading embed model: %s", cfg.EMBED_MODEL)
705
  state.embed_model = SentenceTransformer(cfg.EMBED_MODEL)
706
 
@@ -711,20 +866,19 @@ async def lifespan(app: FastAPI):
711
  with open(cfg.METADATA_FILE, "r", encoding="utf-8") as f:
712
  state.dataset = json.load(f)
713
 
714
- primary = cfg.LLM_MODEL
715
- try:
716
- client = ollama.Client(host=cfg.OLLAMA_HOST)
717
- client.chat(model=primary, messages=[{"role": "user", "content": "ping"}])
718
- logger.info("✅ Primary Ollama model reachable: %s", primary)
719
- except Exception as exc:
720
- logger.warning(
721
- "Primary model %s not reachable (%s). Will use fallback chain.", primary, exc
722
- )
723
 
724
  state.ready = True
725
  logger.info(
726
- "✅ QModel v3.1 ready | dataset=%d | faiss=%d | confidence_threshold=%.2f",
727
- len(state.dataset), state.faiss_index.ntotal, cfg.CONFIDENCE_THRESHOLD,
 
 
 
728
  )
729
  yield
730
  state.ready = False
@@ -735,9 +889,9 @@ async def lifespan(app: FastAPI):
735
  # FASTAPI APP
736
  # ═══════════════════════════════════════════════════════════════════════
737
  app = FastAPI(
738
- title="QModel v3.1 — Islamic RAG API",
739
- description="High-fidelity Retrieval-Augmented Generation over Qur'an & Sunnah",
740
- version="3.1.0",
741
  lifespan=lifespan,
742
  )
743
 
@@ -758,47 +912,108 @@ class ChatMessage(BaseModel):
758
  content: str = Field(..., min_length=1, max_length=4000)
759
 
760
 
761
- class ChatCompletionRequest(BaseModel):
762
- model: str = "QModel"
763
- messages: List[ChatMessage]
764
- temperature: Optional[float] = Field(cfg.TEMPERATURE, ge=0.0, le=2.0)
765
- max_tokens: Optional[int] = Field(cfg.MAX_TOKENS, ge=1, le=8192)
766
- stream: Optional[bool] = False
767
- top_k: Optional[int] = Field(cfg.TOP_K_RETURN, ge=1, le=20)
768
-
769
- @validator("messages")
770
- def has_user_message(cls, v):
771
- if not any(m.role == "user" for m in v):
772
- raise ValueError("At least one user message is required")
773
- return v
774
-
775
-
776
  class AnalysisResult(BaseModel):
777
  keyword: str
778
  kw_stemmed: str
779
  total_count: int
 
780
  examples: List[dict]
781
 
782
 
 
 
 
 
 
 
 
 
 
783
  class AskResponse(BaseModel):
784
- question: str
785
- answer: str
786
- language: str
787
- intent: str
788
- analysis: Optional[AnalysisResult] = None
789
- sources: List[dict]
790
- top_score: float # FIX: expose top retrieval score for caller transparency
791
- latency_ms: int
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
792
 
793
 
794
  # ═══════════════════════════════════════════════════════════════════════
795
- # CORE ASYNC RAG PIPELINE
796
  # ═══════════════════════════════════════════════════════════════════════
797
- async def run_rag_pipeline(question: str, top_k: int = cfg.TOP_K_RETURN) -> dict:
 
 
 
 
 
 
798
  t0 = time.perf_counter()
799
 
800
  # 1. Query rewriting
801
- rewrite = await rewrite_query(question)
802
  intent = rewrite.get("intent", "general")
803
 
804
  # 2. Intent detection + hybrid search — concurrently
@@ -807,7 +1022,7 @@ async def run_rag_pipeline(question: str, top_k: int = cfg.TOP_K_RETURN) -> dict
807
  hybrid_search(
808
  question, rewrite,
809
  state.embed_model, state.faiss_index, state.dataset,
810
- top_k,
811
  ),
812
  )
813
  analysis_kw, results = await asyncio.gather(kw_task, search_task)
@@ -827,14 +1042,10 @@ async def run_rag_pipeline(question: str, top_k: int = cfg.TOP_K_RETURN) -> dict
827
  intent, top_score, cfg.CONFIDENCE_THRESHOLD,
828
  )
829
 
830
- # ── FIX: confidence gate ───────────────────────────────────────────
831
- # If the best retrieved result is below the threshold, skip the LLM
832
- # entirely and return a safe "not in dataset" answer.
833
- # This is the primary defence against hallucination on Hadith queries
834
- # where the dataset has no matching content.
835
  if top_score < cfg.CONFIDENCE_THRESHOLD:
836
  logger.warning(
837
- "Low confidence (%.3f < %.2f) — returning safe fallback, skipping LLM",
838
  top_score, cfg.CONFIDENCE_THRESHOLD,
839
  )
840
  return {
@@ -847,24 +1058,19 @@ async def run_rag_pipeline(question: str, top_k: int = cfg.TOP_K_RETURN) -> dict
847
  "latency_ms": int((time.perf_counter() - t0) * 1000),
848
  }
849
 
850
- # 5. Build context + prompt
851
- context = build_context(results, intent)
852
  messages = build_messages(context, question, lang, intent, analysis)
853
 
854
- # 6. LLM call (sync client → threadpool)
855
- loop = asyncio.get_event_loop()
856
  try:
857
- answer = await loop.run_in_executor(
858
- None,
859
- lambda: chat_with_fallback(
860
- messages,
861
- max_tokens=cfg.MAX_TOKENS,
862
- temperature=cfg.TEMPERATURE,
863
- ),
864
  )
865
- except RuntimeError as exc:
866
- logger.error("All LLM models failed: %s", exc)
867
- raise HTTPException(status_code=502, detail=str(exc))
868
 
869
  latency = int((time.perf_counter() - t0) * 1000)
870
  logger.info(
@@ -896,38 +1102,219 @@ def _check_ready():
896
  # ═══════════════════════════════════════════════════════════════════════
897
  @app.get("/health", tags=["ops"])
898
  def health():
 
899
  return {
900
  "status": "ok" if state.ready else "initialising",
901
- "version": "3.1.0",
 
902
  "dataset_size": len(state.dataset) if state.dataset else 0,
903
  "faiss_total": state.faiss_index.ntotal if state.faiss_index else 0,
904
  "confidence_threshold": cfg.CONFIDENCE_THRESHOLD,
905
- "hadith_boost": cfg.HADITH_BOOST,
906
  }
907
 
908
 
909
- @app.get("/v1/models", tags=["models"])
910
  def list_models():
911
- return {
912
- "object": "list",
913
- "data": [{
914
- "id": "QModel",
915
- "object": "model",
916
- "created": int(time.time()),
917
- "owned_by": "elgendy",
918
- "description": "Islamic RAG over Qur'an & Sunnah (v3.1)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
919
  }],
920
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
921
 
922
 
923
  @app.get("/debug/scores", tags=["ops"])
924
  async def debug_scores(
925
- q: str = Query(..., min_length=1, max_length=1000),
926
  top_k: int = Query(10, ge=1, le=20),
927
  ):
928
- """Returns raw retrieval scores without calling the LLM. Use to calibrate CONFIDENCE_THRESHOLD."""
929
  _check_ready()
930
- rewrite = await rewrite_query(q)
931
  results = await hybrid_search(q, rewrite, state.embed_model, state.faiss_index, state.dataset, top_k)
932
  return {
933
  "intent": rewrite.get("intent"),
@@ -937,54 +1324,16 @@ async def debug_scores(
937
  "rank": i + 1,
938
  "source": r.get("source") or r.get("reference"),
939
  "type": r.get("type"),
 
940
  "_dense": round(r.get("_dense", 0), 4),
941
  "_sparse": round(r.get("_sparse", 0), 4),
942
  "_score": round(r.get("_score", 0), 4),
943
- "snippet": r.get("english", "")[:80],
944
  }
945
  for i, r in enumerate(results)
946
  ],
947
  }
948
 
949
 
950
- @app.get("/ask", response_model=AskResponse, tags=["inference"])
951
- async def ask(
952
- q: str = Query(..., min_length=1, max_length=1000, description="Your Islamic question"),
953
- top_k: int = Query(cfg.TOP_K_RETURN, ge=1, le=20, description="Sources to retrieve"),
954
- ):
955
- _check_ready()
956
- result = await run_rag_pipeline(q, top_k=top_k)
957
- return AskResponse(question=q, **result)
958
-
959
-
960
- @app.post("/v1/chat/completions", tags=["inference"])
961
- async def chat_completions(req: ChatCompletionRequest):
962
- _check_ready()
963
- user_msgs = [m.content for m in req.messages if m.role == "user"]
964
- question = user_msgs[-1]
965
- result = await run_rag_pipeline(question, top_k=req.top_k or cfg.TOP_K_RETURN)
966
-
967
- return {
968
- "id": f"chatcmpl-{int(time.time())}",
969
- "object": "chat.completion",
970
- "created": int(time.time()),
971
- "model": req.model,
972
- "choices": [{
973
- "index": 0,
974
- "message": {"role": "assistant", "content": result["answer"]},
975
- "finish_reason": "stop",
976
- }],
977
- "usage": {
978
- "prompt_tokens": -1,
979
- "completion_tokens": -1,
980
- "total_tokens": -1,
981
- },
982
- "x_metadata": {
983
- "language": result["language"],
984
- "intent": result["intent"],
985
- "top_score": result["top_score"],
986
- "latency_ms": result["latency_ms"],
987
- "sources_count": len(result["sources"]),
988
- "analysis": result["analysis"],
989
- },
990
- }
 
1
  """
2
+ QModel v4 — Islamic RAG API
3
+ ===========================
4
+ Specialized Quran & Hadith system with dual LLM backend support.
5
+
6
+ Features:
7
+ Dual backend: Hugging Face (transformers) + Ollama
8
+ Grade filtering: Return only Sahih/Hasan Hadiths
9
+ Source filtering: Quran-only or Hadith-only queries
10
+ Hadith verification: Quick auth check endpoint
11
+ Word frequency: Enhanced with Surah grouping
12
+ No hallucinations: Confidence gating + few-shot anti-hallucination
13
+ • Arabic & English: Full bilingual support with proper normalization
14
+
15
+ Configuration via .env:
16
+ LLM_BACKEND=hf|ollama (default: ollama)
17
+ HF_MODEL_NAME=<hf-model-id> (e.g. gpt2, default: Qwen/Qwen2-7B-Instruct)
18
+ OLLAMA_HOST=<url> (e.g. http://localhost:11434, default: http://localhost:11434)
19
+ OLLAMA_MODEL=<model> (e.g. llama2, default: llama2)
20
+ EMBED_MODEL=intfloat/multilingual-e5-large (embedding model)
21
  """
22
 
23
  from __future__ import annotations
 
31
  import time
32
  from collections import Counter, OrderedDict
33
  from contextlib import asynccontextmanager
34
+ from typing import Dict, List, Literal, Optional
35
 
36
  import faiss
37
  import numpy as np
38
  from dotenv import load_dotenv
39
  from fastapi import FastAPI, HTTPException, Query
40
  from fastapi.middleware.cors import CORSMiddleware
41
+ from fastapi.responses import StreamingResponse
42
  from pydantic import BaseModel, Field, validator
43
  from sentence_transformers import SentenceTransformer
44
 
 
55
 
56
 
57
  # ═══════════════════════════════════════════════════════════════════════
58
+ # CONFIG & LLM FACTORY
59
  # ═══════════════════════════════════════════════════════════════════════
60
class Config:
    """Centralized configuration with dual LLM backend support.

    Every value is read from the environment exactly once, when the class
    body executes at import time; the second argument of each ``os.getenv``
    call is the fallback used when the variable is absent.
    """

    # Which LLM provider to use: "hf" (transformers) or "ollama".
    LLM_BACKEND: str = os.getenv("LLM_BACKEND", "ollama")

    # Hugging Face backend settings.
    HF_MODEL_NAME: str = os.getenv("HF_MODEL_NAME", "Qwen/Qwen2-7B-Instruct")
    HF_DEVICE: str = os.getenv("HF_DEVICE", "auto")
    HF_MAX_NEW_TOKENS: int = int(os.getenv("HF_MAX_NEW_TOKENS", "2048"))

    # Ollama backend settings.
    OLLAMA_HOST: str = os.getenv("OLLAMA_HOST", "http://localhost:11434")
    OLLAMA_MODEL: str = os.getenv("OLLAMA_MODEL", "llama2")

    # Sentence-embedding model shared by both backends.
    EMBED_MODEL: str = os.getenv("EMBED_MODEL", "intfloat/multilingual-e5-large")

    # On-disk FAISS index and its accompanying metadata file.
    FAISS_INDEX: str = os.getenv("FAISS_INDEX", "QModel.index")
    METADATA_FILE: str = os.getenv("METADATA_FILE", "metadata.json")

    # Retrieval: candidate-pool size vs. final result count.
    TOP_K_SEARCH: int = int(os.getenv("TOP_K_SEARCH", "20"))
    TOP_K_RETURN: int = int(os.getenv("TOP_K_RETURN", "5"))

    # Generation parameters forwarded to the LLM.
    TEMPERATURE: float = float(os.getenv("TEMPERATURE", "0.2"))
    MAX_TOKENS: int = int(os.getenv("MAX_TOKENS", "2048"))

    # TTL-cache sizing (max entries / seconds-to-live).
    CACHE_SIZE: int = int(os.getenv("CACHE_SIZE", "512"))
    CACHE_TTL: int = int(os.getenv("CACHE_TTL", "3600"))

    # Score fusion: weight of the dense (FAISS) score; the remainder goes
    # to the sparse (BM25) score. HADITH_BOOST is an additive bonus.
    RERANK_ALPHA: float = float(os.getenv("RERANK_ALPHA", "0.6"))
    HADITH_BOOST: float = float(os.getenv("HADITH_BOOST", "0.08"))

    # Retrieval confidence below this threshold skips the LLM entirely.
    CONFIDENCE_THRESHOLD: float = float(os.getenv("CONFIDENCE_THRESHOLD", "0.30"))

    # CORS allowed origins.
    ALLOWED_ORIGINS: str = os.getenv("ALLOWED_ORIGINS", "*")

    # Maximum example occurrences returned by the frequency analysis.
    MAX_EXAMPLES: int = int(os.getenv("MAX_EXAMPLES", "3"))


cfg = Config()
108
 
109
+
110
+ # ═══════════════════════════════════════════════════════════════════════
111
+ # LLM ABSTRACTION LAYER
112
+ # ═══════════════════════════════════════════════════════════════════════
113
class LLMProvider:
    """Minimal interface that every LLM backend must implement.

    Concrete providers (Ollama, Hugging Face) override :meth:`chat`; the
    base implementation only documents the contract and always raises.
    """

    async def chat(
        self,
        messages: List[dict],
        temperature: float,
        max_tokens: int,
    ) -> str:
        """Send a chat transcript to the model and return its reply text."""
        raise NotImplementedError
120
+
121
+
122
class OllamaProvider(LLMProvider):
    """Ollama-based LLM provider.

    Wraps the synchronous ``ollama`` client and runs each request in the
    default executor so the event loop is never blocked.
    """

    def __init__(self, host: str, model: str):
        """Create a client bound to *host* for the given *model* name.

        Raises:
            ImportError: if the ``ollama`` package is not installed.
        """
        self.host = host
        self.model = model
        try:
            import ollama
            self.client = ollama.Client(host=host)
        except ImportError as exc:
            # FIX: chain the original error so the install hint does not
            # hide the real import failure.
            raise ImportError("Install ollama: pip install ollama") from exc

    async def chat(
        self, messages: List[dict], temperature: float, max_tokens: int
    ) -> str:
        """Run one chat completion and return the stripped reply text.

        Raises:
            Exception: whatever the underlying client raises (logged first).
        """
        # FIX: asyncio.get_event_loop() is deprecated inside coroutines
        # (3.10+); get_running_loop() is the correct call here since chat()
        # is always awaited from a running loop.
        loop = asyncio.get_running_loop()
        try:
            result = await loop.run_in_executor(
                None,
                lambda: self.client.chat(
                    model=self.model,
                    messages=messages,
                    options={"temperature": temperature, "num_predict": max_tokens},
                ),
            )
            return result["message"]["content"].strip()
        except Exception as exc:
            logger.error("Ollama chat failed: %s", exc)
            raise
151
+
152
+
153
class HuggingFaceProvider(LLMProvider):
    """Hugging Face transformers-based LLM provider.

    Loads the model once at construction time and serves generations
    through a text-generation pipeline run in the default executor.
    """

    def __init__(self, model_name: str, device: str):
        """Load *model_name*, placing weights according to *device*.

        Args:
            model_name: HF hub model id (e.g. "Qwen/Qwen2-7B-Instruct").
            device: value passed to ``device_map`` ("auto", "cuda", "cpu").

        Raises:
            ImportError: if transformers/torch are not installed.
        """
        self.model_name = model_name
        self.device = device
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map=device,
                torch_dtype="auto",
            )
            # FIX: the model is already placed by `device_map` above.
            # Passing an explicit `device` to the pipeline as well makes
            # transformers try to move an accelerate-dispatched model,
            # which fails; let the pipeline use the model as loaded.
            self.pipeline = TextGenerationPipeline(
                model=self.model,
                tokenizer=self.tokenizer,
            )
        except ImportError as exc:
            # Chain the original error so the install hint does not hide it.
            raise ImportError("Install transformers: pip install transformers torch") from exc

    async def chat(
        self, messages: List[dict], temperature: float, max_tokens: int
    ) -> str:
        """Generate a completion for the chat transcript.

        Returns the generated text with the prompt prefix removed.
        """
        # Flatten the chat transcript into a single prompt string.
        prompt = self._format_messages(messages)

        # FIX: get_event_loop() is deprecated inside coroutines (3.10+).
        loop = asyncio.get_running_loop()
        do_sample = temperature > 0
        gen_kwargs = {"max_new_tokens": max_tokens, "do_sample": do_sample}
        if do_sample:
            # FIX: only forward temperature when sampling — transformers
            # warns about (and may reject) temperature with do_sample=False.
            gen_kwargs["temperature"] = temperature
        try:
            result = await loop.run_in_executor(
                None,
                lambda: self.pipeline(prompt, **gen_kwargs),
            )
            # The pipeline echoes the prompt; strip it from the output.
            generated = result[0]["generated_text"]
            return generated[len(prompt):].strip()
        except Exception as exc:
            logger.error("HF chat failed: %s", exc)
            raise

    def _format_messages(self, messages: List[dict]) -> str:
        """Render OpenAI-style messages as a plain chat-transcript prompt."""
        prompt = ""
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if role == "system":
                prompt += f"{content}\n\n"
            elif role == "user":
                prompt += f"User: {content}\n"
            elif role == "assistant":
                prompt += f"Assistant: {content}\n"
        # Leave a trailing cue so the model continues as the assistant.
        prompt += "Assistant: "
        return prompt
215
+
216
+
217
def get_llm_provider() -> LLMProvider:
    """Factory: build the LLM provider selected by ``cfg.LLM_BACKEND``.

    Returns:
        A ready-to-use :class:`LLMProvider` instance.

    Raises:
        ValueError: if ``LLM_BACKEND`` names an unknown backend.
    """
    # FIX: tolerate case/whitespace variants ("Ollama", " hf ") that come
    # from hand-edited .env files instead of rejecting them outright.
    backend = cfg.LLM_BACKEND.strip().lower()
    if backend == "ollama":
        logger.info("Using Ollama backend: %s @ %s", cfg.OLLAMA_MODEL, cfg.OLLAMA_HOST)
        return OllamaProvider(cfg.OLLAMA_HOST, cfg.OLLAMA_MODEL)
    if backend == "hf":
        logger.info("Using HuggingFace backend: %s on %s", cfg.HF_MODEL_NAME, cfg.HF_DEVICE)
        return HuggingFaceProvider(cfg.HF_MODEL_NAME, cfg.HF_DEVICE)
    raise ValueError(f"Unknown LLM_BACKEND: {cfg.LLM_BACKEND}")
227
 
228
 
229
  # ═══════════════════════════════════════════════════════════════════════
 
267
  rewrite_cache = TTLCache(maxsize=cfg.CACHE_SIZE, ttl=cfg.CACHE_TTL * 6)
268
 
269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  # ═══════════════════════════════════════════════════════════════════════
271
  # ARABIC NLP — normalisation + light stemming
272
  # ═══════════════════════════════════════════════════════════════════════
273
+ _DIACRITICS = re.compile(r"[\u064B-\u0655\u0656-\u0658\u0670\u0671\u06D6-\u06ED]")
274
  _ALEF_VARS = re.compile(r"[أإآٱ]")
275
  _WAW_HAMZA = re.compile(r"ؤ")
276
  _YA_HAMZA = re.compile(r"ئ")
 
285
  "قران": "قرآن",
286
  "القران": "القرآن",
287
  "اللہ": "الله",
 
 
288
  }
289
 
290
 
291
  def normalize_arabic(text: str, *, aggressive: bool = False) -> str:
292
+ """Normalize Arabic text: diacritics, hamza, ta marbuta, etc."""
293
  text = _DIACRITICS.sub("", text)
294
  text = _TATWEEL.sub("", text)
295
  text = _ALEF_VARS.sub("ا", text)
 
314
 
315
 
316
def light_stem(word: str) -> str:
    """Strip common Arabic prefixes then suffixes from *word*.

    If stripping leaves fewer than two characters the result is considered
    over-stemmed and the original word is returned unchanged.
    """
    stripped = _AR_SUFFIXES.sub("", _AR_PREFIXES.sub("", word))
    if len(stripped) < 2:
        # Over-stemmed to (almost) nothing — keep the input as-is.
        return word
    return stripped
321
 
322
 
323
def tokenize_ar(text: str) -> List[str]:
    """Normalize, lowercase, whitespace-split, and light-stem *text*."""
    normalized = normalize_arabic(text, aggressive=True).lower()
    return [light_stem(token) for token in filter(None, normalized.split())]
327
 
 
334
  )
335
 
336
 
337
+ def detect_language(text: str) -> Literal["arabic", "english", "mixed"]:
338
+ """Detect if text is Arabic, English, or mixed."""
339
  ar = len(_ARABIC_SCRIPT.findall(text))
340
  en = len(re.findall(r"[a-zA-Z]", text))
341
  tot = ar + en or 1
 
348
 
349
 
350
  def language_instruction(lang: str) -> str:
351
+ """Generate language-specific instruction for LLM."""
352
  return {
353
  "arabic": (
354
  "يجب أن تكون الإجابة كاملةً باللغة العربية الفصحى تماماً. "
 
374
  "ar_query": "<query in clear Arabic فصحى, ≤25 words>",
375
  "en_query": "<query in clear English, ≤25 words>",
376
  "keywords": ["<3-7 key Arabic or English terms from the question>"],
377
+ "intent": "<one of: fatwa | tafsir | hadith | count | auth | general>"
378
  }
379
 
380
+ Intent Detection Rules (CRITICAL):
381
+ - 'count' intent = asking for number/frequency (كم مرة, how many times, count occurrences)
382
+ - 'auth' intent = asking about authenticity (صحيح؟, هل صحيح, is it authentic, verify hadith grade)
383
+ - 'hadith' intent = asking about specific hadith meaning/text (not authenticity)
384
+ - 'tafsir' intent = asking about Quranic verses or Islamic ruling (fatwa)
385
+ - 'general' intent = other questions
386
+
387
+ Examples:
388
+ - "كم مرة ذُكرت كلمة مريم" → intent: count
389
+ - "هل حديث إنما الأعمال بالنيات صحيح" → intent: auth (asking if authentic!)
390
+ - "ما معنى حديث إنما الأعمال" → intent: hadith
391
+ - "ما حكم الربا في الإسلام" → intent: fatwa
392
  """
393
 
394
 
395
+ async def rewrite_query(raw: str, llm: LLMProvider) -> Dict:
396
+ """Rewrite query for better retrieval."""
397
  cached = await rewrite_cache.get(raw)
398
  if cached:
399
  return cached
 
405
  "intent": "general",
406
  }
407
  try:
408
+ text = await llm.chat(
409
  messages=[
410
  {"role": "system", "content": _REWRITE_SYSTEM},
411
  {"role": "user", "content": raw},
 
418
  for k in ("ar_query", "en_query", "keywords", "intent"):
419
  result.setdefault(k, fallback[k])
420
  await rewrite_cache.set(result, raw)
421
+ logger.info("Rewrite: intent=%s ar=%s", result["intent"], result["ar_query"][:60])
 
 
422
  return result
423
  except Exception as exc:
424
  logger.warning("Query rewrite failed (%s) — using fallback", exc)
 
426
 
427
 
428
  # ═══════════════════════════════════════════════════════════════════════
429
+ # INTENT DETECTION (frequency / count queries / hadith auth)
430
  # ═══════════════════════════════════════════════════════════════════════
431
  _COUNT_EN = re.compile(
432
  r"\b(how many|count|number of|frequency|occurrences? of|how often|"
 
437
  r"(كم مرة|كم عدد|كم تكرر|عدد مرات|تكرار|كم ذُكر|كم وردت?)"
438
  )
439
 
440
# Authenticity-question detectors (English / Arabic): match wording that asks
# whether a hadith is sahih/hasan/weak or requests its grade.
# NOTE(review): not referenced in this chunk — presumably consumed by intent
# detection elsewhere in the file; confirm.
_AUTH_EN = re.compile(
    r"\b(authentic|is.*authentic|authenticity|sahih|hasan|weak|daif|verify)\b",
    re.I,
)
_AUTH_AR = re.compile(
    r"(صحيح|حسن|ضعيف|درجة|صحة|تصحيح|هل.*صحيح|هل.*ضعيف)"
)
447
 
448
 
449
async def detect_analysis_intent(query: str, rewrite: Dict) -> Optional[str]:
    """Return the keyword to count when the query asks for word frequency.

    The LLM rewrite's verdict (intent == "count") is trusted first; otherwise
    the count-phrase regexes are used as a heuristic, taking the first token
    after the matched phrase. Returns None for non-frequency questions.
    """
    # Rewriter verdict takes priority over the regex fallback.
    if rewrite.get("intent") == "count":
        keywords = rewrite.get("keywords", [])
        if keywords:
            return keywords[0]
        return None

    matched = _COUNT_EN.search(query) or _COUNT_AR.search(query)
    if matched is None:
        return None

    # Heuristic: first token following the count phrase ("how many ...").
    for pattern in (_COUNT_EN, _COUNT_AR):
        hit = pattern.search(query)
        if hit is None:
            continue
        remainder = query[hit.end():].strip().split()
        if remainder:
            return remainder[0]
    return None
466
 
467
 
 
469
  # OCCURRENCE ANALYSIS (exact + stemmed matching)
470
  # ═══════════════════════════════════════════════════════════════════════
471
  async def count_occurrences(keyword: str, dataset: list) -> dict:
472
+ """Count keyword occurrences with Surah grouping."""
473
  cached = await analysis_cache.get(keyword)
474
  if cached:
475
  return cached
 
477
  kw_norm = normalize_arabic(keyword, aggressive=True).lower()
478
  kw_stem = light_stem(kw_norm)
479
  count = 0
480
+ by_surah: Dict[int, Dict] = {}
481
  examples: list = []
482
 
483
  for item in dataset:
484
+ if item.get("type") != "quran":
485
+ continue
486
+
487
+ ar_norm = normalize_arabic(item.get("arabic", ""), aggressive=True).lower()
488
  combined = f"{ar_norm} {item.get('english', '')}".lower()
489
  exact = combined.count(kw_norm)
490
  stemmed = combined.count(kw_stem) - exact if kw_stem != kw_norm else 0
491
  occ = exact + stemmed
492
+
493
  if occ > 0:
494
  count += occ
495
+ surah_num = item.get("surah_number", 0)
496
+ if surah_num not in by_surah:
497
+ by_surah[surah_num] = {
498
+ "name": item.get("surah_name_en", f"Surah {surah_num}"),
499
+ "count": 0,
500
+ }
501
+ by_surah[surah_num]["count"] += occ
502
+
503
  if len(examples) < cfg.MAX_EXAMPLES:
504
  examples.append({
505
+ "reference": item.get("source", ""),
506
+ "arabic": item.get("arabic", ""),
507
  "english": item.get("english", ""),
 
508
  })
509
 
510
  result = {
511
  "keyword": keyword,
512
  "kw_stemmed": kw_stem,
513
  "total_count": count,
514
+ "by_surah": dict(sorted(by_surah.items())),
515
  "examples": examples,
516
  }
517
  await analysis_cache.set(result, keyword)
 
519
 
520
 
521
  # ═══════════════════════════════════════════════════════════════════════
522
+ # HYBRID SEARCH — dense FAISS + BM25 re-ranking + filtering
523
  # ═══════════════════════════════════════════════════════════════════════
524
  def _bm25_score(
525
  query_terms: List[str],
 
528
  k1: float = 1.5,
529
  b: float = 0.75,
530
  ) -> float:
531
+ """BM25 term-frequency scoring."""
532
  doc_tokens = tokenize_ar(doc_text)
533
  dl = len(doc_tokens)
534
  tf = Counter(doc_tokens)
 
546
  index: faiss.Index,
547
  dataset: list,
548
  top_n: int = cfg.TOP_K_RETURN,
549
+ source_type: Optional[Literal["quran", "hadith"]] = None,
550
+ grade_filter: Optional[str] = None,
551
  ) -> list:
552
+ """Hybrid search: dense + sparse with optional filtering."""
553
+ cache_key = (raw_query, top_n, source_type, grade_filter)
554
+ cached = await search_cache.get(*cache_key)
555
  if cached:
556
  return cached
557
 
 
568
 
569
  distances, indices = index.search(fused.reshape(1, -1), cfg.TOP_K_SEARCH)
570
 
571
+ # ─ 2. De-duplicate candidates & apply filters ─────────────────────
572
  seen: set = set()
573
  candidates = []
574
  for dist, idx in zip(distances[0], indices[0]):
575
  item_idx = int(idx) // 2
576
  if item_idx not in seen and 0 <= item_idx < len(dataset):
577
  seen.add(item_idx)
578
+ item = dataset[item_idx]
579
+
580
+ # Source type filter
581
+ if source_type and item.get("type") != source_type:
582
+ continue
583
+
584
+ # Grade filter (Hadith only)
585
+ if grade_filter and item.get("type") == "hadith":
586
+ item_grade = item.get("grade", "").lower()
587
+ if grade_filter.lower() not in item_grade:
588
+ continue
589
+
590
+ candidates.append({**item, "_dense": float(dist)})
591
+
592
+ if not candidates:
593
+ return []
594
 
595
  # ── 3. BM25 sparse scoring ─────────────────────────────────────────
596
  query_terms = [
 
605
  doc = c.get("arabic", "") + " " + c.get("english", "")
606
  c["_sparse"] = _bm25_score(query_terms, doc, avg_dl)
607
 
608
+ # ── 3.5. Phrase matching boost for exact snippets ───────────────────
609
+ query_norm = normalize_arabic(raw_query, aggressive=False).lower()
610
+ for c in candidates:
611
+ # For hadiths: if query contains specific text, boost exact match
612
+ if c.get("type") == "hadith":
613
+ ar_norm = normalize_arabic(c.get("arabic", ""), aggressive=False).lower()
614
+ # Check if any significant phrase (3+ words) from query appears in hadith
615
+ query_fragments = query_norm.split()
616
+ for i in range(len(query_fragments) - 2):
617
+ phrase = " ".join(query_fragments[i:i+3])
618
+ if len(phrase) > 5 and phrase in ar_norm: # phrase is 5+ chars
619
+ c["_sparse"] += 2.0 # boost exact phrase match
620
+ break
621
+
622
  # ── 4. Score fusion ────────────────────────────────────────────────
623
  α = cfg.RERANK_ALPHA
 
624
  intent = rewrite.get("intent", "general")
625
 
626
+ # For hadith authenticity queries, rely more on semantic search
627
+ if intent == "auth":
628
+ α = 0.75 # 75% dense, 25% sparse (vs default 60/40)
629
+
630
+ max_sparse = max((c["_sparse"] for c in candidates), default=1.0) or 1.0
631
+
632
  for c in candidates:
633
  base_score = α * c["_dense"] + (1 - α) * c["_sparse"] / max_sparse
 
 
 
634
  if intent == "hadith" and c.get("type") == "hadith":
635
  base_score += cfg.HADITH_BOOST
636
  c["_score"] = base_score
 
638
  candidates.sort(key=lambda x: x["_score"], reverse=True)
639
  results = candidates[:top_n]
640
 
641
+ await search_cache.set(results, *cache_key)
642
  return results
643
 
644
 
645
+ def build_context(results: list) -> str:
646
+ """Format search results into context block for LLM."""
647
  lines = []
648
  for i, r in enumerate(results, 1):
649
  source = r.get("source") or r.get("reference") or "Unknown Source"
650
+ item_type = "Quranic Verse" if r.get("type") == "quran" else "Hadith"
651
+ grade_str = f" [Grade: {r.get('grade')}]" if r.get("grade") else ""
652
+
 
 
653
  lines.append(
654
+ f"[{i}] 📌 {item_type}{grade_str} | {source} | score: {r.get('_score', 0):.3f}\n"
655
  f" Arabic : {r.get('arabic', '')}\n"
656
  f" English: {r.get('english', '')}"
657
  )
 
659
 
660
 
661
  # ═══════════════════════════════════════════════════════════════════════
662
+ # PROMPT ENGINEERING
663
  # ═════════════════════════════════════════════��═════════════════════════
664
# Scholar persona injected at the top of every system prompt.
_PERSONA = (
    "You are Sheikh QModel, a meticulous Islamic scholar with expertise "
    "in Tafsir (Quranic exegesis), Hadith sciences, Fiqh, and Arabic. "
    "You respond with scholarly rigor and modern clarity."
)
669
 
670
# Per-intent task instructions appended to the system prompt; keys mirror the
# intents produced by the query rewriter (tafsir/hadith/auth/fatwa/count/general).
_TASK_INSTRUCTIONS: Dict[str, str] = {
    "tafsir": (
        "The user asks about a Quranic verse. Steps:\n"
        "1. Identify the verse(s) from context.\n"
        "2. Provide Tafsir: linguistic analysis and deeper meaning.\n"
        "3. Draw connections to related verses.\n"
        "4. Answer the user's question directly."
    ),
    "hadith": (
        "The user asks about a Hadith. Steps:\n"
        "1. Quote the text EXACTLY from the context below.\n"
        "2. Explain the meaning and implications.\n"
        "3. Note any related Hadiths.\n"
        "CRITICAL: If the Hadith is NOT in context, say so clearly."
    ),
    "auth": (
        "The user asks about Hadith authenticity. YOU MUST:\n"
        "1. Check if the Hadith is in the context below.\n"
        "2. If FOUND, state the grade (Sahih, Hasan, Da'if, etc.) confidently.\n"
        "3. If found in Sahih Bukhari or Sahih Muslim, assert it is AUTHENTIC (Sahih).\n"
        "4. Provide the Hadith text from context and explain its authenticity basis.\n"
        "5. If NOT found after careful search, clearly state it's absent from the dataset.\n"
        "CRITICAL: Use the context provided. Do not rely on your training data."
    ),
    "fatwa": (
        "The user seeks a religious ruling. Steps:\n"
        "1. Gather evidence from Quran + Sunnah in context.\n"
        "2. Reason step-by-step to a conclusion.\n"
        "3. If insufficient, state so explicitly."
    ),
    "count": (
        "The user asks for word frequency. Steps:\n"
        "1. State the ANALYSIS RESULT prominently.\n"
        "2. List example occurrences with Surah names.\n"
        "3. Comment on significance."
    ),
    "general": (
        "The user has a general Islamic question. Steps:\n"
        "1. Give a direct answer first.\n"
        "2. Support with evidence from context.\n"
        "3. Conclude with a summary."
    ),
}
  }
713
 
 
714
  _FORMAT_RULES = """\
715
+ For EVERY supporting evidence, use this exact format:
716
 
717
  ┌─────────────────────────────────────────────┐
718
  │ ❝ {Arabic text} ❞
 
720
  │ 📖 Source: {exact citation from context}
721
  └─────────────────────────────────────────────┘
722
 
723
+ ABSOLUTE RULES:
724
+ • Use ONLY content from the Islamic Context block. Zero outside knowledge.
725
+ • Copy Arabic text and translations VERBATIM from context. Never paraphrase.
726
+ If a specific Hadith/verse is NOT in context → respond with:
727
+ "هذا الحديث/الآية غير موجود في قاعدة البيانات." (Arabic)
728
+ or "This Hadith/verse is not in the available dataset." (English)
729
+ Never invent or guess content.
730
+ • End with: "والله أعلم." (Arabic) or "And Allah knows best." (English)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
731
  """
732
 
733
  _SYSTEM_TEMPLATE = """\
 
741
  === OUTPUT FORMAT ===
742
  {fmt}
743
 
744
+ === ISLAMIC CONTEXT ===
 
 
745
  {context}
746
  === END CONTEXT ===
747
  """
 
754
  intent: str,
755
  analysis: Optional[dict] = None,
756
  ) -> List[dict]:
757
+ """Build system and user messages for LLM."""
758
  if analysis:
759
+ by_surah_str = "\n ".join([
760
+ f"Surah {s}: {data['name']} ({data['count']} times)"
761
+ for s, data in analysis["by_surah"].items()
762
+ ])
763
  analysis_block = (
764
  f"\n[ANALYSIS RESULT]\n"
765
+ f"The keyword «{analysis['keyword']}» appears {analysis['total_count']} times.\n"
766
+ f" {by_surah_str}\n"
 
767
  )
768
  context = analysis_block + context
769
 
 
772
  lang_instruction=language_instruction(lang),
773
  task=_TASK_INSTRUCTIONS.get(intent, _TASK_INSTRUCTIONS["general"]),
774
  fmt=_FORMAT_RULES,
 
775
  context=context,
776
  )
777
 
 
786
  ]
787
 
788
 
 
 
 
789
  def _not_found_answer(lang: str) -> str:
790
+ """Safe fallback when confidence is too low."""
 
 
 
791
  if lang == "arabic":
792
  return (
793
+ "لم أجد في قاعدة البيانات ما يكفي للإجابة على هذا السؤال بدقة.\n"
794
+ "يُرجى الرجوع إلى مصادر إسلامية موثوقة.\n"
795
  "والله أعلم."
796
  )
797
  return (
798
  "The available dataset does not contain sufficient information to answer "
799
+ "this question accurately.\nPlease refer to trusted Islamic sources.\n"
 
800
  "And Allah knows best."
801
  )
802
 
803
 
804
+ # ═══════════════════════════════════════════════════════════════════════
805
+ # HADITH GRADE INFERENCE
806
+ # ═══════════════════════════════════════════════════════════════════════
807
def infer_hadith_grade(item: dict) -> dict:
    """Infer a hadith's grade from its collection name when missing.

    Mutates and returns *item*: for a hadith with no explicit ``grade``, the
    grade is derived from the collection/reference text (e.g. anything from
    Bukhari or Muslim is graded "Sahih"). Non-hadith items and hadiths that
    already carry a grade are returned unchanged.

    Fix: tolerates explicit ``None`` values in "collection"/"reference"
    (previously ``None.lower()`` raised AttributeError).
    """
    if item.get("type") != "hadith" or item.get("grade"):
        return item

    # `or ""` guards against keys present with an explicit None value.
    collection = (item.get("collection") or "").lower()
    reference = (item.get("reference") or "").lower()
    combined = f"{collection} {reference}"

    # Ordered rules: the first marker found in `combined` decides the grade,
    # preserving the precedence of the original elif chain.
    rules = (
        # Sahih collections (highest authenticity)
        ("Sahih", ("sahih al-bukhari", "sahih bukhari", "bukhari")),
        ("Sahih", ("sahih muslim", "sahih al-muslim")),
        ("Sahih", ("sunan an-nasai", "sunan an-nasa", "nasa'i", "nasa")),
        # Hasan collections
        ("Hasan", ("jami at-tirmidhi", "tirmidhi", "at-tirmidhi")),
        ("Hasan", ("sunan abu dawood", "abu dawood", "abo daud", "abou daoude")),
        ("Hasan", ("sunan ibn majah", "ibn majah", "ibn maja")),
        ("Hasan", ("muwatta malik", "muwatta", "malik")),
        # Collections added by dataset enrichment
        ("Hasan/Sahih", ("musnad ahmad", "ahmad", "ahmed")),
        ("Hasan", ("sunan al-darimi", "darimi", "al-darimi")),
    )
    for grade, markers in rules:
        if any(marker in combined for marker in markers):
            item["grade"] = grade
            break

    return item
840
+
841
+
842
  # ═══════════════════════════════════════════════════════════════════════
843
  # APP STATE
844
  # ═══════════════════════════════════════════════════════════════════════
 
846
  embed_model: Optional[SentenceTransformer] = None
847
  faiss_index: Optional[faiss.Index] = None
848
  dataset: Optional[list] = None
849
+ llm: Optional[LLMProvider] = None
850
  ready: bool = False
851
 
852
 
 
855
 
856
  @asynccontextmanager
857
  async def lifespan(app: FastAPI):
858
+ """Initialize state on startup."""
859
  logger.info("⏳ Loading embed model: %s", cfg.EMBED_MODEL)
860
  state.embed_model = SentenceTransformer(cfg.EMBED_MODEL)
861
 
 
866
  with open(cfg.METADATA_FILE, "r", encoding="utf-8") as f:
867
  state.dataset = json.load(f)
868
 
869
+ # Infer hadith grades from collection names
870
+ state.dataset = [infer_hadith_grade(item) for item in state.dataset]
871
+
872
+ logger.info("⏳ Initializing LLM provider: %s", cfg.LLM_BACKEND)
873
+ state.llm = get_llm_provider()
 
 
 
 
874
 
875
  state.ready = True
876
  logger.info(
877
+ "✅ QModel v4 ready | backend=%s | dataset=%d | faiss=%d | threshold=%.2f",
878
+ cfg.LLM_BACKEND,
879
+ len(state.dataset) if state.dataset else 0,
880
+ state.faiss_index.ntotal if state.faiss_index else 0,
881
+ cfg.CONFIDENCE_THRESHOLD,
882
  )
883
  yield
884
  state.ready = False
 
889
  # FASTAPI APP
890
  # ═══════════════════════════════════════════════════════════════════════
891
# Application instance; heavy resources (embed model, FAISS index, dataset,
# LLM provider) are loaded by the `lifespan` handler, not at import time.
app = FastAPI(
    title="QModel v4 — Islamic RAG API",
    description="Specialized Quran & Hadith system with dual LLM backend",
    version="4.0.0",
    lifespan=lifespan,
)
897
 
 
912
  content: str = Field(..., min_length=1, max_length=4000)
913
 
914
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
915
class AnalysisResult(BaseModel):
    """Word-frequency analysis result (see count_occurrences)."""

    keyword: str                # raw keyword that was counted
    kw_stemmed: str             # light-stemmed form also matched
    total_count: int            # total occurrences across the Quran items
    by_surah: Dict[int, Dict]   # surah number -> {"name": ..., "count": ...}
    examples: List[dict]        # sample matches, capped at cfg.MAX_EXAMPLES
921
 
922
 
923
class SourceItem(BaseModel):
    """One retrieved Quran/Hadith source attached to an answer."""

    source: str
    type: str                       # "quran" | "hadith"
    grade: Optional[str] = None     # hadith authenticity grade, if known
    arabic: str
    english: str
    # NOTE(review): Pydantic treats underscore-prefixed names as private
    # attributes, not model fields — `_score` is likely dropped from the
    # serialized response. Confirm and consider renaming to `score`.
    _score: float
930
+
931
+
932
class AskResponse(BaseModel):
    """Response schema for the /ask endpoint."""

    question: str
    answer: str
    language: str                               # answer language ("arabic"/"english")
    intent: str                                 # classified intent (fatwa/tafsir/...)
    analysis: Optional[AnalysisResult] = None   # present for count queries only
    sources: List[SourceItem]
    top_score: float                            # best retrieval score (confidence proxy)
    latency_ms: int
941
+
942
+
943
class HadithVerifyResponse(BaseModel):
    """Response schema for /hadith/verify."""

    query: str
    found: bool                         # True when a matching hadith was retrieved
    collection: Optional[str] = None    # populated only when found
    grade: Optional[str] = None
    reference: Optional[str] = None
    arabic: Optional[str] = None
    english: Optional[str] = None
    latency_ms: int
952
+
953
+
954
+ # ═══════════════════════════════════════════════════════════════════════
955
+ # OPENAI-COMPATIBLE SCHEMAS (for Open-WebUI integration)
956
+ # ═══════════════════════════════════════════════════════════════════════
957
class ChatCompletionMessage(BaseModel):
    """Single chat message (OpenAI-compatible schema)."""

    role: str = Field(..., description="Message role: system, user, or assistant")
    content: str = Field(..., description="Message content")
960
+
961
+
962
class ChatCompletionRequest(BaseModel):
    """OpenAI-compatible chat completion request body.

    Defaults for temperature/max_tokens are read from cfg at import time.
    """

    model: str = Field(default="QModel", description="Model name")
    messages: List[ChatCompletionMessage] = Field(..., description="Messages for the model")
    temperature: Optional[float] = Field(default=cfg.TEMPERATURE, ge=0.0, le=2.0)
    top_p: Optional[float] = Field(default=1.0, ge=0.0, le=1.0)
    max_tokens: Optional[int] = Field(default=cfg.MAX_TOKENS, ge=1, le=8000)
    top_k: Optional[int] = Field(default=5, ge=1, le=20, description="Islamic sources to retrieve")
    stream: Optional[bool] = Field(default=False, description="Enable streaming responses")
970
+
971
+
972
class ChatCompletionChoice(BaseModel):
    """One completion choice (index + assistant message)."""

    index: int
    message: ChatCompletionMessage
    finish_reason: str = "stop"
976
+
977
+
978
class ChatCompletionResponse(BaseModel):
    """OpenAI-compatible chat completion response envelope."""

    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[ChatCompletionChoice]
    usage: dict                         # token counts (set to -1: not tracked)
    x_metadata: Optional[dict] = None   # QModel-specific metadata (intent, scores, sources)
986
+
987
+
988
class ModelInfo(BaseModel):
    """Single model entry for the OpenAI-compatible /v1/models listing."""

    id: str
    object: str = "model"
    created: int
    owned_by: str = "elgendy"
    permission: List[dict] = Field(default_factory=list)
    root: Optional[str] = None
    parent: Optional[str] = None
996
+
997
+
998
class ModelsListResponse(BaseModel):
    """Envelope for the /v1/models listing (OpenAI-compatible)."""

    object: str = "list"
    data: List[ModelInfo]
1001
 
1002
 
1003
  # ═══════════════════════════════════════════════════════════════════════
1004
+ # CORE RAG PIPELINE
1005
  # ═══════════════════════════════════════════════════════════════════════
1006
+ async def run_rag_pipeline(
1007
+ question: str,
1008
+ top_k: int = cfg.TOP_K_RETURN,
1009
+ source_type: Optional[Literal["quran", "hadith"]] = None,
1010
+ grade_filter: Optional[str] = None,
1011
+ ) -> dict:
1012
+ """Core RAG pipeline: rewrite → search → verify → generate."""
1013
  t0 = time.perf_counter()
1014
 
1015
  # 1. Query rewriting
1016
+ rewrite = await rewrite_query(question, state.llm)
1017
  intent = rewrite.get("intent", "general")
1018
 
1019
  # 2. Intent detection + hybrid search — concurrently
 
1022
  hybrid_search(
1023
  question, rewrite,
1024
  state.embed_model, state.faiss_index, state.dataset,
1025
+ top_k, source_type, grade_filter,
1026
  ),
1027
  )
1028
  analysis_kw, results = await asyncio.gather(kw_task, search_task)
 
1042
  intent, top_score, cfg.CONFIDENCE_THRESHOLD,
1043
  )
1044
 
1045
+ # 5. Confidence gate
 
 
 
 
1046
  if top_score < cfg.CONFIDENCE_THRESHOLD:
1047
  logger.warning(
1048
+ "Low confidence (%.3f < %.2f) — returning safe fallback",
1049
  top_score, cfg.CONFIDENCE_THRESHOLD,
1050
  )
1051
  return {
 
1058
  "latency_ms": int((time.perf_counter() - t0) * 1000),
1059
  }
1060
 
1061
+ # 6. Build context + prompt + LLM call
1062
+ context = build_context(results)
1063
  messages = build_messages(context, question, lang, intent, analysis)
1064
 
 
 
1065
  try:
1066
+ answer = await state.llm.chat(
1067
+ messages,
1068
+ max_tokens=cfg.MAX_TOKENS,
1069
+ temperature=cfg.TEMPERATURE,
 
 
 
1070
  )
1071
+ except Exception as exc:
1072
+ logger.error("LLM call failed: %s", exc)
1073
+ raise HTTPException(status_code=502, detail="LLM service unavailable")
1074
 
1075
  latency = int((time.perf_counter() - t0) * 1000)
1076
  logger.info(
 
1102
  # ═══════════════════════════════════════════════════════════════════════
1103
  @app.get("/health", tags=["ops"])
1104
  def health():
1105
+ """Health check endpoint."""
1106
  return {
1107
  "status": "ok" if state.ready else "initialising",
1108
+ "version": "4.0.0",
1109
+ "llm_backend": cfg.LLM_BACKEND,
1110
  "dataset_size": len(state.dataset) if state.dataset else 0,
1111
  "faiss_total": state.faiss_index.ntotal if state.faiss_index else 0,
1112
  "confidence_threshold": cfg.CONFIDENCE_THRESHOLD,
 
1113
  }
1114
 
1115
 
1116
+ @app.get("/v1/models", response_model=ModelsListResponse, tags=["models"])
1117
  def list_models():
1118
+ """List available models (OpenAI-compatible)."""
1119
+ return ModelsListResponse(
1120
+ data=[
1121
+ ModelInfo(
1122
+ id="QModel",
1123
+ created=int(time.time()),
1124
+ owned_by="elgendy",
1125
+ ),
1126
+ ModelInfo(
1127
+ id="qmodel", # Lowercase variant for compatibility
1128
+ created=int(time.time()),
1129
+ owned_by="elgendy",
1130
+ ),
1131
+ ]
1132
+ )
1133
+
1134
+
1135
+ @app.post("/v1/chat/completions", response_model=ChatCompletionResponse, tags=["inference"])
1136
+ async def chat_completions(request: ChatCompletionRequest):
1137
+ """OpenAI-compatible chat completions endpoint (for Open-WebUI integration)."""
1138
+ _check_ready()
1139
+
1140
+ # Extract user message (last message with role="user")
1141
+ user_messages = [m.content for m in request.messages if m.role == "user"]
1142
+ if not user_messages:
1143
+ raise HTTPException(status_code=400, detail="No user message in request")
1144
+
1145
+ question = user_messages[-1]
1146
+ top_k = request.top_k or cfg.TOP_K_RETURN
1147
+ temperature = request.temperature or cfg.TEMPERATURE
1148
+ max_tokens = request.max_tokens or cfg.MAX_TOKENS
1149
+
1150
+ try:
1151
+ result = await run_rag_pipeline(question, top_k=top_k)
1152
+ except HTTPException:
1153
+ raise
1154
+ except Exception as exc:
1155
+ logger.error("Pipeline error: %s", exc)
1156
+ raise HTTPException(status_code=500, detail=str(exc))
1157
+
1158
+ # Handle streaming if requested
1159
+ if request.stream:
1160
+ return StreamingResponse(
1161
+ _stream_response(result, request.model),
1162
+ media_type="text/event-stream",
1163
+ )
1164
+
1165
+ # Format response in OpenAI schema
1166
+ return ChatCompletionResponse(
1167
+ id=f"qmodel-{int(time.time() * 1000)}",
1168
+ created=int(time.time()),
1169
+ model=request.model,
1170
+ choices=[
1171
+ ChatCompletionChoice(
1172
+ index=0,
1173
+ message=ChatCompletionMessage(
1174
+ role="assistant",
1175
+ content=result["answer"],
1176
+ ),
1177
+ )
1178
+ ],
1179
+ usage={
1180
+ "prompt_tokens": -1,
1181
+ "completion_tokens": -1,
1182
+ "total_tokens": -1,
1183
+ },
1184
+ x_metadata={
1185
+ "language": result["language"],
1186
+ "intent": result["intent"],
1187
+ "top_score": round(result["top_score"], 4),
1188
+ "latency_ms": result["latency_ms"],
1189
+ "sources_count": len(result["sources"]),
1190
+ "sources": [
1191
+ {
1192
+ "source": s.get("source") or s.get("reference", ""),
1193
+ "type": s.get("type", ""),
1194
+ "grade": s.get("grade"),
1195
+ "score": round(s.get("_score", 0), 4),
1196
+ }
1197
+ for s in result.get("sources", [])[:5]
1198
+ ],
1199
+ "analysis": result.get("analysis"),
1200
+ },
1201
+ )
1202
+
1203
+
1204
+ async def _stream_response(result: dict, model: str):
1205
+ """Stream response chunks in OpenAI format."""
1206
+ import json
1207
+
1208
+ # Send answer in chunks
1209
+ answer = result.get("answer", "")
1210
+ for line in answer.split("\n"):
1211
+ chunk = {
1212
+ "id": f"qmodel-{int(time.time() * 1000)}",
1213
+ "object": "chat.completion.chunk",
1214
+ "created": int(time.time()),
1215
+ "model": model,
1216
+ "choices": [{
1217
+ "index": 0,
1218
+ "delta": {"content": line + "\n"},
1219
+ "finish_reason": None,
1220
+ }],
1221
+ }
1222
+ yield f"data: {json.dumps(chunk)}\n\n"
1223
+
1224
+ # Send final chunk
1225
+ final_chunk = {
1226
+ "id": f"qmodel-{int(time.time() * 1000)}",
1227
+ "object": "chat.completion.chunk",
1228
+ "created": int(time.time()),
1229
+ "model": model,
1230
+ "choices": [{
1231
+ "index": 0,
1232
+ "delta": {},
1233
+ "finish_reason": "stop",
1234
  }],
1235
  }
1236
+ yield f"data: {json.dumps(final_chunk)}\n\n"
1237
+ yield "data: [DONE]\n\n"
1238
+
1239
+
1240
+ @app.get("/ask", response_model=AskResponse, tags=["inference"])
1241
+ async def ask(
1242
+ q: str = Query(..., min_length=1, max_length=1000, description="Your Islamic question"),
1243
+ top_k: int = Query(cfg.TOP_K_RETURN, ge=1, le=20, description="Number of sources"),
1244
+ source_type: Optional[str] = Query(None, description="Filter: quran|hadith"),
1245
+ grade_filter: Optional[str] = Query(None, description="Filter Hadith: sahih|hasan|,all"),
1246
+ ):
1247
+ """Main inference endpoint."""
1248
+ _check_ready()
1249
+ result = await run_rag_pipeline(q, top_k, source_type, grade_filter)
1250
+
1251
+ sources = [
1252
+ SourceItem(
1253
+ source=r.get("source") or r.get("reference") or "Unknown",
1254
+ type=r.get("type", "unknown"),
1255
+ grade=r.get("grade"),
1256
+ arabic=r.get("arabic", ""),
1257
+ english=r.get("english", ""),
1258
+ _score=r.get("_score", 0.0),
1259
+ )
1260
+ for r in result["sources"]
1261
+ ]
1262
+
1263
+ return AskResponse(
1264
+ question=q,
1265
+ answer=result["answer"],
1266
+ language=result["language"],
1267
+ intent=result["intent"],
1268
+ analysis=result["analysis"],
1269
+ sources=sources,
1270
+ top_score=result["top_score"],
1271
+ latency_ms=result["latency_ms"],
1272
+ )
1273
+
1274
+
1275
+ @app.get("/hadith/verify", response_model=HadithVerifyResponse, tags=["hadith"])
1276
+ async def verify_hadith(
1277
+ q: str = Query(..., description="First few words or query of Hadith"),
1278
+ collection: Optional[str] = Query(None, description="Filter: bukhari|muslim|all"),
1279
+ ):
1280
+ """Verify if a Hadith is in authenticated collections."""
1281
+ _check_ready()
1282
+ t0 = time.perf_counter()
1283
+
1284
+ results = await hybrid_search(
1285
+ q, {"ar_query": q, "en_query": q, "keywords": q.split(), "intent": "hadith"},
1286
+ state.embed_model, state.faiss_index, state.dataset,
1287
+ top_n=5, source_type="hadith", grade_filter="sahih",
1288
+ )
1289
+
1290
+ if results:
1291
+ r = results[0]
1292
+ return HadithVerifyResponse(
1293
+ query=q,
1294
+ found=True,
1295
+ collection=r.get("collection"),
1296
+ grade=r.get("grade"),
1297
+ reference=r.get("reference"),
1298
+ arabic=r.get("arabic"),
1299
+ english=r.get("english"),
1300
+ latency_ms=int((time.perf_counter() - t0) * 1000),
1301
+ )
1302
+
1303
+ return HadithVerifyResponse(
1304
+ query=q,
1305
+ found=False,
1306
+ latency_ms=int((time.perf_counter() - t0) * 1000),
1307
+ )
1308
 
1309
 
1310
  @app.get("/debug/scores", tags=["ops"])
1311
  async def debug_scores(
1312
+ q: str = Query(..., min_length=1, max_length=1000),
1313
  top_k: int = Query(10, ge=1, le=20),
1314
  ):
1315
+ """Debug: inspect raw retrieval scores without LLM."""
1316
  _check_ready()
1317
+ rewrite = await rewrite_query(q, state.llm)
1318
  results = await hybrid_search(q, rewrite, state.embed_model, state.faiss_index, state.dataset, top_k)
1319
  return {
1320
  "intent": rewrite.get("intent"),
 
1324
  "rank": i + 1,
1325
  "source": r.get("source") or r.get("reference"),
1326
  "type": r.get("type"),
1327
+ "grade": r.get("grade"),
1328
  "_dense": round(r.get("_dense", 0), 4),
1329
  "_sparse": round(r.get("_sparse", 0), 4),
1330
  "_score": round(r.get("_score", 0), 4),
 
1331
  }
1332
  for i, r in enumerate(results)
1333
  ],
1334
  }
1335
 
1336
 
1337
+ if __name__ == "__main__":
1338
+ import uvicorn
1339
+ uvicorn.run(app, host="0.0.0.0", port=8000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,9 +1,21 @@
1
- sentence-transformers
2
- faiss-cpu
3
- fastapi
4
- uvicorn
5
- numpy
6
- accelerate
7
- torch
8
- ollama
9
- python-dotenv
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Web framework
2
+ fastapi==0.104.1
3
+ uvicorn[standard]==0.24.0
4
+ pydantic==2.4.2
5
+
6
+ # Core: Embeddings & Search
7
+ sentence-transformers==2.2.2
8
+ faiss-cpu==1.7.4
9
+ numpy==1.24.3
10
+
11
+ # Optional: HuggingFace backend
12
+ transformers==4.34.1
13
+ torch==2.1.1
14
+ accelerate==0.24.1
15
+
16
+ # Optional: Ollama backend
17
+ ollama==0.0.48
18
+
19
+ # Configuration & Data
20
+ python-dotenv==1.0.0
21
+ requests==2.31.0