aelgendy committed on
Commit
4d5fcc9
ยท
1 Parent(s): c580971

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +21 -0
  2. build_index.py +993 -45
  3. main.py +160 -12
README.md CHANGED
@@ -1,3 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # QModel v4 โ€” Islamic RAG System
2
  **Specialized Qur'an & Hadith Knowledge System with Dual LLM Support**
3
 
 
1
+ ---
2
+ title: QModel
3
+ emoji: ๐Ÿ•Œ
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: docker
7
+ app_port: 8000
8
+ license: mit
9
+ tags:
10
+ - quran
11
+ - hadith
12
+ - islamic
13
+ - rag
14
+ - faiss
15
+ - nlp
16
+ - arabic
17
+ language:
18
+ - ar
19
+ - en
20
+ ---
21
+
22
  # QModel v4 โ€” Islamic RAG System
23
  **Specialized Qur'an & Hadith Knowledge System with Dual LLM Support**
24
 
build_index.py CHANGED
@@ -1,79 +1,1027 @@
1
  #!/usr/bin/env python3
2
  """
3
- Regenerate FAISS index with enriched metadata.
4
- This script loads the enriched metadata and generates embeddings for all documents.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  """
6
 
7
  import json
 
 
 
 
 
8
  import numpy as np
9
  from pathlib import Path
 
 
 
10
  import faiss
 
11
  from sentence_transformers import SentenceTransformer
12
  from tqdm import tqdm
13
 
14
- def generate_embeddings(model_name: str = "intfloat/multilingual-e5-large"):
15
- """Generate embeddings for all documents in metadata.json"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- metadata_path = Path("/Users/elgendy/Projects/QModel/metadata.json")
18
- index_path = Path("/Users/elgendy/Projects/QModel/QModel.index")
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- # Load metadata
21
- print("Loading metadata...")
22
- with open(metadata_path, 'r', encoding='utf-8') as f:
23
- documents = json.load(f)
24
 
25
- print(f"Total documents: {len(documents)}")
 
 
 
 
 
26
 
27
- # Load embedding model
28
- print(f"\nLoading embedding model: {model_name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  model = SentenceTransformer(model_name)
30
  embedding_dim = model.get_sentence_embedding_dimension()
31
- print(f"Embedding dimension: {embedding_dim}")
32
 
33
- # Prepare texts for embedding
34
- all_texts = []
35
  for doc in documents:
36
  if doc.get("type") == "quran":
37
- # For Quran: use Tafseer/meaning + Sura name
38
- text = f"{doc.get('surah_name_en', '')} {doc.get('english', '')}"
 
 
 
 
39
  else: # hadith
40
- # For Hadith: use collection + Arabic text (for better semantic matching)
41
- text = f"{doc.get('collection', '')} {doc.get('arabic', '')} {doc.get('english', '')}"
42
-
 
 
43
  all_texts.append(text.strip())
44
 
45
- # Generate embeddings in batches for efficiency
46
- print(f"\nGenerating embeddings for {len(all_texts)} documents...")
47
- batch_size = 32
48
  all_embeddings = []
49
-
50
- for i in tqdm(range(0, len(all_texts), batch_size), desc="Embedding batches"):
51
- batch_texts = all_texts[i:i + batch_size]
52
- batch_embeddings = model.encode(batch_texts, convert_to_numpy=True)
53
- all_embeddings.extend(batch_embeddings)
 
 
54
 
55
  embeddings = np.array(all_embeddings, dtype=np.float32)
56
- print(f"Generated embeddings shape: {embeddings.shape}")
57
 
58
- # Create FAISS index
59
- print("\nCreating FAISS index...")
60
- index = faiss.IndexFlatIP(embedding_dim) # Inner product (cosine on normalized)
61
  faiss.normalize_L2(embeddings)
62
  index.add(embeddings)
63
 
64
- # Save index
65
- print(f"Saving FAISS index to {index_path}")
66
- faiss.write_index(index, str(index_path))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- print(f"\n{'='*60}")
69
- print("Index Generation Complete")
70
- print(f"{'='*60}")
71
- print(f"Documents indexed: {len(documents)}")
72
- print(f"Embeddings generated: {len(all_embeddings)}")
73
- print(f"Index file size: {index_path.stat().st_size / (1024*1024):.2f} MB")
74
- print(f"Index capacity: {index.ntotal}")
75
- print(f"{'='*60}")
76
 
77
 
78
  if __name__ == "__main__":
79
- generate_embeddings()
 
1
  #!/usr/bin/env python3
2
  """
3
+ QModel Dataset Builder v2
4
+ =========================
5
+ Builds metadata.json and QModel.index from scratch using multiple
6
+ authoritative sources.
7
+
8
+ Data Sources:
9
+ Quran:
10
+ - risan/quran-json (Arabic text + English translation + chapter metadata)
11
+ - semarketir/quranjson (verse transliteration)
12
+ Tafsir:
13
+ - Kaggle tafseer dataset (primary tafsir enrichment)
14
+ - Quran.com API (fallback tafsir enrichment)
15
+ Hadith:
16
+ - AhmedBaset/hadith-json (9 books: Arabic + English, chapter structure)
17
+ - fawazahmed0/hadith-api (grade information from scholars)
18
+
19
+ Usage:
20
+ python build_index.py # full build from scratch
21
+ python build_index.py --force-download # re-download all sources
22
+ python build_index.py --data-only # generate metadata.json, skip index
23
+ python build_index.py --index-only # build index from existing metadata.json
24
+ python build_index.py --skip-tafsir # skip tafsir enrichment
25
  """
26
 
27
  import json
28
+ import os
29
+ import re
30
+ import time
31
+ import argparse
32
+ import zipfile
33
  import numpy as np
34
  from pathlib import Path
35
+ from collections import defaultdict
36
+ from typing import Any, Dict, List, Optional, Tuple
37
+
38
  import faiss
39
+ import requests
40
  from sentence_transformers import SentenceTransformer
41
  from tqdm import tqdm
42
 
43
+ # โ”€โ”€ Paths โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
44
+ BASE_DIR = Path(__file__).resolve().parent
45
+ CACHE_DIR = BASE_DIR / "data" / "cache"
46
+ METADATA_PATH = BASE_DIR / "metadata.json"
47
+ INDEX_PATH = BASE_DIR / "QModel.index"
48
+
49
+ # โ”€โ”€ Quran source URLs โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
50
+ QURAN_JSON_URL = (
51
+ "https://raw.githubusercontent.com/risan/quran-json/main/data/quran.json"
52
+ )
53
+ CHAPTERS_EN_URL = (
54
+ "https://raw.githubusercontent.com/risan/quran-json/main/data/chapters/en.json"
55
+ )
56
+ SEMARKETIR_SURAH_URL_TPL = (
57
+ "https://raw.githubusercontent.com/semarketir/quranjson"
58
+ "/master/source/surah/surah_{n}.json"
59
+ )
60
+ SEMARKETIR_TRANSLATION_URL_TPL = (
61
+ "https://raw.githubusercontent.com/semarketir/quranjson"
62
+ "/master/source/translation/en/en_translation_{n}.json"
63
+ )
64
+ # CDN dist per-chapter English (Arabic + English + transliteration)
65
+ CDN_CHAPTER_EN_URL_TPL = (
66
+ "https://cdn.jsdelivr.net/npm/quran-json@3.1.2/dist/chapters/en/{n}.json"
67
+ )
68
+
69
+ # โ”€โ”€ Tafsir sources โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
70
+ KAGGLE_TAFSIR_URL = (
71
+ "https://www.kaggle.com/api/v1/datasets/download/"
72
+ "abdelrahmanahmed110/quranic-ayahs-with-tafseer-json-dataset"
73
+ )
74
+ # Fallback: Quran.com API
75
+ QURAN_API_BASE = "https://api.quran.com/api/v4"
76
+ TAFSIR_EN_ID = 169 # Ibn Kathir (Abridged) โ€“ English
77
+ TAFSIR_AR_ID = 16 # Al-Muyassar โ€“ Arabic
78
+
79
+ # โ”€โ”€ Hadith source: AhmedBaset โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
80
+ AHMEDBASET_BASE_URL = (
81
+ "https://raw.githubusercontent.com/AhmedBaset/hadith-json"
82
+ "/main/db/by_book/the_9_books"
83
+ )
84
+ HADITH_BOOKS = {
85
+ "ahmed.json": {
86
+ "collection": "Musnad Ahmad",
87
+ "id_prefix": "ahmad",
88
+ "author": "Imam Ahmad ibn Hanbal",
89
+ },
90
+ "bukhari.json": {
91
+ "collection": "Sahih al-Bukhari",
92
+ "id_prefix": "bukhari",
93
+ "author": "Muhammad al-Bukhari",
94
+ },
95
+ "muslim.json": {
96
+ "collection": "Sahih Muslim",
97
+ "id_prefix": "muslim",
98
+ "author": "Muslim ibn al-Hajjaj",
99
+ },
100
+ "abudawud.json": {
101
+ "collection": "Sunan Abu Dawood",
102
+ "id_prefix": "abudawud",
103
+ "author": "Abu Dawood Sulaiman",
104
+ },
105
+ "tirmidhi.json": {
106
+ "collection": "Jami' at-Tirmidhi",
107
+ "id_prefix": "tirmidhi",
108
+ "author": "Al-Tirmidhi",
109
+ },
110
+ "ibnmajah.json": {
111
+ "collection": "Sunan Ibn Majah",
112
+ "id_prefix": "ibnmajah",
113
+ "author": "Ibn Majah al-Qazwini",
114
+ },
115
+ "nasai.json": {
116
+ "collection": "Sunan an-Nasai",
117
+ "id_prefix": "nasai",
118
+ "author": "Ahmad al-Nasai",
119
+ },
120
+ "malik.json": {
121
+ "collection": "Muwatta Malik",
122
+ "id_prefix": "malik",
123
+ "author": "Malik ibn Anas",
124
+ },
125
+ "darimi.json": {
126
+ "collection": "Sunan al-Darimi",
127
+ "id_prefix": "darimi",
128
+ "author": "Al-Darimi",
129
+ },
130
+ }
131
+
132
+ # โ”€โ”€ Hadith source: fawazahmed0 (for grades) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
133
+ FAWAZ_CDN_BASE = "https://cdn.jsdelivr.net/gh/fawazahmed0/hadith-api@1"
134
+ FAWAZ_RAW_BASE = (
135
+ "https://raw.githubusercontent.com/fawazahmed0/hadith-api/1"
136
+ )
137
+ FAWAZ_EDITION_MAP = {
138
+ "bukhari": "eng-bukhari",
139
+ "muslim": "eng-muslim",
140
+ "abudawud": "eng-abudawud",
141
+ "tirmidhi": "eng-tirmidhi",
142
+ "nasai": "eng-nasai",
143
+ "ibnmajah": "eng-ibnmajah",
144
+ "malik": "eng-malik",
145
+ "ahmad": "eng-ahmed",
146
+ "darimi": "eng-darimi",
147
+ }
148
+
149
+ # โ”€โ”€ Embedding / network config โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
150
+ DEFAULT_EMBED_MODEL = "intfloat/multilingual-e5-large"
151
+ EMBED_BATCH_SIZE = 32
152
+ REQUEST_TIMEOUT = 60
153
+ RETRY_ATTEMPTS = 3
154
+ RETRY_DELAY = 2
155
+
156
+
157
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
158
+ # UTILITIES
159
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
160
+
161
+ def _ensure_dir(path: Path):
162
+ path.mkdir(parents=True, exist_ok=True)
163
+
164
+
165
def download_json(
    url: str,
    cache_path: Optional[Path] = None,
    force: bool = False,
) -> Any:
    """Download JSON with optional file caching and retries.

    Args:
        url: HTTP(S) endpoint expected to return a JSON body.
        cache_path: if given, the parsed payload is persisted here and
            served from disk on later calls (unless ``force``).
        force: re-download even when a cached copy exists.

    Returns:
        The parsed JSON value (dict, list, ...).

    Raises:
        Whatever the final attempt raised (network error, bad status,
        or JSON decode error) after RETRY_ATTEMPTS failures.
    """
    # Cache hit: skip the network entirely.
    if cache_path and cache_path.exists() and not force:
        with open(cache_path, "r", encoding="utf-8") as f:
            return json.load(f)

    for attempt in range(1, RETRY_ATTEMPTS + 1):
        try:
            resp = requests.get(url, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            data = resp.json()
            if cache_path:
                # Write the cache only after a fully successful fetch+parse,
                # so a failed attempt can never poison the cache.
                _ensure_dir(cache_path.parent)
                with open(cache_path, "w", encoding="utf-8") as f:
                    json.dump(data, f, ensure_ascii=False)
            return data
        except Exception as exc:
            if attempt == RETRY_ATTEMPTS:
                raise
            print(f"  Retry {attempt}/{RETRY_ATTEMPTS} for {url}: {exc}")
            # Linear backoff: delay grows with the attempt number.
            time.sleep(RETRY_DELAY * attempt)
190
+
191
+
192
def download_file(
    url: str,
    cache_path: Path,
    force: bool = False,
    auth: Optional[Tuple[str, str]] = None,
) -> Path:
    """Download a binary file to *cache_path* with caching and retries.

    BUG FIX: the payload is now streamed to a temporary ``.part`` file and
    atomically renamed into place only after the transfer completes. The
    cache check is merely "exists and non-empty", so the previous in-place
    write could leave a truncated file behind after an interrupted download,
    and every later run would treat that corrupt file as a valid cache hit.

    Args:
        url: source URL.
        cache_path: destination path; reused as a cache when non-empty.
        auth: optional (username, key) HTTP basic-auth pair (e.g. Kaggle).
        force: re-download even when a cached copy exists.

    Returns:
        cache_path, once the file is present on disk.

    Raises:
        The last network exception after RETRY_ATTEMPTS failed attempts.
    """
    if cache_path.exists() and cache_path.stat().st_size > 0 and not force:
        return cache_path

    _ensure_dir(cache_path.parent)
    tmp_path = cache_path.with_name(cache_path.name + ".part")
    for attempt in range(1, RETRY_ATTEMPTS + 1):
        try:
            resp = requests.get(
                url, timeout=REQUEST_TIMEOUT, stream=True, auth=auth,
            )
            resp.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    f.write(chunk)
            # Publish the completed file in one step (atomic on POSIX).
            tmp_path.replace(cache_path)
            return cache_path
        except Exception as exc:
            if attempt == RETRY_ATTEMPTS:
                raise
            print(f"  Retry {attempt}/{RETRY_ATTEMPTS}: {exc}")
            time.sleep(RETRY_DELAY * attempt)
218
+
219
+
220
def strip_html(text: str) -> str:
    """Strip HTML tags from *text* and collapse all whitespace runs to one space."""
    without_tags = re.sub(r"<[^>]+>", " ", text)
    collapsed = re.sub(r"\s+", " ", without_tags)
    return collapsed.strip()
224
+
225
+
226
+ def _kaggle_auth() -> Optional[Tuple[str, str]]:
227
+ """Return (username, key) from env vars or ~/.kaggle/kaggle.json."""
228
+ username = os.environ.get("KAGGLE_USERNAME")
229
+ key = os.environ.get("KAGGLE_KEY")
230
+ if username and key:
231
+ return (username, key)
232
+ kaggle_json = Path.home() / ".kaggle" / "kaggle.json"
233
+ if kaggle_json.exists():
234
+ with open(kaggle_json, "r") as f:
235
+ creds = json.load(f)
236
+ u, k = creds.get("username"), creds.get("key")
237
+ if u and k:
238
+ return (u, k)
239
+ return None
240
+
241
+
242
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
243
+ # STEP 1: FETCH & BUILD QURAN ENTRIES
244
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
245
+
246
def fetch_quran_sources(
    force: bool = False,
) -> Tuple[Dict[int, Dict], Dict[int, Dict], Dict[int, Dict], Dict[int, Dict]]:
    """Download Quran data from all sources.

    Args:
        force: re-download every source even when cached copies exist.

    Returns (cdn_chapters, quran_data, chapter_meta, semarketir_translations).
    cdn_chapters: { surah_num: { "id", "name", "transliteration", "translation",
                    "type", "total_verses", "verses": [{"id", "text",
                    "translation", "transliteration"}] } }  (primary)
    quran_data: raw quran.json { "N": [{"chapter", "verse", "text"}] }
    chapter_meta: { surah_num: {"id", "name", "transliteration", "translation",
                    "type", "total_verses"} }  (fallback metadata)
    semarketir_translations: { surah_num: { "verse": {"1": "english_text"} } }
    """
    print("=" * 60)
    print("Step 1: Fetching Quran Sources")
    print("=" * 60)

    # 1a. CDN per-chapter English (primary – has Arabic + English + transliteration)
    print("  Downloading per-chapter English data from CDN …")
    cdn_chapters: Dict[int, Dict] = {}
    # The Quran has 114 surahs, numbered 1..114.
    for n in tqdm(range(1, 115), desc="  CDN chapters", leave=True):
        try:
            url = CDN_CHAPTER_EN_URL_TPL.format(n=n)
            data = download_json(
                url,
                cache_path=CACHE_DIR / "quran" / "cdn_en" / f"{n}.json",
                force=force,
            )
            cdn_chapters[n] = data
        except Exception as exc:
            # A missing chapter is tolerated; build_quran_entries falls
            # back to quran.json for any surah absent from cdn_chapters.
            print(f"\n  ✗ Chapter {n}: {exc}")
    print(f"  ✓ Loaded {len(cdn_chapters)} chapters from CDN")

    # 1b. risan/quran-json – full Quran text (fallback Arabic)
    print("  Downloading quran.json from risan/quran-json …")
    quran_data = download_json(
        QURAN_JSON_URL,
        cache_path=CACHE_DIR / "quran" / "quran.json",
        force=force,
    )
    print(f"  ✓ Loaded {len(quran_data)} surahs")

    # 1c. risan/quran-json – chapter metadata (fallback)
    print("  Downloading chapters/en.json …")
    chapters_raw = download_json(
        CHAPTERS_EN_URL,
        cache_path=CACHE_DIR / "quran" / "chapters_en.json",
        force=force,
    )
    # Normalize to {surah_number: metadata} whatever shape the payload has.
    chapter_meta: Dict[int, Dict] = {}
    if isinstance(chapters_raw, list):
        chapter_meta = {ch["id"]: ch for ch in chapters_raw}
    elif isinstance(chapters_raw, dict):
        chapter_meta = {int(k): v for k, v in chapters_raw.items()}
    print(f"  ✓ Loaded {len(chapter_meta)} chapter records")

    # 1d. semarketir English translations (additional fallback)
    print("  Downloading English translations from semarketir/quranjson …")
    semarketir_translations: Dict[int, Dict] = {}
    for n in tqdm(range(1, 115), desc="  Semarketir EN", leave=True):
        try:
            url = SEMARKETIR_TRANSLATION_URL_TPL.format(n=n)
            data = download_json(
                url,
                cache_path=CACHE_DIR / "quran" / "semarketir_en" / f"en_translation_{n}.json",
                force=force,
            )
            semarketir_translations[n] = data
        except Exception as exc:
            print(f"\n  ✗ Surah {n} translation: {exc}")
    print(f"  ✓ Loaded translation for {len(semarketir_translations)} surahs")

    return cdn_chapters, quran_data, chapter_meta, semarketir_translations
320
+
321
+
322
def build_quran_entries(
    cdn_chapters: Dict[int, Dict],
    quran_data: Dict,
    chapter_meta: Dict[int, Dict],
    semarketir_translations: Dict[int, Dict],
) -> List[Dict]:
    """Merge the Quran sources into one flat list of verse entries.

    Source priority:
        Arabic text:      CDN verses, then quran.json
        English text:     CDN verses, then semarketir translations
        Transliteration:  CDN only
        Chapter metadata: CDN, then chapters/en.json
    """
    print("\n" + "=" * 60)
    print("Step 2: Building Quran Entries")
    print("=" * 60)

    # Fallback Arabic lookup keyed "surah:verse", built from quran.json
    # (shape: { "N": [{"chapter": int, "verse": int, "text": str}] }).
    arabic_fallback: Dict[str, str] = {
        f"{v.get('chapter', surah_key)}:{v.get('verse', '')}": v.get("text", "")
        for surah_key, verses in quran_data.items()
        if isinstance(verses, list)
        for v in verses
    }

    # Fallback English lookup keyed "surah:verse", from semarketir
    # (shape: { surah_num: {"verse": {"1": "english_text"}} }).
    en_fallback: Dict[str, str] = {}
    for surah_num, sdata in semarketir_translations.items():
        verse_map = sdata.get("verse", {})
        if not isinstance(verse_map, dict):
            continue
        for vnum_str, text in verse_map.items():
            en_fallback[f"{surah_num}:{vnum_str}"] = text if isinstance(text, str) else ""

    # Union of surah numbers seen in either primary or fallback source.
    surah_numbers = sorted(set(cdn_chapters.keys()) | {int(k) for k in quran_data.keys()})

    entries: List[Dict] = []
    for surah_num in surah_numbers:
        cdn = cdn_chapters.get(surah_num, {})
        meta = chapter_meta.get(surah_num, {})

        # Chapter metadata – prefer CDN, fall back to chapters_en.json.
        name_ar = cdn.get("name", meta.get("name", ""))
        name_en = cdn.get("translation", meta.get("translation", ""))
        name_translit = cdn.get("transliteration", meta.get("transliteration", ""))
        rev_type = cdn.get("type", meta.get("type", "")).lower()
        n_verses = cdn.get("total_verses", meta.get("total_verses", 0))

        def _entry(verse_num: Any, arabic: str, english: str, translit: str) -> Dict:
            # One metadata.json record for a single verse of this surah.
            vk = f"{surah_num}:{verse_num}"
            return {
                "id": vk,
                "arabic": arabic,
                "english": english,
                "source": f"Surah {name_ar} {vk}",
                "surah_number": surah_num,
                "surah_name_en": name_en,
                "surah_name_ar": name_ar,
                "verse_number": verse_num,
                "transliteration": translit,
                "type": "quran",
                "surah_name_transliteration": name_translit,
                "revelation_type": rev_type,
                "total_verses": n_verses,
            }

        cdn_verses = cdn.get("verses", [])
        if cdn_verses:
            # Primary path: CDN carries Arabic + English + transliteration.
            for verse in cdn_verses:
                vnum = verse["id"]
                vk = f"{surah_num}:{vnum}"
                entries.append(_entry(
                    vnum,
                    verse.get("text", arabic_fallback.get(vk, "")),
                    verse.get("translation", en_fallback.get(vk, "")),
                    verse.get("transliteration", ""),
                ))
        else:
            # Fallback path: reconstruct from quran.json + semarketir.
            raw_verses = quran_data.get(str(surah_num), [])
            if isinstance(raw_verses, list):
                for v in raw_verses:
                    vnum = v.get("verse", v.get("id", 0))
                    vk = f"{surah_num}:{vnum}"
                    entries.append(_entry(
                        vnum,
                        v.get("text", ""),
                        en_fallback.get(vk, ""),
                        "",
                    ))

    print(f"  ✓ Built {len(entries):,} Quran verses across {len(surah_numbers)} surahs")
    return entries
422
+
423
+
424
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
425
+ # STEP 3: ENRICH QURAN WITH TAFSIR
426
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
427
+
428
+ def _extract_verse_key(item: Dict) -> Optional[str]:
429
+ """Try to extract a 'surah:verse' key from a tafsir record."""
430
+ surah_fields = [
431
+ "sura_no", "surah", "surah_number", "sura",
432
+ "chapter", "chapter_no", "SuraID", "SurahNumber",
433
+ ]
434
+ verse_fields = [
435
+ "aya_no", "ayah", "verse_number", "aya",
436
+ "verse", "ayah_number", "AyaID", "VerseNumber",
437
+ ]
438
+
439
+ surah = verse = None
440
+ for f in surah_fields:
441
+ if f in item:
442
+ surah = item[f]
443
+ break
444
+ for f in verse_fields:
445
+ if f in item:
446
+ verse = item[f]
447
+ break
448
+
449
+ if surah is not None and verse is not None:
450
+ return f"{int(surah)}:{int(verse)}"
451
+
452
+ if "verse_key" in item:
453
+ return item["verse_key"]
454
+ return None
455
+
456
+
457
def _extract_tafsir_text(item: Dict) -> Optional[Dict[str, str]]:
    """Pull English/Arabic tafsir text out of a record with unknown schema.

    Returns a dict containing "tafsir_en" and/or "tafsir_ar" (HTML-stripped),
    or None when the record holds no recognizable tafsir text.
    """
    english_keys = (
        "tafseer_en", "tafsir_en", "tafseer_english", "tafsir_english",
        "english_tafsir", "english_tafseer", "interpretation_en",
    )
    arabic_keys = (
        "tafseer_ar", "tafsir_ar", "tafseer_arabic", "tafsir_arabic",
        "arabic_tafsir", "arabic_tafseer", "interpretation_ar",
        "tafseer", "tafsir",
    )

    found: Dict[str, str] = {}

    for key in english_keys:
        if item.get(key):
            found["tafsir_en"] = strip_html(str(item[key]))
            break

    for key in arabic_keys:
        if item.get(key):
            value = str(item[key])
            # Classify by script: any codepoint in the Arabic Unicode block.
            if any("\u0600" <= ch <= "\u06ff" for ch in value):
                found["tafsir_ar"] = strip_html(value)
            elif "tafsir_en" not in found:
                # No Arabic characters detected – treat the text as English.
                found["tafsir_en"] = strip_html(value)
            break

    # Some dumps nest the text one level down, e.g. {"tafseer": {"1": "…"}}.
    if not found:
        for key in ("tafseer", "tafsir"):
            nested = item.get(key)
            if isinstance(nested, dict):
                for value in nested.values():
                    if value:
                        found["tafsir_en"] = strip_html(str(value))
                        break
                break

    return found or None
498
+
499
+
500
def _load_tafsir_from_records(records: List[Dict]) -> Dict[str, Dict[str, str]]:
    """Fold a list of tafsir records into a {"surah:verse": texts} map.

    Records that yield no verse key or no text are silently skipped;
    later records merge into (and may extend) earlier ones.
    """
    merged: Dict[str, Dict[str, str]] = {}
    for record in records:
        key = _extract_verse_key(record)
        if not key:
            continue
        texts = _extract_tafsir_text(record)
        if texts:
            merged.setdefault(key, {}).update(texts)
    return merged
511
+
512
+
513
def fetch_kaggle_tafsir(
    force: bool = False,
) -> Optional[Dict[str, Dict[str, str]]]:
    """Download and parse the Kaggle tafsir dataset (ZIP).

    Args:
        force: re-download the archive even when a cached copy exists.

    Returns { "surah:verse": {"tafsir_en": …, "tafsir_ar": …} } or None on
    any failure (download, ZIP validation, extraction, or empty archive),
    in which case the caller falls back to the Quran.com API.
    """
    zip_path = CACHE_DIR / "tafsir" / "kaggle_tafsir.zip"
    extract_dir = CACHE_DIR / "tafsir" / "kaggle_extracted"

    # Download (the Kaggle API endpoint may require credentials).
    try:
        print("  Downloading Kaggle tafsir dataset …")
        auth = _kaggle_auth()
        download_file(KAGGLE_TAFSIR_URL, zip_path, force=force, auth=auth)
    except Exception as exc:
        print(f"  ✗ Kaggle download failed: {exc}")
        print(
            "    Tip: set KAGGLE_USERNAME and KAGGLE_KEY env vars, "
            "or place kaggle.json in ~/.kaggle/"
        )
        return None

    # Verify it's actually a ZIP: an unauthenticated request can return an
    # HTML error page, which would otherwise be cached as the "archive".
    if not zipfile.is_zipfile(zip_path):
        print("  ✗ Downloaded file is not a valid ZIP (may need Kaggle auth)")
        return None

    # Extract
    try:
        _ensure_dir(extract_dir)
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(extract_dir)
        print(f"  ✓ Extracted to {extract_dir}")
    except Exception as exc:
        print(f"  ✗ Failed to extract ZIP: {exc}")
        return None

    # Parse every JSON file inside the archive; the dataset layout varies
    # between versions, so both list- and dict-shaped payloads are handled.
    json_files = list(extract_dir.rglob("*.json"))
    if not json_files:
        print("  ✗ No JSON files found in Kaggle archive")
        return None

    print(f"  Found {len(json_files)} JSON file(s) in archive")
    tafsir_map: Dict[str, Dict[str, str]] = {}

    for jf in json_files:
        try:
            with open(jf, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as exc:
            print(f"  ✗ Error parsing {jf.name}: {exc}")
            continue

        if isinstance(data, list):
            # NOTE(review): dict.update replaces whole per-verse dicts when
            # the same verse appears in multiple files — later files win.
            tafsir_map.update(_load_tafsir_from_records(data))
        elif isinstance(data, dict):
            # Might be keyed by surah number or some other grouping
            for _key, value in data.items():
                if isinstance(value, list):
                    tafsir_map.update(_load_tafsir_from_records(value))
                elif isinstance(value, dict):
                    vk = _extract_verse_key(value)
                    if vk:
                        tt = _extract_tafsir_text(value)
                        if tt:
                            tafsir_map.setdefault(vk, {}).update(tt)

    if tafsir_map:
        print(f"  ✓ Loaded tafsir for {len(tafsir_map):,} verses from Kaggle")
    return tafsir_map if tafsir_map else None
585
+
586
 
587
def _fetch_tafsir_chapter_api(
    tafsir_id: int, chapter: int,
) -> Dict[str, str]:
    """Fetch all tafsir entries for a chapter from Quran.com API.

    Pages through the endpoint (50 records per page) until the pagination
    block reports no next page.

    Args:
        tafsir_id: Quran.com tafsir resource id (e.g. TAFSIR_EN_ID).
        chapter: surah number (1..114).

    Returns:
        { "surah:verse": html-stripped tafsir text } for the chapter.

    Raises:
        requests.HTTPError / network errors — no retry at this level;
        the caller (fetch_qurancom_tafsir) catches per-surah failures.
    """
    result: Dict[str, str] = {}
    page = 1
    while True:
        url = (
            f"{QURAN_API_BASE}/tafsirs/{tafsir_id}/by_chapter/{chapter}"
            f"?per_page=50&page={page}"
        )
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()

        for entry in data.get("tafsirs", []):
            raw = entry.get("text", "")
            if raw:
                result[entry["verse_key"]] = strip_html(raw)

        pagination = data.get("pagination", {})
        if pagination.get("next_page") is None:
            break
        page = pagination["next_page"]
        time.sleep(0.3)  # throttle: be polite to the public API
    return result
613
 
614
+
615
def fetch_qurancom_tafsir(
    surah_numbers: List[int],
) -> Dict[str, Dict[str, str]]:
    """Fallback: fetch tafsir from Quran.com API.

    Fetches the English and Arabic tafsir for each requested surah;
    a failure for one surah is logged and skipped, not fatal.
    """
    print("  Falling back to Quran.com API for tafsir …")
    tafsir_map: Dict[str, Dict[str, str]] = {}

    for surah_num in tqdm(surah_numbers, desc="  Fetching tafsir"):
        try:
            english = _fetch_tafsir_chapter_api(TAFSIR_EN_ID, surah_num)
            time.sleep(0.3)
            arabic = _fetch_tafsir_chapter_api(TAFSIR_AR_ID, surah_num)
            time.sleep(0.3)
        except Exception as exc:
            # Skip this surah entirely; partial EN-only data is never merged.
            print(f"\n  ✗ Surah {surah_num}: {exc}")
            continue

        for vk, text in english.items():
            tafsir_map.setdefault(vk, {})["tafsir_en"] = text
        for vk, text in arabic.items():
            tafsir_map.setdefault(vk, {})["tafsir_ar"] = text

    return tafsir_map
637
+
638
+
639
def enrich_quran_with_tafsir(
    entries: List[Dict],
    force_download: bool = False,
) -> List[Dict]:
    """Attach tafsir_en / tafsir_ar fields to every Quran entry, in place.

    The Kaggle dataset is tried first; if it yields nothing, the Quran.com
    API is used. When neither source produces data the entries are
    returned untouched (without tafsir fields).
    """
    print("\n" + "=" * 60)
    print("Step 3: Enriching Quran with Tafsir")
    print("=" * 60)

    tafsir_map = fetch_kaggle_tafsir(force=force_download)

    if not tafsir_map:
        quran_surahs = sorted(
            {e["surah_number"] for e in entries if e.get("type") == "quran"}
        )
        tafsir_map = fetch_qurancom_tafsir(quran_surahs)

    if not tafsir_map:
        print("  ✗ No tafsir data available")
        return entries

    enriched_count = 0
    for entry in entries:
        if entry.get("type") != "quran":
            continue
        key = f"{entry['surah_number']}:{entry['verse_number']}"
        texts = tafsir_map.get(key, {})
        # Missing tafsir becomes an empty string, not an absent key.
        entry["tafsir_en"] = texts.get("tafsir_en", "")
        entry["tafsir_ar"] = texts.get("tafsir_ar", "")
        if entry["tafsir_en"] or entry["tafsir_ar"]:
            enriched_count += 1

    print(f"  ✓ Enriched {enriched_count:,} verses with tafsir")
    return entries
673
+
674
+
675
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
676
+ # STEP 4: FETCH & BUILD HADITH ENTRIES
677
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
678
+
679
+ def _pick_best_grade(grades: List[Dict]) -> str:
680
+ """Pick the most authoritative grade from a list of scholar grades."""
681
+ priority = ["darussalam", "al-albani", "zubair ali zai"]
682
+ grade_map = {}
683
+ for g in grades:
684
+ name = g.get("name", "").lower()
685
+ grade_text = g.get("grade", "")
686
+ if grade_text:
687
+ grade_map[name] = grade_text
688
+
689
+ for scholar in priority:
690
+ for name, grade in grade_map.items():
691
+ if scholar in name:
692
+ return grade
693
+
694
+ for g in grades:
695
+ if g.get("grade"):
696
+ return g["grade"]
697
+ return ""
698
+
699
+
700
def _fetch_fawaz_grades(
    edition: str, force: bool = False,
) -> Optional[Dict[int, str]]:
    """Fetch {hadith_number: grade} for one fawazahmed0 edition.

    The jsDelivr CDN mirror is tried first, then raw.githubusercontent.com.
    Returns None when neither mirror could be fetched.
    """
    cache_path = CACHE_DIR / "hadith" / "fawazahmed0" / f"{edition}.json"

    data = None
    for base in (FAWAZ_CDN_BASE, FAWAZ_RAW_BASE):
        try:
            data = download_json(
                f"{base}/editions/{edition}.json",
                cache_path=cache_path,
                force=force,
            )
            break
        except Exception:
            continue  # try the next mirror

    if not data:
        return None

    grades: Dict[int, str] = {}
    for hadith in data.get("hadiths", []):
        number = hadith.get("hadithnumber")
        if number is None:
            continue
        scholar_grades = hadith.get("grades", [])
        if scholar_grades:
            grades[int(number)] = _pick_best_grade(scholar_grades)
    return grades
731
+
732
+
733
def fetch_hadith_sources(
    force: bool = False,
) -> Tuple[Dict[str, Dict], Dict[str, Dict[int, str]]]:
    """Download hadith data from AhmedBaset and grades from fawazahmed0.

    Parameters
    ----------
    force : re-download all sources even if cached.

    Returns (ahmedbaset_books, fawaz_grades): source filename -> raw book
    JSON, and collection id-prefix -> {hadith number: grade}.
    """
    print("\n" + "=" * 60)
    print("Step 4a: Fetching Hadith Sources")
    print("=" * 60)

    # AhmedBaset hadith books
    print(" Downloading from AhmedBaset/hadith-json …")
    ahmedbaset_books: Dict[str, Dict] = {}
    for filename in tqdm(HADITH_BOOKS.keys(), desc=" Books"):
        try:
            # BUG FIX: the URL must interpolate the book filename — the
            # previous literal placeholder produced a dead URL.
            url = f"{AHMEDBASET_BASE_URL}/{filename}"
            data = download_json(
                url,
                cache_path=CACHE_DIR / "hadith" / "ahmedbaset" / filename,
                force=force,
            )
            ahmedbaset_books[filename] = data
        except Exception as exc:
            # BUG FIX: report which book failed, not a placeholder.
            print(f"\n ✗ {filename}: {exc}")
    print(f" ✓ Loaded {len(ahmedbaset_books)} books")

    # fawazahmed0 editions (for grades)
    print(" Downloading grade data from fawazahmed0/hadith-api …")
    fawaz_grades: Dict[str, Dict[int, str]] = {}
    for prefix, edition in tqdm(FAWAZ_EDITION_MAP.items(), desc=" Editions"):
        grades = _fetch_fawaz_grades(edition, force)
        if grades:
            fawaz_grades[prefix] = grades
    print(f" ✓ Loaded grades for {len(fawaz_grades)} collections")

    return ahmedbaset_books, fawaz_grades
770
+
771
+
772
def build_hadith_entries(
    ahmedbaset_books: Dict[str, Dict],
    fawaz_grades: Dict[str, Dict[int, str]],
) -> List[Dict]:
    """Merge AhmedBaset hadith texts with fawazahmed0 grades.

    Parameters
    ----------
    ahmedbaset_books : source filename -> raw book JSON.
    fawaz_grades : collection id-prefix -> {hadith number: grade text}.

    Returns a flat list of hadith entry dicts ready for metadata.json.
    """
    print("\n" + "=" * 60)
    print("Step 4b: Building Hadith Entries")
    print("=" * 60)

    entries: List[Dict] = []
    stats: Dict[str, int] = defaultdict(int)

    for filename, book_config in HADITH_BOOKS.items():
        book_data = ahmedbaset_books.get(filename)
        if not book_data:
            # BUG FIX: name the skipped book instead of a placeholder.
            print(f" ✗ Skipping {filename} (not downloaded)")
            continue

        prefix = book_config["id_prefix"]
        grades = fawaz_grades.get(prefix, {})
        hadiths = book_data.get("hadiths", [])
        # Chapter id -> Arabic chapter title, for per-hadith chapter names.
        chapter_map = {
            ch.get("id"): ch.get("arabic", "")
            for ch in book_data.get("chapters", [])
        }

        for hadith in hadiths:
            hadith_num = hadith.get("idInBook", hadith.get("id", ""))

            # English text: either a {narrator, text} dict or a plain string.
            if isinstance(hadith.get("english"), dict):
                parts = []
                if hadith["english"].get("narrator"):
                    parts.append(hadith["english"]["narrator"])
                if hadith["english"].get("text"):
                    parts.append(hadith["english"]["text"])
                english = " ".join(parts)
            else:
                english = str(hadith.get("english", ""))

            # Chapter name
            chapter_name = ""
            if "chapterId" in hadith:
                chapter_name = chapter_map.get(hadith["chapterId"], "")

            # Grade from fawazahmed0 — keyed by int hadith number.
            # BUG FIX: skip the lookup for non-numeric ids instead of
            # letting int() raise and abort the whole build.
            grade = ""
            if hadith_num:
                try:
                    grade = grades.get(int(hadith_num), "")
                except (TypeError, ValueError):
                    grade = ""

            entries.append(
                {
                    "id": f"{prefix}_{hadith_num}",
                    "arabic": hadith.get("arabic", ""),
                    "english": english,
                    "reference": f"{book_config['collection']} {hadith_num}",
                    "hadith_number": hadith_num,
                    "collection": book_config["collection"],
                    "chapter": chapter_name,
                    "grade": grade,
                    "type": "hadith",
                    "author": book_config["author"],
                }
            )
            stats[book_config["collection"]] += 1

    print(f" ✓ Built {len(entries):,} hadith entries")
    print("\n Breakdown:")
    for collection, count in sorted(stats.items()):
        print(f" {collection}: {count:,}")

    graded = sum(1 for e in entries if e.get("grade"))
    print(f"\n Hadiths with grades: {graded:,} / {len(entries):,}")
    return entries
846
+
847
+
848
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
849
+ # STEP 5: GENERATE METADATA
850
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
851
+
852
def generate_metadata(
    quran_entries: List[Dict],
    hadith_entries: List[Dict],
) -> List[Dict]:
    """Concatenate Quran and hadith entries and persist metadata.json.

    Returns the combined document list (Quran entries first) so the
    caller can feed it straight into the index builder.
    """
    print("\n" + "=" * 60)
    print("Step 5: Generating metadata.json")
    print("=" * 60)

    documents = quran_entries + hadith_entries

    print(f" Quran entries: {len(quran_entries):,}")
    print(f" Hadith entries: {len(hadith_entries):,}")
    print(f" Total: {len(documents):,}")

    # Sanity check: duplicate IDs would break downstream lookups.
    ids = [d["id"] for d in documents]
    unique_count = len(set(ids))
    if unique_count != len(ids):
        print(f" ⚠ Warning: {len(ids) - unique_count} duplicate IDs found")

    print(f" Writing to {METADATA_PATH} …")
    with open(METADATA_PATH, "w", encoding="utf-8") as f:
        json.dump(documents, f, ensure_ascii=False, indent=2)

    size_mb = METADATA_PATH.stat().st_size / (1024 * 1024)
    print(f" ✓ File size: {size_mb:.2f} MB")
    return documents
880
+
881
+
882
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
883
+ # STEP 6: BUILD FAISS INDEX
884
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
885
+
886
def build_faiss_index(
    documents: List[Dict],
    model_name: str = DEFAULT_EMBED_MODEL,
):
    """Embed every document and persist an L2-normalised FAISS index.

    Quran docs are embedded as Arabic + English + a truncated English
    tafsir snippet; hadith docs are prefixed with their collection name.
    Cosine similarity is realised via IndexFlatIP over normalised vectors.
    """
    print("\n" + "=" * 60)
    print("Step 6: Building FAISS Index")
    print("=" * 60)

    print(f" Loading embedding model: {model_name}")
    model = SentenceTransformer(model_name)
    embedding_dim = model.get_sentence_embedding_dimension()
    print(f" Embedding dimension: {embedding_dim}")

    def _doc_text(doc: Dict) -> str:
        """Compose the text that represents one document in the index."""
        if doc.get("type") == "quran":
            # Include truncated tafsir for richer semantic matching.
            snippet = doc.get("tafsir_en", "")[:500]
            return (
                f"{doc.get('arabic', '')} {doc.get('english', '')} "
                f"{snippet}"
            ).strip()
        return (
            f"{doc.get('collection', '')} "
            f"{doc.get('arabic', '')} "
            f"{doc.get('english', '')}"
        ).strip()

    all_texts = [_doc_text(doc) for doc in documents]

    print(f"\n Generating embeddings for {len(all_texts):,} documents …")

    all_embeddings = []
    for start in tqdm(
        range(0, len(all_texts), EMBED_BATCH_SIZE),
        desc=" Embedding batches",
    ):
        chunk = all_texts[start : start + EMBED_BATCH_SIZE]
        all_embeddings.extend(model.encode(chunk, convert_to_numpy=True))

    embeddings = np.array(all_embeddings, dtype=np.float32)
    print(f" Embeddings shape: {embeddings.shape}")

    print("\n Creating FAISS index (IndexFlatIP + L2 normalization) …")
    index = faiss.IndexFlatIP(embedding_dim)
    # Normalising both stored and query vectors makes inner product
    # equivalent to cosine similarity.
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    print(f" Saving to {INDEX_PATH}")
    faiss.write_index(index, str(INDEX_PATH))

    size_mb = INDEX_PATH.stat().st_size / (1024 * 1024)
    print(f"\n {'=' * 50}")
    print(f" Index Build Complete")
    print(f" {'=' * 50}")
    print(f" Documents indexed: {index.ntotal:,}")
    print(f" Index file size: {size_mb:.2f} MB")
+
947
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
948
+ # CLI
949
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
950
+
951
def main():
    """CLI entry point for the dataset + index build pipeline."""
    parser = argparse.ArgumentParser(
        description=(
            "QModel Dataset Builder v2 — builds metadata.json and "
            "QModel.index from scratch using multiple authoritative sources"
        ),
    )
    parser.add_argument(
        "--index-only",
        action="store_true",
        help="Only build FAISS index from existing metadata.json",
    )
    parser.add_argument(
        "--data-only",
        action="store_true",
        help="Only generate metadata.json, skip index building",
    )
    parser.add_argument(
        "--skip-tafsir",
        action="store_true",
        help="Skip tafsir enrichment",
    )
    parser.add_argument(
        "--force-download",
        action="store_true",
        help="Re-download all sources even if cached",
    )
    parser.add_argument(
        "--model",
        default=DEFAULT_EMBED_MODEL,
        help=f"Sentence-transformer model for embeddings (default: {DEFAULT_EMBED_MODEL})",
    )
    args = parser.parse_args()

    # ── index-only: skip all data fetching ─────────────────────────
    if args.index_only:
        print("Loading existing metadata.json …")
        with open(METADATA_PATH, "r", encoding="utf-8") as f:
            documents = json.load(f)
        build_faiss_index(documents, model_name=args.model)
        print("\n✓ Done!")
        return

    force = args.force_download

    # Steps 1-2: fetch Quran sources and assemble verse entries.
    cdn_chapters, quran_data, chapter_meta, sem_translations = fetch_quran_sources(force=force)
    quran_entries = build_quran_entries(cdn_chapters, quran_data, chapter_meta, sem_translations)

    # Step 3: optional tafsir enrichment.
    if args.skip_tafsir:
        print("\nSkipping tafsir enrichment (--skip-tafsir)")
    else:
        quran_entries = enrich_quran_with_tafsir(
            quran_entries, force_download=force,
        )

    # Step 4: fetch and merge hadith entries.
    ahmedbaset_books, fawaz_grades = fetch_hadith_sources(force=force)
    hadith_entries = build_hadith_entries(ahmedbaset_books, fawaz_grades)

    # Step 5: persist combined metadata.json.
    documents = generate_metadata(quran_entries, hadith_entries)

    # Step 6: optional FAISS index build.
    if args.data_only:
        print("\nSkipping index build (--data-only)")
    else:
        build_faiss_index(documents, model_name=args.model)

    print("\n✓ Done!")
 
 
 
 
 
 
 
1024
 
1025
 
1026
if __name__ == "__main__":  # script entry point
    main()
main.py CHANGED
@@ -270,7 +270,7 @@ rewrite_cache = TTLCache(maxsize=cfg.CACHE_SIZE, ttl=cfg.CACHE_TTL * 6)
270
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
271
  # ARABIC NLP โ€” normalisation + light stemming
272
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
273
- _DIACRITICS = re.compile(r"[\u064B-\u0655\u0656-\u0658\u0670\u0671\u06D6-\u06ED]")
274
  _ALEF_VARS = re.compile(r"[ุฃุฅุขูฑ]")
275
  _WAW_HAMZA = re.compile(r"ุค")
276
  _YA_HAMZA = re.compile(r"ุฆ")
@@ -374,18 +374,25 @@ Reply ONLY with a valid JSON object โ€” no markdown, no preamble:
374
  "ar_query": "<query in clear Arabic ูุตุญู‰, โ‰ค25 words>",
375
  "en_query": "<query in clear English, โ‰ค25 words>",
376
  "keywords": ["<3-7 key Arabic or English terms from the question>"],
377
- "intent": "<one of: fatwa | tafsir | hadith | count | auth | general>"
378
  }
379
 
380
  Intent Detection Rules (CRITICAL):
381
- - 'count' intent = asking for number/frequency (ูƒู… ู…ุฑุฉ, how many times, count occurrences)
 
 
 
382
  - 'auth' intent = asking about authenticity (ุตุญูŠุญุŸ, ู‡ู„ ุตุญูŠุญ, is it authentic, verify hadith grade)
383
  - 'hadith' intent = asking about specific hadith meaning/text (not authenticity)
384
  - 'tafsir' intent = asking about Quranic verses or Islamic ruling (fatwa)
385
  - 'general' intent = other questions
386
 
387
  Examples:
388
- - "ูƒู… ู…ุฑุฉ ุฐููƒุฑุช ูƒู„ู…ุฉ ู…ุฑูŠู…" โ†’ intent: count
 
 
 
 
389
  - "ู‡ู„ ุญุฏูŠุซ ุฅู†ู…ุง ุงู„ุฃุนู…ุงู„ ุจุงู„ู†ูŠุงุช ุตุญูŠุญ" โ†’ intent: auth (asking if authentic!)
390
  - "ู…ุง ู…ุนู†ู‰ ุญุฏูŠุซ ุฅู†ู…ุง ุงู„ุฃุนู…ุงู„" โ†’ intent: hadith
391
  - "ู…ุง ุญูƒู… ุงู„ุฑุจุง ููŠ ุงู„ุฅุณู„ุงู…" โ†’ intent: fatwa
@@ -445,11 +452,116 @@ _AUTH_AR = re.compile(
445
  r"(ุตุญูŠุญ|ุญุณู†|ุถุนูŠู|ุฏุฑุฌุฉ|ุตุญุฉ|ุชุตุญูŠุญ|ู‡ู„.*ุตุญูŠุญ|ู‡ู„.*ุถุนูŠู)"
446
  )
447
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
 
449
  async def detect_analysis_intent(query: str, rewrite: Dict) -> Optional[str]:
450
  """Detect if query is asking for word frequency analysis."""
 
 
 
 
 
 
451
  if rewrite.get("intent") == "count":
452
  kws = rewrite.get("keywords", [])
 
 
 
 
453
  return kws[0] if kws else None
454
 
455
  if not (_COUNT_EN.search(query) or _COUNT_AR.search(query)):
@@ -572,7 +684,7 @@ async def hybrid_search(
572
  seen: set = set()
573
  candidates = []
574
  for dist, idx in zip(distances[0], indices[0]):
575
- item_idx = int(idx) // 2
576
  if item_idx not in seen and 0 <= item_idx < len(dataset):
577
  seen.add(item_idx)
578
  item = dataset[item_idx]
@@ -703,6 +815,13 @@ _TASK_INSTRUCTIONS: Dict[str, str] = {
703
  "2. List example occurrences with Surah names.\n"
704
  "3. Comment on significance."
705
  ),
 
 
 
 
 
 
 
706
  "general": (
707
  "The user has a general Islamic question. Steps:\n"
708
  "1. Give a direct answer first.\n"
@@ -753,8 +872,21 @@ def build_messages(
753
  lang: str,
754
  intent: str,
755
  analysis: Optional[dict] = None,
 
756
  ) -> List[dict]:
757
  """Build system and user messages for LLM."""
 
 
 
 
 
 
 
 
 
 
 
 
758
  if analysis:
759
  by_surah_str = "\n ".join([
760
  f"Surah {s}: {data['name']} ({data['count']} times)"
@@ -1016,7 +1148,8 @@ async def run_rag_pipeline(
1016
  rewrite = await rewrite_query(question, state.llm)
1017
  intent = rewrite.get("intent", "general")
1018
 
1019
- # 2. Intent detection + hybrid search โ€” concurrently
 
1020
  kw_task, search_task = (
1021
  detect_analysis_intent(question, rewrite),
1022
  hybrid_search(
@@ -1025,11 +1158,26 @@ async def run_rag_pipeline(
1025
  top_k, source_type, grade_filter,
1026
  ),
1027
  )
1028
- analysis_kw, results = await asyncio.gather(kw_task, search_task)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1029
 
1030
- # 3. Keyword frequency count (if needed)
1031
  analysis = None
1032
- if analysis_kw:
1033
  analysis = await count_occurrences(analysis_kw, state.dataset)
1034
  logger.info("Analysis: kw=%s count=%d", analysis_kw, analysis["total_count"])
1035
 
@@ -1042,8 +1190,8 @@ async def run_rag_pipeline(
1042
  intent, top_score, cfg.CONFIDENCE_THRESHOLD,
1043
  )
1044
 
1045
- # 5. Confidence gate
1046
- if top_score < cfg.CONFIDENCE_THRESHOLD:
1047
  logger.warning(
1048
  "Low confidence (%.3f < %.2f) โ€” returning safe fallback",
1049
  top_score, cfg.CONFIDENCE_THRESHOLD,
@@ -1060,7 +1208,7 @@ async def run_rag_pipeline(
1060
 
1061
  # 6. Build context + prompt + LLM call
1062
  context = build_context(results)
1063
- messages = build_messages(context, question, lang, intent, analysis)
1064
 
1065
  try:
1066
  answer = await state.llm.chat(
 
270
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
271
  # ARABIC NLP โ€” normalisation + light stemming
272
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
273
+ _DIACRITICS = re.compile(r"[\u064B-\u0655\u0656-\u0658\u0670\u06D6-\u06ED]")
274
  _ALEF_VARS = re.compile(r"[ุฃุฅุขูฑ]")
275
  _WAW_HAMZA = re.compile(r"ุค")
276
  _YA_HAMZA = re.compile(r"ุฆ")
 
374
  "ar_query": "<query in clear Arabic ูุตุญู‰, โ‰ค25 words>",
375
  "en_query": "<query in clear English, โ‰ค25 words>",
376
  "keywords": ["<3-7 key Arabic or English terms from the question>"],
377
+ "intent": "<one of: fatwa | tafsir | hadith | count | surah_info | auth | general>"
378
  }
379
 
380
  Intent Detection Rules (CRITICAL):
381
+ - 'surah_info' intent = asking about surah metadata: verse count, revelation type, surah number
382
+ (ูƒู… ุนุฏุฏ ุขูŠุงุช ุณูˆุฑุฉ, ูƒู… ุขูŠุฉ ููŠ ุณูˆุฑุฉ, how many verses in surah, is surah X meccan/medinan)
383
+ - 'count' intent = asking for WORD frequency/occurrence count (ูƒู… ู…ุฑุฉ ุฐููƒุฑุช ูƒู„ู…ุฉ, how many times is word X mentioned)
384
+ NOTE: "ูƒู… ุนุฏุฏ ุขูŠุงุช ุณูˆุฑุฉ" is surah_info NOT count!
385
  - 'auth' intent = asking about authenticity (ุตุญูŠุญุŸ, ู‡ู„ ุตุญูŠุญ, is it authentic, verify hadith grade)
386
  - 'hadith' intent = asking about specific hadith meaning/text (not authenticity)
387
  - 'tafsir' intent = asking about Quranic verses or Islamic ruling (fatwa)
388
  - 'general' intent = other questions
389
 
390
  Examples:
391
+ - "ูƒู… ุนุฏุฏ ุขูŠุงุช ุณูˆุฑุฉ ุขู„ ุนู…ุฑุงู†" โ†’ intent: surah_info (asking about surah metadata!)
392
+ - "ูƒู… ุขูŠุฉ ููŠ ุณูˆุฑุฉ ุงู„ุจู‚ุฑุฉ" โ†’ intent: surah_info
393
+ - "how many verses in surah al-baqara" โ†’ intent: surah_info
394
+ - "ู‡ู„ ุณูˆุฑุฉ ุงู„ูุงุชุญุฉ ู…ูƒูŠุฉ ุฃู… ู…ุฏู†ูŠุฉ" โ†’ intent: surah_info
395
+ - "ูƒู… ู…ุฑุฉ ุฐููƒุฑุช ูƒู„ู…ุฉ ู…ุฑูŠู…" โ†’ intent: count (asking about WORD frequency!)
396
  - "ู‡ู„ ุญุฏูŠุซ ุฅู†ู…ุง ุงู„ุฃุนู…ุงู„ ุจุงู„ู†ูŠุงุช ุตุญูŠุญ" โ†’ intent: auth (asking if authentic!)
397
  - "ู…ุง ู…ุนู†ู‰ ุญุฏูŠุซ ุฅู†ู…ุง ุงู„ุฃุนู…ุงู„" โ†’ intent: hadith
398
  - "ู…ุง ุญูƒู… ุงู„ุฑุจุง ููŠ ุงู„ุฅุณู„ุงู…" โ†’ intent: fatwa
 
452
  r"(ุตุญูŠุญ|ุญุณู†|ุถุนูŠู|ุฏุฑุฌุฉ|ุตุญุฉ|ุชุตุญูŠุญ|ู‡ู„.*ุตุญูŠุญ|ู‡ู„.*ุถุนูŠู)"
453
  )
454
 
455
+ # โ”€โ”€ Surah metadata queries (verse count, revelation type, etc.) โ”€โ”€โ”€โ”€โ”€โ”€โ”€
456
+ _SURAH_VERSES_AR = re.compile(
457
+ r"ูƒู…\s+(?:ุนุฏุฏ\s+)?ุขูŠุงุช?\s*(?:ููŠ\s+|ูู‰\s+)?(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)"
458
+ r"|ุนุฏุฏ\s+ุขูŠุงุช?\s+(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)"
459
+ r"|ูƒู…\s+ุขูŠุฉ\s+(?:ููŠ|ูู‰)\s+(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)"
460
+ r"|(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)\s+[\u0600-\u06FF\s]+\s+(?:ูƒู…\s+ุขูŠุฉ|ุนุฏุฏ\s+ุขูŠุงุช?)"
461
+ )
462
+ _SURAH_VERSES_EN = re.compile(
463
+ r"(?:how many|number of)\s+(?:verses?|ayat|ayahs?)\s+(?:in|of|does)\b"
464
+ r"|\bsurah?\b.*\b(?:how many|number of)\s+(?:verses?|ayat|ayahs?)",
465
+ re.I,
466
+ )
467
+ _SURAH_TYPE_AR = re.compile(
468
+ r"(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)\s+[\u0600-\u06FF\s]+\s+(?:ู…ูƒูŠุฉ|ู…ุฏู†ูŠุฉ|ู…ูƒูŠ|ู…ุฏู†ูŠ)"
469
+ r"|(?:ู‡ู„|ู…ุง\s+ู†ูˆุน)\s+(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)\s+[\u0600-\u06FF\s]+\s+(?:ู…ูƒูŠุฉ|ู…ุฏู†ูŠุฉ)"
470
+ )
471
+ _SURAH_NAME_AR = re.compile(
472
+ r"(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)\s+([\u0600-\u06FF\u0750-\u077F\s]+)"
473
+ )
474
+ _SURAH_NAME_EN = re.compile(
475
+ r"\bsurah?\s+([a-zA-Z'\-]+(?:[\s\-][a-zA-Z'\-]+)*)",
476
+ re.I,
477
+ )
478
+
479
+
480
def _extract_surah_name(query: str) -> Optional[str]:
    """Pull the surah name out of a query string, or None when absent.

    Tries the Arabic name pattern first, then English; trailing
    punctuation and dangling question words are stripped from the
    captured name before it is returned.
    """
    for pattern in (_SURAH_NAME_AR, _SURAH_NAME_EN):
        match = pattern.search(query)
        if not match:
            continue
        candidate = match.group(1).strip()
        # Drop trailing punctuation, then any dangling question word.
        candidate = re.sub(r'[\s؟?!]+$', '', candidate)
        candidate = re.sub(r'\s+(كم|عدد|هل|ما|في|فى)$', '', candidate)
        if candidate:
            return candidate
    return None
492
+
493
+
494
async def detect_surah_info(query: str, rewrite: dict) -> Optional[dict]:
    """Detect whether *query* asks about surah metadata.

    Returns {"surah_query": <name>, "query_type": "verses"|"type"} when a
    surah-metadata question is detected and a surah name can be
    extracted from the query; otherwise None.
    """
    asks_verses = bool(
        _SURAH_VERSES_AR.search(query) or _SURAH_VERSES_EN.search(query)
    )
    asks_type = bool(_SURAH_TYPE_AR.search(query))

    if not asks_verses and not asks_type:
        # Regexes missed — fall back to the LLM rewrite's intent.
        intent = rewrite.get("intent")
        if intent == "surah_info":
            asks_verses = True
        elif intent == "count":
            joined = " ".join(rewrite.get("keywords", []))
            if not any(w in joined for w in ("آيات", "آية", "verses", "ayat")):
                return None
            asks_verses = True
        else:
            return None

    name = _extract_surah_name(query)
    if not name:
        return None

    return {
        "surah_query": name,
        "query_type": "verses" if asks_verses else "type",
    }
520
+
521
+
522
async def lookup_surah_info(surah_query: str, dataset: list) -> Optional[dict]:
    """Resolve a surah name to its metadata using the loaded dataset.

    Matches the query against Arabic, English, and transliterated surah
    names on Quran entries, ignoring the definite article ("ال" / "al-")
    on either side.  Returns the first matching entry's surah metadata,
    or None when no name field matches.
    """
    query_norm = normalize_arabic(surah_query, aggressive=True).lower()
    query_clean = re.sub(r"^(ال|al[\-\s']*)", "", query_norm, flags=re.I).strip()

    name_fields = ("surah_name_ar", "surah_name_en", "surah_name_transliteration")

    for item in dataset:
        if item.get("type") != "quran":
            continue
        for field in name_fields:
            val = item.get(field, "")
            if not val:
                continue
            val_norm = normalize_arabic(val, aggressive=True).lower()
            val_clean = re.sub(r"^(ال|al[\-\s']*)", "", val_norm, flags=re.I).strip()
            # Three match tiers: raw substring, article-stripped substring,
            # and stripped-query-in-raw-name.
            direct = query_norm in val_norm or val_norm in query_norm
            stripped = bool(
                query_clean and val_clean
                and (query_clean in val_clean or val_clean in query_clean)
            )
            partial = bool(query_clean and query_clean in val_norm)
            if direct or stripped or partial:
                return {
                    "surah_number": item.get("surah_number"),
                    "surah_name_ar": item.get("surah_name_ar", ""),
                    "surah_name_en": item.get("surah_name_en", ""),
                    "surah_name_transliteration": item.get("surah_name_transliteration", ""),
                    "total_verses": item.get("total_verses"),
                    "revelation_type": item.get("revelation_type", ""),
                }
    return None
549
+
550
 
551
  async def detect_analysis_intent(query: str, rewrite: Dict) -> Optional[str]:
552
  """Detect if query is asking for word frequency analysis."""
553
+ # Skip surah metadata queries โ€” those are handled by detect_surah_info
554
+ if (_SURAH_VERSES_AR.search(query) or _SURAH_VERSES_EN.search(query)
555
+ or _SURAH_TYPE_AR.search(query)
556
+ or rewrite.get("intent") == "surah_info"):
557
+ return None
558
+
559
  if rewrite.get("intent") == "count":
560
  kws = rewrite.get("keywords", [])
561
+ # Skip if keywords suggest surah metadata, not word frequency
562
+ kw_text = " ".join(kws)
563
+ if any(w in kw_text for w in ("ุขูŠุงุช", "ุขูŠุฉ", "verses", "ayat")):
564
+ return None
565
  return kws[0] if kws else None
566
 
567
  if not (_COUNT_EN.search(query) or _COUNT_AR.search(query)):
 
684
  seen: set = set()
685
  candidates = []
686
  for dist, idx in zip(distances[0], indices[0]):
687
+ item_idx = int(idx)
688
  if item_idx not in seen and 0 <= item_idx < len(dataset):
689
  seen.add(item_idx)
690
  item = dataset[item_idx]
 
815
  "2. List example occurrences with Surah names.\n"
816
  "3. Comment on significance."
817
  ),
818
+ "surah_info": (
819
+ "The user asks about surah metadata. Steps:\n"
820
+ "1. State the answer from the SURAH INFORMATION block EXACTLY.\n"
821
+ "2. Use the total_verses number precisely โ€” do NOT guess or calculate.\n"
822
+ "3. Mention the revelation type (Meccan/Medinan) if available.\n"
823
+ "4. Optionally add brief scholarly context about the surah."
824
+ ),
825
  "general": (
826
  "The user has a general Islamic question. Steps:\n"
827
  "1. Give a direct answer first.\n"
 
872
  lang: str,
873
  intent: str,
874
  analysis: Optional[dict] = None,
875
+ surah_info: Optional[dict] = None,
876
  ) -> List[dict]:
877
  """Build system and user messages for LLM."""
878
+ if surah_info:
879
+ info_block = (
880
+ f"\n[SURAH INFORMATION]\n"
881
+ f"Surah Name (Arabic): {surah_info['surah_name_ar']}\n"
882
+ f"Surah Name (English): {surah_info['surah_name_en']}\n"
883
+ f"Surah Number: {surah_info['surah_number']}\n"
884
+ f"Total Verses: {surah_info['total_verses']}\n"
885
+ f"Revelation Type: {surah_info['revelation_type']}\n"
886
+ f"Transliteration: {surah_info['surah_name_transliteration']}\n"
887
+ )
888
+ context = info_block + context
889
+
890
  if analysis:
891
  by_surah_str = "\n ".join([
892
  f"Surah {s}: {data['name']} ({data['count']} times)"
 
1148
  rewrite = await rewrite_query(question, state.llm)
1149
  intent = rewrite.get("intent", "general")
1150
 
1151
+ # 2. Surah info detection + analysis intent + hybrid search โ€” concurrently
1152
+ surah_task = detect_surah_info(question, rewrite)
1153
  kw_task, search_task = (
1154
  detect_analysis_intent(question, rewrite),
1155
  hybrid_search(
 
1158
  top_k, source_type, grade_filter,
1159
  ),
1160
  )
1161
+ surah_det, analysis_kw, results = await asyncio.gather(
1162
+ surah_task, kw_task, search_task,
1163
+ )
1164
+
1165
+ # 3a. Surah metadata lookup (if detected)
1166
+ surah_info = None
1167
+ if surah_det:
1168
+ surah_info = await lookup_surah_info(surah_det["surah_query"], state.dataset)
1169
+ if surah_info:
1170
+ intent = "surah_info"
1171
+ logger.info(
1172
+ "Surah info: %s โ†’ %s (%d verses)",
1173
+ surah_det["surah_query"],
1174
+ surah_info["surah_name_en"],
1175
+ surah_info.get("total_verses", 0),
1176
+ )
1177
 
1178
+ # 3b. Keyword frequency count (if needed and NOT a surah info query)
1179
  analysis = None
1180
+ if analysis_kw and not surah_info:
1181
  analysis = await count_occurrences(analysis_kw, state.dataset)
1182
  logger.info("Analysis: kw=%s count=%d", analysis_kw, analysis["total_count"])
1183
 
 
1190
  intent, top_score, cfg.CONFIDENCE_THRESHOLD,
1191
  )
1192
 
1193
+ # 5. Confidence gate โ€” skip for surah_info (metadata is from dataset, not search)
1194
+ if not surah_info and top_score < cfg.CONFIDENCE_THRESHOLD:
1195
  logger.warning(
1196
  "Low confidence (%.3f < %.2f) โ€” returning safe fallback",
1197
  top_score, cfg.CONFIDENCE_THRESHOLD,
 
1208
 
1209
  # 6. Build context + prompt + LLM call
1210
  context = build_context(results)
1211
+ messages = build_messages(context, question, lang, intent, analysis, surah_info)
1212
 
1213
  try:
1214
  answer = await state.llm.chat(