aelgendy committed on
Commit
4d5fcc9
ยท
1 Parent(s): c580971

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +21 -0
  2. build_index.py +993 -45
  3. main.py +160 -12
README.md CHANGED
@@ -1,3 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # QModel v4 โ€” Islamic RAG System
2
  **Specialized Qur'an & Hadith Knowledge System with Dual LLM Support**
3
 
 
1
+ ---
2
+ title: QModel
3
+ emoji: ๐Ÿ•Œ
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: docker
7
+ app_port: 8000
8
+ license: mit
9
+ tags:
10
+ - quran
11
+ - hadith
12
+ - islamic
13
+ - rag
14
+ - faiss
15
+ - nlp
16
+ - arabic
17
+ language:
18
+ - ar
19
+ - en
20
+ ---
21
+
22
  # QModel v4 โ€” Islamic RAG System
23
  **Specialized Qur'an & Hadith Knowledge System with Dual LLM Support**
24
 
build_index.py CHANGED
@@ -1,79 +1,1027 @@
1
  #!/usr/bin/env python3
2
  """
3
- Regenerate FAISS index with enriched metadata.
4
- This script loads the enriched metadata and generates embeddings for all documents.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  """
6
 
7
  import json
 
 
 
 
 
8
  import numpy as np
9
  from pathlib import Path
 
 
 
10
  import faiss
 
11
  from sentence_transformers import SentenceTransformer
12
  from tqdm import tqdm
13
 
14
- def generate_embeddings(model_name: str = "intfloat/multilingual-e5-large"):
15
- """Generate embeddings for all documents in metadata.json"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- metadata_path = Path("/Users/elgendy/Projects/QModel/metadata.json")
18
- index_path = Path("/Users/elgendy/Projects/QModel/QModel.index")
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- # Load metadata
21
- print("Loading metadata...")
22
- with open(metadata_path, 'r', encoding='utf-8') as f:
23
- documents = json.load(f)
24
 
25
- print(f"Total documents: {len(documents)}")
 
 
 
 
 
26
 
27
- # Load embedding model
28
- print(f"\nLoading embedding model: {model_name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  model = SentenceTransformer(model_name)
30
  embedding_dim = model.get_sentence_embedding_dimension()
31
- print(f"Embedding dimension: {embedding_dim}")
32
 
33
- # Prepare texts for embedding
34
- all_texts = []
35
  for doc in documents:
36
  if doc.get("type") == "quran":
37
- # For Quran: use Tafseer/meaning + Sura name
38
- text = f"{doc.get('surah_name_en', '')} {doc.get('english', '')}"
 
 
 
 
39
  else: # hadith
40
- # For Hadith: use collection + Arabic text (for better semantic matching)
41
- text = f"{doc.get('collection', '')} {doc.get('arabic', '')} {doc.get('english', '')}"
42
-
 
 
43
  all_texts.append(text.strip())
44
 
45
- # Generate embeddings in batches for efficiency
46
- print(f"\nGenerating embeddings for {len(all_texts)} documents...")
47
- batch_size = 32
48
  all_embeddings = []
49
-
50
- for i in tqdm(range(0, len(all_texts), batch_size), desc="Embedding batches"):
51
- batch_texts = all_texts[i:i + batch_size]
52
- batch_embeddings = model.encode(batch_texts, convert_to_numpy=True)
53
- all_embeddings.extend(batch_embeddings)
 
 
54
 
55
  embeddings = np.array(all_embeddings, dtype=np.float32)
56
- print(f"Generated embeddings shape: {embeddings.shape}")
57
 
58
- # Create FAISS index
59
- print("\nCreating FAISS index...")
60
- index = faiss.IndexFlatIP(embedding_dim) # Inner product (cosine on normalized)
61
  faiss.normalize_L2(embeddings)
62
  index.add(embeddings)
63
 
64
- # Save index
65
- print(f"Saving FAISS index to {index_path}")
66
- faiss.write_index(index, str(index_path))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- print(f"\n{'='*60}")
69
- print("Index Generation Complete")
70
- print(f"{'='*60}")
71
- print(f"Documents indexed: {len(documents)}")
72
- print(f"Embeddings generated: {len(all_embeddings)}")
73
- print(f"Index file size: {index_path.stat().st_size / (1024*1024):.2f} MB")
74
- print(f"Index capacity: {index.ntotal}")
75
- print(f"{'='*60}")
76
 
77
 
78
  if __name__ == "__main__":
79
- generate_embeddings()
 
1
  #!/usr/bin/env python3
2
  """
3
+ QModel Dataset Builder v2
4
+ =========================
5
+ Builds metadata.json and QModel.index from scratch using multiple
6
+ authoritative sources.
7
+
8
+ Data Sources:
9
+ Quran:
10
+ - risan/quran-json (Arabic text + English translation + chapter metadata)
11
+ - semarketir/quranjson (verse transliteration)
12
+ Tafsir:
13
+ - Kaggle tafseer dataset (primary tafsir enrichment)
14
+ - Quran.com API (fallback tafsir enrichment)
15
+ Hadith:
16
+ - AhmedBaset/hadith-json (9 books: Arabic + English, chapter structure)
17
+ - fawazahmed0/hadith-api (grade information from scholars)
18
+
19
+ Usage:
20
+ python build_index.py # full build from scratch
21
+ python build_index.py --force-download # re-download all sources
22
+ python build_index.py --data-only # generate metadata.json, skip index
23
+ python build_index.py --index-only # build index from existing metadata.json
24
+ python build_index.py --skip-tafsir # skip tafsir enrichment
25
  """
26
 
27
  import json
28
+ import os
29
+ import re
30
+ import time
31
+ import argparse
32
+ import zipfile
33
  import numpy as np
34
  from pathlib import Path
35
+ from collections import defaultdict
36
+ from typing import Any, Dict, List, Optional, Tuple
37
+
38
  import faiss
39
+ import requests
40
  from sentence_transformers import SentenceTransformer
41
  from tqdm import tqdm
42
 
43
+ # โ”€โ”€ Paths โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
44
+ BASE_DIR = Path(__file__).resolve().parent
45
+ CACHE_DIR = BASE_DIR / "data" / "cache"
46
+ METADATA_PATH = BASE_DIR / "metadata.json"
47
+ INDEX_PATH = BASE_DIR / "QModel.index"
48
+
49
+ # โ”€โ”€ Quran source URLs โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
50
+ QURAN_JSON_URL = (
51
+ "https://raw.githubusercontent.com/risan/quran-json/main/data/quran.json"
52
+ )
53
+ CHAPTERS_EN_URL = (
54
+ "https://raw.githubusercontent.com/risan/quran-json/main/data/chapters/en.json"
55
+ )
56
+ SEMARKETIR_SURAH_URL_TPL = (
57
+ "https://raw.githubusercontent.com/semarketir/quranjson"
58
+ "/master/source/surah/surah_{n}.json"
59
+ )
60
+ SEMARKETIR_TRANSLATION_URL_TPL = (
61
+ "https://raw.githubusercontent.com/semarketir/quranjson"
62
+ "/master/source/translation/en/en_translation_{n}.json"
63
+ )
64
+ # CDN dist per-chapter English (Arabic + English + transliteration)
65
+ CDN_CHAPTER_EN_URL_TPL = (
66
+ "https://cdn.jsdelivr.net/npm/quran-json@3.1.2/dist/chapters/en/{n}.json"
67
+ )
68
+
69
+ # โ”€โ”€ Tafsir sources โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
70
+ KAGGLE_TAFSIR_URL = (
71
+ "https://www.kaggle.com/api/v1/datasets/download/"
72
+ "abdelrahmanahmed110/quranic-ayahs-with-tafseer-json-dataset"
73
+ )
74
+ # Fallback: Quran.com API
75
+ QURAN_API_BASE = "https://api.quran.com/api/v4"
76
+ TAFSIR_EN_ID = 169 # Ibn Kathir (Abridged) โ€“ English
77
+ TAFSIR_AR_ID = 16 # Al-Muyassar โ€“ Arabic
78
+
79
+ # โ”€โ”€ Hadith source: AhmedBaset โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
80
+ AHMEDBASET_BASE_URL = (
81
+ "https://raw.githubusercontent.com/AhmedBaset/hadith-json"
82
+ "/main/db/by_book/the_9_books"
83
+ )
84
+ HADITH_BOOKS = {
85
+ "ahmed.json": {
86
+ "collection": "Musnad Ahmad",
87
+ "id_prefix": "ahmad",
88
+ "author": "Imam Ahmad ibn Hanbal",
89
+ },
90
+ "bukhari.json": {
91
+ "collection": "Sahih al-Bukhari",
92
+ "id_prefix": "bukhari",
93
+ "author": "Muhammad al-Bukhari",
94
+ },
95
+ "muslim.json": {
96
+ "collection": "Sahih Muslim",
97
+ "id_prefix": "muslim",
98
+ "author": "Muslim ibn al-Hajjaj",
99
+ },
100
+ "abudawud.json": {
101
+ "collection": "Sunan Abu Dawood",
102
+ "id_prefix": "abudawud",
103
+ "author": "Abu Dawood Sulaiman",
104
+ },
105
+ "tirmidhi.json": {
106
+ "collection": "Jami' at-Tirmidhi",
107
+ "id_prefix": "tirmidhi",
108
+ "author": "Al-Tirmidhi",
109
+ },
110
+ "ibnmajah.json": {
111
+ "collection": "Sunan Ibn Majah",
112
+ "id_prefix": "ibnmajah",
113
+ "author": "Ibn Majah al-Qazwini",
114
+ },
115
+ "nasai.json": {
116
+ "collection": "Sunan an-Nasai",
117
+ "id_prefix": "nasai",
118
+ "author": "Ahmad al-Nasai",
119
+ },
120
+ "malik.json": {
121
+ "collection": "Muwatta Malik",
122
+ "id_prefix": "malik",
123
+ "author": "Malik ibn Anas",
124
+ },
125
+ "darimi.json": {
126
+ "collection": "Sunan al-Darimi",
127
+ "id_prefix": "darimi",
128
+ "author": "Al-Darimi",
129
+ },
130
+ }
131
+
132
+ # โ”€โ”€ Hadith source: fawazahmed0 (for grades) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
133
+ FAWAZ_CDN_BASE = "https://cdn.jsdelivr.net/gh/fawazahmed0/hadith-api@1"
134
+ FAWAZ_RAW_BASE = (
135
+ "https://raw.githubusercontent.com/fawazahmed0/hadith-api/1"
136
+ )
137
+ FAWAZ_EDITION_MAP = {
138
+ "bukhari": "eng-bukhari",
139
+ "muslim": "eng-muslim",
140
+ "abudawud": "eng-abudawud",
141
+ "tirmidhi": "eng-tirmidhi",
142
+ "nasai": "eng-nasai",
143
+ "ibnmajah": "eng-ibnmajah",
144
+ "malik": "eng-malik",
145
+ "ahmad": "eng-ahmed",
146
+ "darimi": "eng-darimi",
147
+ }
148
+
149
+ # โ”€โ”€ Embedding / network config โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
150
+ DEFAULT_EMBED_MODEL = "intfloat/multilingual-e5-large"
151
+ EMBED_BATCH_SIZE = 32
152
+ REQUEST_TIMEOUT = 60
153
+ RETRY_ATTEMPTS = 3
154
+ RETRY_DELAY = 2
155
+
156
+
157
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
158
+ # UTILITIES
159
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
160
+
161
+ def _ensure_dir(path: Path):
162
+ path.mkdir(parents=True, exist_ok=True)
163
+
164
+
165
def download_json(
    url: str,
    cache_path: Optional[Path] = None,
    force: bool = False,
) -> Any:
    """Download JSON with optional file caching and retries.

    Args:
        url: HTTP(S) endpoint expected to return a JSON body.
        cache_path: if given, the parsed payload is persisted here and
            served from disk on later calls (unless ``force``).
        force: re-download even when a cached copy exists.

    Returns:
        The parsed JSON value (dict, list, ...).

    Raises:
        Whatever the final attempt raised (network error, bad status,
        or JSON decode error) after RETRY_ATTEMPTS failures.
    """
    # Cache hit: skip the network entirely.
    if cache_path and cache_path.exists() and not force:
        with open(cache_path, "r", encoding="utf-8") as f:
            return json.load(f)

    for attempt in range(1, RETRY_ATTEMPTS + 1):
        try:
            resp = requests.get(url, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            data = resp.json()
            if cache_path:
                # Write the cache only after a fully successful fetch+parse,
                # so a failed attempt can never poison the cache.
                _ensure_dir(cache_path.parent)
                with open(cache_path, "w", encoding="utf-8") as f:
                    json.dump(data, f, ensure_ascii=False)
            return data
        except Exception as exc:
            if attempt == RETRY_ATTEMPTS:
                raise
            print(f"  Retry {attempt}/{RETRY_ATTEMPTS} for {url}: {exc}")
            # Linear backoff: delay grows with the attempt number.
            time.sleep(RETRY_DELAY * attempt)
190
+
191
+
192
def download_file(
    url: str,
    cache_path: Path,
    force: bool = False,
    auth: Optional[Tuple[str, str]] = None,
) -> Path:
    """Download a binary file to *cache_path* with caching and retries.

    BUG FIX: the payload is now streamed to a temporary ``.part`` file and
    atomically renamed into place only after the transfer completes. The
    cache check is merely "exists and non-empty", so the previous in-place
    write could leave a truncated file behind after an interrupted download,
    and every later run would treat that corrupt file as a valid cache hit.

    Args:
        url: source URL.
        cache_path: destination path; reused as a cache when non-empty.
        auth: optional (username, key) HTTP basic-auth pair (e.g. Kaggle).
        force: re-download even when a cached copy exists.

    Returns:
        cache_path, once the file is present on disk.

    Raises:
        The last network exception after RETRY_ATTEMPTS failed attempts.
    """
    if cache_path.exists() and cache_path.stat().st_size > 0 and not force:
        return cache_path

    _ensure_dir(cache_path.parent)
    tmp_path = cache_path.with_name(cache_path.name + ".part")
    for attempt in range(1, RETRY_ATTEMPTS + 1):
        try:
            resp = requests.get(
                url, timeout=REQUEST_TIMEOUT, stream=True, auth=auth,
            )
            resp.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    f.write(chunk)
            # Publish the completed file in one step (atomic on POSIX).
            tmp_path.replace(cache_path)
            return cache_path
        except Exception as exc:
            if attempt == RETRY_ATTEMPTS:
                raise
            print(f"  Retry {attempt}/{RETRY_ATTEMPTS}: {exc}")
            time.sleep(RETRY_DELAY * attempt)
218
+
219
+
220
def strip_html(text: str) -> str:
    """Strip HTML tags from *text* and collapse all whitespace runs to one space."""
    without_tags = re.sub(r"<[^>]+>", " ", text)
    collapsed = re.sub(r"\s+", " ", without_tags)
    return collapsed.strip()
224
+
225
+
226
+ def _kaggle_auth() -> Optional[Tuple[str, str]]:
227
+ """Return (username, key) from env vars or ~/.kaggle/kaggle.json."""
228
+ username = os.environ.get("KAGGLE_USERNAME")
229
+ key = os.environ.get("KAGGLE_KEY")
230
+ if username and key:
231
+ return (username, key)
232
+ kaggle_json = Path.home() / ".kaggle" / "kaggle.json"
233
+ if kaggle_json.exists():
234
+ with open(kaggle_json, "r") as f:
235
+ creds = json.load(f)
236
+ u, k = creds.get("username"), creds.get("key")
237
+ if u and k:
238
+ return (u, k)
239
+ return None
240
+
241
+
242
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
243
+ # STEP 1: FETCH & BUILD QURAN ENTRIES
244
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
245
+
246
def fetch_quran_sources(
    force: bool = False,
) -> Tuple[Dict[int, Dict], Dict[int, Dict], Dict[int, Dict], Dict[int, Dict]]:
    """Download Quran data from all sources.

    Args:
        force: re-download every source even when cached copies exist.

    Returns (cdn_chapters, quran_data, chapter_meta, semarketir_translations).
    cdn_chapters: { surah_num: { "id", "name", "transliteration", "translation",
                    "type", "total_verses", "verses": [{"id", "text",
                    "translation", "transliteration"}] } }  (primary)
    quran_data: raw quran.json { "N": [{"chapter", "verse", "text"}] }
    chapter_meta: { surah_num: {"id", "name", "transliteration", "translation",
                    "type", "total_verses"} }  (fallback metadata)
    semarketir_translations: { surah_num: { "verse": {"1": "english_text"} } }
    """
    print("=" * 60)
    print("Step 1: Fetching Quran Sources")
    print("=" * 60)

    # 1a. CDN per-chapter English (primary – has Arabic + English + transliteration)
    print("  Downloading per-chapter English data from CDN …")
    cdn_chapters: Dict[int, Dict] = {}
    # The Quran has 114 surahs, numbered 1..114.
    for n in tqdm(range(1, 115), desc="  CDN chapters", leave=True):
        try:
            url = CDN_CHAPTER_EN_URL_TPL.format(n=n)
            data = download_json(
                url,
                cache_path=CACHE_DIR / "quran" / "cdn_en" / f"{n}.json",
                force=force,
            )
            cdn_chapters[n] = data
        except Exception as exc:
            # A missing chapter is tolerated; build_quran_entries falls
            # back to quran.json for any surah absent from cdn_chapters.
            print(f"\n  ✗ Chapter {n}: {exc}")
    print(f"  ✓ Loaded {len(cdn_chapters)} chapters from CDN")

    # 1b. risan/quran-json – full Quran text (fallback Arabic)
    print("  Downloading quran.json from risan/quran-json …")
    quran_data = download_json(
        QURAN_JSON_URL,
        cache_path=CACHE_DIR / "quran" / "quran.json",
        force=force,
    )
    print(f"  ✓ Loaded {len(quran_data)} surahs")

    # 1c. risan/quran-json – chapter metadata (fallback)
    print("  Downloading chapters/en.json …")
    chapters_raw = download_json(
        CHAPTERS_EN_URL,
        cache_path=CACHE_DIR / "quran" / "chapters_en.json",
        force=force,
    )
    # Normalize to {surah_number: metadata} whatever shape the payload has.
    chapter_meta: Dict[int, Dict] = {}
    if isinstance(chapters_raw, list):
        chapter_meta = {ch["id"]: ch for ch in chapters_raw}
    elif isinstance(chapters_raw, dict):
        chapter_meta = {int(k): v for k, v in chapters_raw.items()}
    print(f"  ✓ Loaded {len(chapter_meta)} chapter records")

    # 1d. semarketir English translations (additional fallback)
    print("  Downloading English translations from semarketir/quranjson …")
    semarketir_translations: Dict[int, Dict] = {}
    for n in tqdm(range(1, 115), desc="  Semarketir EN", leave=True):
        try:
            url = SEMARKETIR_TRANSLATION_URL_TPL.format(n=n)
            data = download_json(
                url,
                cache_path=CACHE_DIR / "quran" / "semarketir_en" / f"en_translation_{n}.json",
                force=force,
            )
            semarketir_translations[n] = data
        except Exception as exc:
            print(f"\n  ✗ Surah {n} translation: {exc}")
    print(f"  ✓ Loaded translation for {len(semarketir_translations)} surahs")

    return cdn_chapters, quran_data, chapter_meta, semarketir_translations
320
+
321
+
322
def build_quran_entries(
    cdn_chapters: Dict[int, Dict],
    quran_data: Dict,
    chapter_meta: Dict[int, Dict],
    semarketir_translations: Dict[int, Dict],
) -> List[Dict]:
    """Merge the Quran sources into one flat list of verse entries.

    Source priority:
        Arabic text:      CDN verses, then quran.json
        English text:     CDN verses, then semarketir translations
        Transliteration:  CDN only
        Chapter metadata: CDN, then chapters/en.json
    """
    print("\n" + "=" * 60)
    print("Step 2: Building Quran Entries")
    print("=" * 60)

    # Fallback Arabic lookup keyed "surah:verse", built from quran.json
    # (shape: { "N": [{"chapter": int, "verse": int, "text": str}] }).
    arabic_fallback: Dict[str, str] = {
        f"{v.get('chapter', surah_key)}:{v.get('verse', '')}": v.get("text", "")
        for surah_key, verses in quran_data.items()
        if isinstance(verses, list)
        for v in verses
    }

    # Fallback English lookup keyed "surah:verse", from semarketir
    # (shape: { surah_num: {"verse": {"1": "english_text"}} }).
    en_fallback: Dict[str, str] = {}
    for surah_num, sdata in semarketir_translations.items():
        verse_map = sdata.get("verse", {})
        if not isinstance(verse_map, dict):
            continue
        for vnum_str, text in verse_map.items():
            en_fallback[f"{surah_num}:{vnum_str}"] = text if isinstance(text, str) else ""

    # Union of surah numbers seen in either primary or fallback source.
    surah_numbers = sorted(set(cdn_chapters.keys()) | {int(k) for k in quran_data.keys()})

    entries: List[Dict] = []
    for surah_num in surah_numbers:
        cdn = cdn_chapters.get(surah_num, {})
        meta = chapter_meta.get(surah_num, {})

        # Chapter metadata – prefer CDN, fall back to chapters_en.json.
        name_ar = cdn.get("name", meta.get("name", ""))
        name_en = cdn.get("translation", meta.get("translation", ""))
        name_translit = cdn.get("transliteration", meta.get("transliteration", ""))
        rev_type = cdn.get("type", meta.get("type", "")).lower()
        n_verses = cdn.get("total_verses", meta.get("total_verses", 0))

        def _entry(verse_num: Any, arabic: str, english: str, translit: str) -> Dict:
            # One metadata.json record for a single verse of this surah.
            vk = f"{surah_num}:{verse_num}"
            return {
                "id": vk,
                "arabic": arabic,
                "english": english,
                "source": f"Surah {name_ar} {vk}",
                "surah_number": surah_num,
                "surah_name_en": name_en,
                "surah_name_ar": name_ar,
                "verse_number": verse_num,
                "transliteration": translit,
                "type": "quran",
                "surah_name_transliteration": name_translit,
                "revelation_type": rev_type,
                "total_verses": n_verses,
            }

        cdn_verses = cdn.get("verses", [])
        if cdn_verses:
            # Primary path: CDN carries Arabic + English + transliteration.
            for verse in cdn_verses:
                vnum = verse["id"]
                vk = f"{surah_num}:{vnum}"
                entries.append(_entry(
                    vnum,
                    verse.get("text", arabic_fallback.get(vk, "")),
                    verse.get("translation", en_fallback.get(vk, "")),
                    verse.get("transliteration", ""),
                ))
        else:
            # Fallback path: reconstruct from quran.json + semarketir.
            raw_verses = quran_data.get(str(surah_num), [])
            if isinstance(raw_verses, list):
                for v in raw_verses:
                    vnum = v.get("verse", v.get("id", 0))
                    vk = f"{surah_num}:{vnum}"
                    entries.append(_entry(
                        vnum,
                        v.get("text", ""),
                        en_fallback.get(vk, ""),
                        "",
                    ))

    print(f"  ✓ Built {len(entries):,} Quran verses across {len(surah_numbers)} surahs")
    return entries
422
+
423
+
424
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
425
+ # STEP 3: ENRICH QURAN WITH TAFSIR
426
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
427
+
428
+ def _extract_verse_key(item: Dict) -> Optional[str]:
429
+ """Try to extract a 'surah:verse' key from a tafsir record."""
430
+ surah_fields = [
431
+ "sura_no", "surah", "surah_number", "sura",
432
+ "chapter", "chapter_no", "SuraID", "SurahNumber",
433
+ ]
434
+ verse_fields = [
435
+ "aya_no", "ayah", "verse_number", "aya",
436
+ "verse", "ayah_number", "AyaID", "VerseNumber",
437
+ ]
438
+
439
+ surah = verse = None
440
+ for f in surah_fields:
441
+ if f in item:
442
+ surah = item[f]
443
+ break
444
+ for f in verse_fields:
445
+ if f in item:
446
+ verse = item[f]
447
+ break
448
+
449
+ if surah is not None and verse is not None:
450
+ return f"{int(surah)}:{int(verse)}"
451
+
452
+ if "verse_key" in item:
453
+ return item["verse_key"]
454
+ return None
455
+
456
+
457
def _extract_tafsir_text(item: Dict) -> Optional[Dict[str, str]]:
    """Pull English/Arabic tafsir text out of a record with unknown schema.

    Returns a dict containing "tafsir_en" and/or "tafsir_ar" (HTML-stripped),
    or None when the record holds no recognizable tafsir text.
    """
    english_keys = (
        "tafseer_en", "tafsir_en", "tafseer_english", "tafsir_english",
        "english_tafsir", "english_tafseer", "interpretation_en",
    )
    arabic_keys = (
        "tafseer_ar", "tafsir_ar", "tafseer_arabic", "tafsir_arabic",
        "arabic_tafsir", "arabic_tafseer", "interpretation_ar",
        "tafseer", "tafsir",
    )

    found: Dict[str, str] = {}

    for key in english_keys:
        if item.get(key):
            found["tafsir_en"] = strip_html(str(item[key]))
            break

    for key in arabic_keys:
        if item.get(key):
            value = str(item[key])
            # Classify by script: any codepoint in the Arabic Unicode block.
            if any("\u0600" <= ch <= "\u06ff" for ch in value):
                found["tafsir_ar"] = strip_html(value)
            elif "tafsir_en" not in found:
                # No Arabic characters detected – treat the text as English.
                found["tafsir_en"] = strip_html(value)
            break

    # Some dumps nest the text one level down, e.g. {"tafseer": {"1": "…"}}.
    if not found:
        for key in ("tafseer", "tafsir"):
            nested = item.get(key)
            if isinstance(nested, dict):
                for value in nested.values():
                    if value:
                        found["tafsir_en"] = strip_html(str(value))
                        break
                break

    return found or None
498
+
499
+
500
def _load_tafsir_from_records(records: List[Dict]) -> Dict[str, Dict[str, str]]:
    """Fold a list of tafsir records into a {"surah:verse": texts} map.

    Records that yield no verse key or no text are silently skipped;
    later records merge into (and may extend) earlier ones.
    """
    merged: Dict[str, Dict[str, str]] = {}
    for record in records:
        key = _extract_verse_key(record)
        if not key:
            continue
        texts = _extract_tafsir_text(record)
        if texts:
            merged.setdefault(key, {}).update(texts)
    return merged
511
+
512
+
513
def fetch_kaggle_tafsir(
    force: bool = False,
) -> Optional[Dict[str, Dict[str, str]]]:
    """Download and parse the Kaggle tafsir dataset (ZIP).

    Args:
        force: re-download the archive even when a cached copy exists.

    Returns { "surah:verse": {"tafsir_en": …, "tafsir_ar": …} } or None on
    any failure (download, ZIP validation, extraction, or empty archive),
    in which case the caller falls back to the Quran.com API.
    """
    zip_path = CACHE_DIR / "tafsir" / "kaggle_tafsir.zip"
    extract_dir = CACHE_DIR / "tafsir" / "kaggle_extracted"

    # Download (the Kaggle API endpoint may require credentials).
    try:
        print("  Downloading Kaggle tafsir dataset …")
        auth = _kaggle_auth()
        download_file(KAGGLE_TAFSIR_URL, zip_path, force=force, auth=auth)
    except Exception as exc:
        print(f"  ✗ Kaggle download failed: {exc}")
        print(
            "    Tip: set KAGGLE_USERNAME and KAGGLE_KEY env vars, "
            "or place kaggle.json in ~/.kaggle/"
        )
        return None

    # Verify it's actually a ZIP: an unauthenticated request can return an
    # HTML error page, which would otherwise be cached as the "archive".
    if not zipfile.is_zipfile(zip_path):
        print("  ✗ Downloaded file is not a valid ZIP (may need Kaggle auth)")
        return None

    # Extract
    try:
        _ensure_dir(extract_dir)
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(extract_dir)
        print(f"  ✓ Extracted to {extract_dir}")
    except Exception as exc:
        print(f"  ✗ Failed to extract ZIP: {exc}")
        return None

    # Parse every JSON file inside the archive; the dataset layout varies
    # between versions, so both list- and dict-shaped payloads are handled.
    json_files = list(extract_dir.rglob("*.json"))
    if not json_files:
        print("  ✗ No JSON files found in Kaggle archive")
        return None

    print(f"  Found {len(json_files)} JSON file(s) in archive")
    tafsir_map: Dict[str, Dict[str, str]] = {}

    for jf in json_files:
        try:
            with open(jf, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as exc:
            print(f"  ✗ Error parsing {jf.name}: {exc}")
            continue

        if isinstance(data, list):
            # NOTE(review): dict.update replaces whole per-verse dicts when
            # the same verse appears in multiple files — later files win.
            tafsir_map.update(_load_tafsir_from_records(data))
        elif isinstance(data, dict):
            # Might be keyed by surah number or some other grouping
            for _key, value in data.items():
                if isinstance(value, list):
                    tafsir_map.update(_load_tafsir_from_records(value))
                elif isinstance(value, dict):
                    vk = _extract_verse_key(value)
                    if vk:
                        tt = _extract_tafsir_text(value)
                        if tt:
                            tafsir_map.setdefault(vk, {}).update(tt)

    if tafsir_map:
        print(f"  ✓ Loaded tafsir for {len(tafsir_map):,} verses from Kaggle")
    return tafsir_map if tafsir_map else None
585
+
586
 
587
def _fetch_tafsir_chapter_api(
    tafsir_id: int, chapter: int,
) -> Dict[str, str]:
    """Fetch all tafsir entries for a chapter from Quran.com API.

    Pages through the endpoint (50 records per page) until the pagination
    block reports no next page.

    Args:
        tafsir_id: Quran.com tafsir resource id (e.g. TAFSIR_EN_ID).
        chapter: surah number (1..114).

    Returns:
        { "surah:verse": html-stripped tafsir text } for the chapter.

    Raises:
        requests.HTTPError / network errors — no retry at this level;
        the caller (fetch_qurancom_tafsir) catches per-surah failures.
    """
    result: Dict[str, str] = {}
    page = 1
    while True:
        url = (
            f"{QURAN_API_BASE}/tafsirs/{tafsir_id}/by_chapter/{chapter}"
            f"?per_page=50&page={page}"
        )
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()

        for entry in data.get("tafsirs", []):
            raw = entry.get("text", "")
            if raw:
                result[entry["verse_key"]] = strip_html(raw)

        pagination = data.get("pagination", {})
        if pagination.get("next_page") is None:
            break
        page = pagination["next_page"]
        time.sleep(0.3)  # throttle: be polite to the public API
    return result
613
 
614
+
615
def fetch_qurancom_tafsir(
    surah_numbers: List[int],
) -> Dict[str, Dict[str, str]]:
    """Fallback: fetch tafsir from Quran.com API.

    Fetches the English and Arabic tafsir for each requested surah;
    a failure for one surah is logged and skipped, not fatal.
    """
    print("  Falling back to Quran.com API for tafsir …")
    tafsir_map: Dict[str, Dict[str, str]] = {}

    for surah_num in tqdm(surah_numbers, desc="  Fetching tafsir"):
        try:
            english = _fetch_tafsir_chapter_api(TAFSIR_EN_ID, surah_num)
            time.sleep(0.3)
            arabic = _fetch_tafsir_chapter_api(TAFSIR_AR_ID, surah_num)
            time.sleep(0.3)
        except Exception as exc:
            # Skip this surah entirely; partial EN-only data is never merged.
            print(f"\n  ✗ Surah {surah_num}: {exc}")
            continue

        for vk, text in english.items():
            tafsir_map.setdefault(vk, {})["tafsir_en"] = text
        for vk, text in arabic.items():
            tafsir_map.setdefault(vk, {})["tafsir_ar"] = text

    return tafsir_map
637
+
638
+
639
def enrich_quran_with_tafsir(
    entries: List[Dict],
    force_download: bool = False,
) -> List[Dict]:
    """Attach tafsir_en / tafsir_ar fields to every Quran entry, in place.

    The Kaggle dataset is tried first; if it yields nothing, the Quran.com
    API is used. When neither source produces data the entries are
    returned untouched (without tafsir fields).
    """
    print("\n" + "=" * 60)
    print("Step 3: Enriching Quran with Tafsir")
    print("=" * 60)

    tafsir_map = fetch_kaggle_tafsir(force=force_download)

    if not tafsir_map:
        quran_surahs = sorted(
            {e["surah_number"] for e in entries if e.get("type") == "quran"}
        )
        tafsir_map = fetch_qurancom_tafsir(quran_surahs)

    if not tafsir_map:
        print("  ✗ No tafsir data available")
        return entries

    enriched_count = 0
    for entry in entries:
        if entry.get("type") != "quran":
            continue
        key = f"{entry['surah_number']}:{entry['verse_number']}"
        texts = tafsir_map.get(key, {})
        # Missing tafsir becomes an empty string, not an absent key.
        entry["tafsir_en"] = texts.get("tafsir_en", "")
        entry["tafsir_ar"] = texts.get("tafsir_ar", "")
        if entry["tafsir_en"] or entry["tafsir_ar"]:
            enriched_count += 1

    print(f"  ✓ Enriched {enriched_count:,} verses with tafsir")
    return entries
673
+
674
+
675
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
676
+ # STEP 4: FETCH & BUILD HADITH ENTRIES
677
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
678
+
679
+ def _pick_best_grade(grades: List[Dict]) -> str:
680
+ """Pick the most authoritative grade from a list of scholar grades."""
681
+ priority = ["darussalam", "al-albani", "zubair ali zai"]
682
+ grade_map = {}
683
+ for g in grades:
684
+ name = g.get("name", "").lower()
685
+ grade_text = g.get("grade", "")
686
+ if grade_text:
687
+ grade_map[name] = grade_text
688
+
689
+ for scholar in priority:
690
+ for name, grade in grade_map.items():
691
+ if scholar in name:
692
+ return grade
693
+
694
+ for g in grades:
695
+ if g.get("grade"):
696
+ return g["grade"]
697
+ return ""
698
+
699
+
700
def _fetch_fawaz_grades(
    edition: str, force: bool = False,
) -> Optional[Dict[int, str]]:
    """Fetch {hadith_number: grade} for one fawazahmed0 edition.

    The jsDelivr CDN mirror is tried first, then raw.githubusercontent.com.
    Returns None when neither mirror could be fetched.
    """
    cache_path = CACHE_DIR / "hadith" / "fawazahmed0" / f"{edition}.json"

    data = None
    for base in (FAWAZ_CDN_BASE, FAWAZ_RAW_BASE):
        try:
            data = download_json(
                f"{base}/editions/{edition}.json",
                cache_path=cache_path,
                force=force,
            )
            break
        except Exception:
            continue  # try the next mirror

    if not data:
        return None

    grades: Dict[int, str] = {}
    for hadith in data.get("hadiths", []):
        number = hadith.get("hadithnumber")
        if number is None:
            continue
        scholar_grades = hadith.get("grades", [])
        if scholar_grades:
            grades[int(number)] = _pick_best_grade(scholar_grades)
    return grades
731
+
732
+
733
def fetch_hadith_sources(
    force: bool = False,
) -> Tuple[Dict[str, Dict], Dict[str, Dict[int, str]]]:
    """Download hadith data from AhmedBaset and grades from fawazahmed0.

    Parameters
    ----------
    force : re-download all sources even if cached.

    Returns (ahmedbaset_books, fawaz_grades): source filename -> raw book
    JSON, and collection id-prefix -> {hadith number: grade}.
    """
    print("\n" + "=" * 60)
    print("Step 4a: Fetching Hadith Sources")
    print("=" * 60)

    # AhmedBaset hadith books
    print(" Downloading from AhmedBaset/hadith-json …")
    ahmedbaset_books: Dict[str, Dict] = {}
    for filename in tqdm(HADITH_BOOKS.keys(), desc=" Books"):
        try:
            # BUG FIX: the URL must interpolate the book filename — the
            # previous literal placeholder produced a dead URL.
            url = f"{AHMEDBASET_BASE_URL}/{filename}"
            data = download_json(
                url,
                cache_path=CACHE_DIR / "hadith" / "ahmedbaset" / filename,
                force=force,
            )
            ahmedbaset_books[filename] = data
        except Exception as exc:
            # BUG FIX: report which book failed, not a placeholder.
            print(f"\n ✗ {filename}: {exc}")
    print(f" ✓ Loaded {len(ahmedbaset_books)} books")

    # fawazahmed0 editions (for grades)
    print(" Downloading grade data from fawazahmed0/hadith-api …")
    fawaz_grades: Dict[str, Dict[int, str]] = {}
    for prefix, edition in tqdm(FAWAZ_EDITION_MAP.items(), desc=" Editions"):
        grades = _fetch_fawaz_grades(edition, force)
        if grades:
            fawaz_grades[prefix] = grades
    print(f" ✓ Loaded grades for {len(fawaz_grades)} collections")

    return ahmedbaset_books, fawaz_grades
770
+
771
+
772
def build_hadith_entries(
    ahmedbaset_books: Dict[str, Dict],
    fawaz_grades: Dict[str, Dict[int, str]],
) -> List[Dict]:
    """Merge AhmedBaset hadith texts with fawazahmed0 grades.

    Parameters
    ----------
    ahmedbaset_books : source filename -> raw book JSON.
    fawaz_grades : collection id-prefix -> {hadith number: grade text}.

    Returns a flat list of hadith entry dicts ready for metadata.json.
    """
    print("\n" + "=" * 60)
    print("Step 4b: Building Hadith Entries")
    print("=" * 60)

    entries: List[Dict] = []
    stats: Dict[str, int] = defaultdict(int)

    for filename, book_config in HADITH_BOOKS.items():
        book_data = ahmedbaset_books.get(filename)
        if not book_data:
            # BUG FIX: name the skipped book instead of a placeholder.
            print(f" ✗ Skipping {filename} (not downloaded)")
            continue

        prefix = book_config["id_prefix"]
        grades = fawaz_grades.get(prefix, {})
        hadiths = book_data.get("hadiths", [])
        # Chapter id -> Arabic chapter title, for per-hadith chapter names.
        chapter_map = {
            ch.get("id"): ch.get("arabic", "")
            for ch in book_data.get("chapters", [])
        }

        for hadith in hadiths:
            hadith_num = hadith.get("idInBook", hadith.get("id", ""))

            # English text: either a {narrator, text} dict or a plain string.
            if isinstance(hadith.get("english"), dict):
                parts = []
                if hadith["english"].get("narrator"):
                    parts.append(hadith["english"]["narrator"])
                if hadith["english"].get("text"):
                    parts.append(hadith["english"]["text"])
                english = " ".join(parts)
            else:
                english = str(hadith.get("english", ""))

            # Chapter name
            chapter_name = ""
            if "chapterId" in hadith:
                chapter_name = chapter_map.get(hadith["chapterId"], "")

            # Grade from fawazahmed0 — keyed by int hadith number.
            # BUG FIX: skip the lookup for non-numeric ids instead of
            # letting int() raise and abort the whole build.
            grade = ""
            if hadith_num:
                try:
                    grade = grades.get(int(hadith_num), "")
                except (TypeError, ValueError):
                    grade = ""

            entries.append(
                {
                    "id": f"{prefix}_{hadith_num}",
                    "arabic": hadith.get("arabic", ""),
                    "english": english,
                    "reference": f"{book_config['collection']} {hadith_num}",
                    "hadith_number": hadith_num,
                    "collection": book_config["collection"],
                    "chapter": chapter_name,
                    "grade": grade,
                    "type": "hadith",
                    "author": book_config["author"],
                }
            )
            stats[book_config["collection"]] += 1

    print(f" ✓ Built {len(entries):,} hadith entries")
    print("\n Breakdown:")
    for collection, count in sorted(stats.items()):
        print(f" {collection}: {count:,}")

    graded = sum(1 for e in entries if e.get("grade"))
    print(f"\n Hadiths with grades: {graded:,} / {len(entries):,}")
    return entries
846
+
847
+
848
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
849
+ # STEP 5: GENERATE METADATA
850
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
851
+
852
def generate_metadata(
    quran_entries: List[Dict],
    hadith_entries: List[Dict],
) -> List[Dict]:
    """Concatenate Quran and hadith entries and persist metadata.json.

    Returns the combined document list (Quran entries first) so the
    caller can feed it straight into the index builder.
    """
    print("\n" + "=" * 60)
    print("Step 5: Generating metadata.json")
    print("=" * 60)

    documents = quran_entries + hadith_entries

    print(f" Quran entries: {len(quran_entries):,}")
    print(f" Hadith entries: {len(hadith_entries):,}")
    print(f" Total: {len(documents):,}")

    # Sanity check: duplicate IDs would break downstream lookups.
    ids = [d["id"] for d in documents]
    unique_count = len(set(ids))
    if unique_count != len(ids):
        print(f" ⚠ Warning: {len(ids) - unique_count} duplicate IDs found")

    print(f" Writing to {METADATA_PATH} …")
    with open(METADATA_PATH, "w", encoding="utf-8") as f:
        json.dump(documents, f, ensure_ascii=False, indent=2)

    size_mb = METADATA_PATH.stat().st_size / (1024 * 1024)
    print(f" ✓ File size: {size_mb:.2f} MB")
    return documents
880
+
881
+
882
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
883
+ # STEP 6: BUILD FAISS INDEX
884
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
885
+
886
def build_faiss_index(
    documents: List[Dict],
    model_name: str = DEFAULT_EMBED_MODEL,
):
    """Embed every document and persist an L2-normalised FAISS index.

    Quran docs are embedded as Arabic + English + a truncated English
    tafsir snippet; hadith docs are prefixed with their collection name.
    Cosine similarity is realised via IndexFlatIP over normalised vectors.
    """
    print("\n" + "=" * 60)
    print("Step 6: Building FAISS Index")
    print("=" * 60)

    print(f" Loading embedding model: {model_name}")
    model = SentenceTransformer(model_name)
    embedding_dim = model.get_sentence_embedding_dimension()
    print(f" Embedding dimension: {embedding_dim}")

    def _doc_text(doc: Dict) -> str:
        """Compose the text that represents one document in the index."""
        if doc.get("type") == "quran":
            # Include truncated tafsir for richer semantic matching.
            snippet = doc.get("tafsir_en", "")[:500]
            return (
                f"{doc.get('arabic', '')} {doc.get('english', '')} "
                f"{snippet}"
            ).strip()
        return (
            f"{doc.get('collection', '')} "
            f"{doc.get('arabic', '')} "
            f"{doc.get('english', '')}"
        ).strip()

    all_texts = [_doc_text(doc) for doc in documents]

    print(f"\n Generating embeddings for {len(all_texts):,} documents …")

    all_embeddings = []
    for start in tqdm(
        range(0, len(all_texts), EMBED_BATCH_SIZE),
        desc=" Embedding batches",
    ):
        chunk = all_texts[start : start + EMBED_BATCH_SIZE]
        all_embeddings.extend(model.encode(chunk, convert_to_numpy=True))

    embeddings = np.array(all_embeddings, dtype=np.float32)
    print(f" Embeddings shape: {embeddings.shape}")

    print("\n Creating FAISS index (IndexFlatIP + L2 normalization) …")
    index = faiss.IndexFlatIP(embedding_dim)
    # Normalising both stored and query vectors makes inner product
    # equivalent to cosine similarity.
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    print(f" Saving to {INDEX_PATH}")
    faiss.write_index(index, str(INDEX_PATH))

    size_mb = INDEX_PATH.stat().st_size / (1024 * 1024)
    print(f"\n {'=' * 50}")
    print(f" Index Build Complete")
    print(f" {'=' * 50}")
    print(f" Documents indexed: {index.ntotal:,}")
    print(f" Index file size: {size_mb:.2f} MB")
+
947
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
948
+ # CLI
949
+ # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
950
+
951
def main():
    """CLI entry point for the dataset + index build pipeline."""
    parser = argparse.ArgumentParser(
        description=(
            "QModel Dataset Builder v2 — builds metadata.json and "
            "QModel.index from scratch using multiple authoritative sources"
        ),
    )
    parser.add_argument(
        "--index-only",
        action="store_true",
        help="Only build FAISS index from existing metadata.json",
    )
    parser.add_argument(
        "--data-only",
        action="store_true",
        help="Only generate metadata.json, skip index building",
    )
    parser.add_argument(
        "--skip-tafsir",
        action="store_true",
        help="Skip tafsir enrichment",
    )
    parser.add_argument(
        "--force-download",
        action="store_true",
        help="Re-download all sources even if cached",
    )
    parser.add_argument(
        "--model",
        default=DEFAULT_EMBED_MODEL,
        help=f"Sentence-transformer model for embeddings (default: {DEFAULT_EMBED_MODEL})",
    )
    args = parser.parse_args()

    # ── index-only: skip all data fetching ─────────────────────────
    if args.index_only:
        print("Loading existing metadata.json …")
        with open(METADATA_PATH, "r", encoding="utf-8") as f:
            documents = json.load(f)
        build_faiss_index(documents, model_name=args.model)
        print("\n✓ Done!")
        return

    force = args.force_download

    # Steps 1-2: fetch Quran sources and assemble verse entries.
    cdn_chapters, quran_data, chapter_meta, sem_translations = fetch_quran_sources(force=force)
    quran_entries = build_quran_entries(cdn_chapters, quran_data, chapter_meta, sem_translations)

    # Step 3: optional tafsir enrichment.
    if args.skip_tafsir:
        print("\nSkipping tafsir enrichment (--skip-tafsir)")
    else:
        quran_entries = enrich_quran_with_tafsir(
            quran_entries, force_download=force,
        )

    # Step 4: fetch and merge hadith entries.
    ahmedbaset_books, fawaz_grades = fetch_hadith_sources(force=force)
    hadith_entries = build_hadith_entries(ahmedbaset_books, fawaz_grades)

    # Step 5: persist combined metadata.json.
    documents = generate_metadata(quran_entries, hadith_entries)

    # Step 6: optional FAISS index build.
    if args.data_only:
        print("\nSkipping index build (--data-only)")
    else:
        build_faiss_index(documents, model_name=args.model)

    print("\n✓ Done!")
 
 
 
 
 
 
 
1024
 
1025
 
1026
if __name__ == "__main__":  # script entry point
    main()
main.py CHANGED
@@ -270,7 +270,7 @@ rewrite_cache = TTLCache(maxsize=cfg.CACHE_SIZE, ttl=cfg.CACHE_TTL * 6)
270
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
271
  # ARABIC NLP โ€” normalisation + light stemming
272
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
273
- _DIACRITICS = re.compile(r"[\u064B-\u0655\u0656-\u0658\u0670\u0671\u06D6-\u06ED]")
274
  _ALEF_VARS = re.compile(r"[ุฃุฅุขูฑ]")
275
  _WAW_HAMZA = re.compile(r"ุค")
276
  _YA_HAMZA = re.compile(r"ุฆ")
@@ -374,18 +374,25 @@ Reply ONLY with a valid JSON object โ€” no markdown, no preamble:
374
  "ar_query": "<query in clear Arabic ูุตุญู‰, โ‰ค25 words>",
375
  "en_query": "<query in clear English, โ‰ค25 words>",
376
  "keywords": ["<3-7 key Arabic or English terms from the question>"],
377
- "intent": "<one of: fatwa | tafsir | hadith | count | auth | general>"
378
  }
379
 
380
  Intent Detection Rules (CRITICAL):
381
- - 'count' intent = asking for number/frequency (ูƒู… ู…ุฑุฉ, how many times, count occurrences)
 
 
 
382
  - 'auth' intent = asking about authenticity (ุตุญูŠุญุŸ, ู‡ู„ ุตุญูŠุญ, is it authentic, verify hadith grade)
383
  - 'hadith' intent = asking about specific hadith meaning/text (not authenticity)
384
  - 'tafsir' intent = asking about Quranic verses or Islamic ruling (fatwa)
385
  - 'general' intent = other questions
386
 
387
  Examples:
388
- - "ูƒู… ู…ุฑุฉ ุฐููƒุฑุช ูƒู„ู…ุฉ ู…ุฑูŠู…" โ†’ intent: count
 
 
 
 
389
  - "ู‡ู„ ุญุฏูŠุซ ุฅู†ู…ุง ุงู„ุฃุนู…ุงู„ ุจุงู„ู†ูŠุงุช ุตุญูŠุญ" โ†’ intent: auth (asking if authentic!)
390
  - "ู…ุง ู…ุนู†ู‰ ุญุฏูŠุซ ุฅู†ู…ุง ุงู„ุฃุนู…ุงู„" โ†’ intent: hadith
391
  - "ู…ุง ุญูƒู… ุงู„ุฑุจุง ููŠ ุงู„ุฅุณู„ุงู…" โ†’ intent: fatwa
@@ -445,11 +452,116 @@ _AUTH_AR = re.compile(
445
  r"(ุตุญูŠุญ|ุญุณู†|ุถุนูŠู|ุฏุฑุฌุฉ|ุตุญุฉ|ุชุตุญูŠุญ|ู‡ู„.*ุตุญูŠุญ|ู‡ู„.*ุถุนูŠู)"
446
  )
447
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
 
449
  async def detect_analysis_intent(query: str, rewrite: Dict) -> Optional[str]:
450
  """Detect if query is asking for word frequency analysis."""
 
 
 
 
 
 
451
  if rewrite.get("intent") == "count":
452
  kws = rewrite.get("keywords", [])
 
 
 
 
453
  return kws[0] if kws else None
454
 
455
  if not (_COUNT_EN.search(query) or _COUNT_AR.search(query)):
@@ -572,7 +684,7 @@ async def hybrid_search(
572
  seen: set = set()
573
  candidates = []
574
  for dist, idx in zip(distances[0], indices[0]):
575
- item_idx = int(idx) // 2
576
  if item_idx not in seen and 0 <= item_idx < len(dataset):
577
  seen.add(item_idx)
578
  item = dataset[item_idx]
@@ -703,6 +815,13 @@ _TASK_INSTRUCTIONS: Dict[str, str] = {
703
  "2. List example occurrences with Surah names.\n"
704
  "3. Comment on significance."
705
  ),
 
 
 
 
 
 
 
706
  "general": (
707
  "The user has a general Islamic question. Steps:\n"
708
  "1. Give a direct answer first.\n"
@@ -753,8 +872,21 @@ def build_messages(
753
  lang: str,
754
  intent: str,
755
  analysis: Optional[dict] = None,
 
756
  ) -> List[dict]:
757
  """Build system and user messages for LLM."""
 
 
 
 
 
 
 
 
 
 
 
 
758
  if analysis:
759
  by_surah_str = "\n ".join([
760
  f"Surah {s}: {data['name']} ({data['count']} times)"
@@ -1016,7 +1148,8 @@ async def run_rag_pipeline(
1016
  rewrite = await rewrite_query(question, state.llm)
1017
  intent = rewrite.get("intent", "general")
1018
 
1019
- # 2. Intent detection + hybrid search โ€” concurrently
 
1020
  kw_task, search_task = (
1021
  detect_analysis_intent(question, rewrite),
1022
  hybrid_search(
@@ -1025,11 +1158,26 @@ async def run_rag_pipeline(
1025
  top_k, source_type, grade_filter,
1026
  ),
1027
  )
1028
- analysis_kw, results = await asyncio.gather(kw_task, search_task)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1029
 
1030
- # 3. Keyword frequency count (if needed)
1031
  analysis = None
1032
- if analysis_kw:
1033
  analysis = await count_occurrences(analysis_kw, state.dataset)
1034
  logger.info("Analysis: kw=%s count=%d", analysis_kw, analysis["total_count"])
1035
 
@@ -1042,8 +1190,8 @@ async def run_rag_pipeline(
1042
  intent, top_score, cfg.CONFIDENCE_THRESHOLD,
1043
  )
1044
 
1045
- # 5. Confidence gate
1046
- if top_score < cfg.CONFIDENCE_THRESHOLD:
1047
  logger.warning(
1048
  "Low confidence (%.3f < %.2f) โ€” returning safe fallback",
1049
  top_score, cfg.CONFIDENCE_THRESHOLD,
@@ -1060,7 +1208,7 @@ async def run_rag_pipeline(
1060
 
1061
  # 6. Build context + prompt + LLM call
1062
  context = build_context(results)
1063
- messages = build_messages(context, question, lang, intent, analysis)
1064
 
1065
  try:
1066
  answer = await state.llm.chat(
 
270
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
271
  # ARABIC NLP โ€” normalisation + light stemming
272
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
273
+ _DIACRITICS = re.compile(r"[\u064B-\u0655\u0656-\u0658\u0670\u06D6-\u06ED]")
274
  _ALEF_VARS = re.compile(r"[ุฃุฅุขูฑ]")
275
  _WAW_HAMZA = re.compile(r"ุค")
276
  _YA_HAMZA = re.compile(r"ุฆ")
 
374
  "ar_query": "<query in clear Arabic ูุตุญู‰, โ‰ค25 words>",
375
  "en_query": "<query in clear English, โ‰ค25 words>",
376
  "keywords": ["<3-7 key Arabic or English terms from the question>"],
377
+ "intent": "<one of: fatwa | tafsir | hadith | count | surah_info | auth | general>"
378
  }
379
 
380
  Intent Detection Rules (CRITICAL):
381
+ - 'surah_info' intent = asking about surah metadata: verse count, revelation type, surah number
382
+ (ูƒู… ุนุฏุฏ ุขูŠุงุช ุณูˆุฑุฉ, ูƒู… ุขูŠุฉ ููŠ ุณูˆุฑุฉ, how many verses in surah, is surah X meccan/medinan)
383
+ - 'count' intent = asking for WORD frequency/occurrence count (ูƒู… ู…ุฑุฉ ุฐููƒุฑุช ูƒู„ู…ุฉ, how many times is word X mentioned)
384
+ NOTE: "ูƒู… ุนุฏุฏ ุขูŠุงุช ุณูˆุฑุฉ" is surah_info NOT count!
385
  - 'auth' intent = asking about authenticity (ุตุญูŠุญุŸ, ู‡ู„ ุตุญูŠุญ, is it authentic, verify hadith grade)
386
  - 'hadith' intent = asking about specific hadith meaning/text (not authenticity)
387
  - 'tafsir' intent = asking about Quranic verses or Islamic ruling (fatwa)
388
  - 'general' intent = other questions
389
 
390
  Examples:
391
+ - "ูƒู… ุนุฏุฏ ุขูŠุงุช ุณูˆุฑุฉ ุขู„ ุนู…ุฑุงู†" โ†’ intent: surah_info (asking about surah metadata!)
392
+ - "ูƒู… ุขูŠุฉ ููŠ ุณูˆุฑุฉ ุงู„ุจู‚ุฑุฉ" โ†’ intent: surah_info
393
+ - "how many verses in surah al-baqara" โ†’ intent: surah_info
394
+ - "ู‡ู„ ุณูˆุฑุฉ ุงู„ูุงุชุญุฉ ู…ูƒูŠุฉ ุฃู… ู…ุฏู†ูŠุฉ" โ†’ intent: surah_info
395
+ - "ูƒู… ู…ุฑุฉ ุฐููƒุฑุช ูƒู„ู…ุฉ ู…ุฑูŠู…" โ†’ intent: count (asking about WORD frequency!)
396
  - "ู‡ู„ ุญุฏูŠุซ ุฅู†ู…ุง ุงู„ุฃุนู…ุงู„ ุจุงู„ู†ูŠุงุช ุตุญูŠุญ" โ†’ intent: auth (asking if authentic!)
397
  - "ู…ุง ู…ุนู†ู‰ ุญุฏูŠุซ ุฅู†ู…ุง ุงู„ุฃุนู…ุงู„" โ†’ intent: hadith
398
  - "ู…ุง ุญูƒู… ุงู„ุฑุจุง ููŠ ุงู„ุฅุณู„ุงู…" โ†’ intent: fatwa
 
452
  r"(ุตุญูŠุญ|ุญุณู†|ุถุนูŠู|ุฏุฑุฌุฉ|ุตุญุฉ|ุชุตุญูŠุญ|ู‡ู„.*ุตุญูŠุญ|ู‡ู„.*ุถุนูŠู)"
453
  )
454
 
455
+ # โ”€โ”€ Surah metadata queries (verse count, revelation type, etc.) โ”€โ”€โ”€โ”€โ”€โ”€โ”€
456
+ _SURAH_VERSES_AR = re.compile(
457
+ r"ูƒู…\s+(?:ุนุฏุฏ\s+)?ุขูŠุงุช?\s*(?:ููŠ\s+|ูู‰\s+)?(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)"
458
+ r"|ุนุฏุฏ\s+ุขูŠุงุช?\s+(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)"
459
+ r"|ูƒู…\s+ุขูŠุฉ\s+(?:ููŠ|ูู‰)\s+(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)"
460
+ r"|(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)\s+[\u0600-\u06FF\s]+\s+(?:ูƒู…\s+ุขูŠุฉ|ุนุฏุฏ\s+ุขูŠุงุช?)"
461
+ )
462
+ _SURAH_VERSES_EN = re.compile(
463
+ r"(?:how many|number of)\s+(?:verses?|ayat|ayahs?)\s+(?:in|of|does)\b"
464
+ r"|\bsurah?\b.*\b(?:how many|number of)\s+(?:verses?|ayat|ayahs?)",
465
+ re.I,
466
+ )
467
+ _SURAH_TYPE_AR = re.compile(
468
+ r"(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)\s+[\u0600-\u06FF\s]+\s+(?:ู…ูƒูŠุฉ|ู…ุฏู†ูŠุฉ|ู…ูƒูŠ|ู…ุฏู†ูŠ)"
469
+ r"|(?:ู‡ู„|ู…ุง\s+ู†ูˆุน)\s+(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)\s+[\u0600-\u06FF\s]+\s+(?:ู…ูƒูŠุฉ|ู…ุฏู†ูŠุฉ)"
470
+ )
471
+ _SURAH_NAME_AR = re.compile(
472
+ r"(?:ุณูˆุฑุฉ|ุณูˆุฑู‡)\s+([\u0600-\u06FF\u0750-\u077F\s]+)"
473
+ )
474
+ _SURAH_NAME_EN = re.compile(
475
+ r"\bsurah?\s+([a-zA-Z'\-]+(?:[\s\-][a-zA-Z'\-]+)*)",
476
+ re.I,
477
+ )
478
+
479
+
480
def _extract_surah_name(query: str) -> Optional[str]:
    """Pull the surah name out of a query string, or None when absent.

    Tries the Arabic name pattern first, then English; trailing
    punctuation and dangling question words are stripped from the
    captured name before it is returned.
    """
    for pattern in (_SURAH_NAME_AR, _SURAH_NAME_EN):
        match = pattern.search(query)
        if not match:
            continue
        candidate = match.group(1).strip()
        # Drop trailing punctuation, then any dangling question word.
        candidate = re.sub(r'[\s؟?!]+$', '', candidate)
        candidate = re.sub(r'\s+(كم|عدد|هل|ما|في|فى)$', '', candidate)
        if candidate:
            return candidate
    return None
492
+
493
+
494
async def detect_surah_info(query: str, rewrite: dict) -> Optional[dict]:
    """Detect whether *query* asks about surah metadata.

    Returns {"surah_query": <name>, "query_type": "verses"|"type"} when a
    surah-metadata question is detected and a surah name can be
    extracted from the query; otherwise None.
    """
    asks_verses = bool(
        _SURAH_VERSES_AR.search(query) or _SURAH_VERSES_EN.search(query)
    )
    asks_type = bool(_SURAH_TYPE_AR.search(query))

    if not asks_verses and not asks_type:
        # Regexes missed — fall back to the LLM rewrite's intent.
        intent = rewrite.get("intent")
        if intent == "surah_info":
            asks_verses = True
        elif intent == "count":
            joined = " ".join(rewrite.get("keywords", []))
            if not any(w in joined for w in ("آيات", "آية", "verses", "ayat")):
                return None
            asks_verses = True
        else:
            return None

    name = _extract_surah_name(query)
    if not name:
        return None

    return {
        "surah_query": name,
        "query_type": "verses" if asks_verses else "type",
    }
520
+
521
+
522
async def lookup_surah_info(surah_query: str, dataset: list) -> Optional[dict]:
    """Resolve a surah name to its metadata using the loaded dataset.

    Matches the query against Arabic, English, and transliterated surah
    names on Quran entries, ignoring the definite article ("ال" / "al-")
    on either side.  Returns the first matching entry's surah metadata,
    or None when no name field matches.
    """
    query_norm = normalize_arabic(surah_query, aggressive=True).lower()
    query_clean = re.sub(r"^(ال|al[\-\s']*)", "", query_norm, flags=re.I).strip()

    name_fields = ("surah_name_ar", "surah_name_en", "surah_name_transliteration")

    for item in dataset:
        if item.get("type") != "quran":
            continue
        for field in name_fields:
            val = item.get(field, "")
            if not val:
                continue
            val_norm = normalize_arabic(val, aggressive=True).lower()
            val_clean = re.sub(r"^(ال|al[\-\s']*)", "", val_norm, flags=re.I).strip()
            # Three match tiers: raw substring, article-stripped substring,
            # and stripped-query-in-raw-name.
            direct = query_norm in val_norm or val_norm in query_norm
            stripped = bool(
                query_clean and val_clean
                and (query_clean in val_clean or val_clean in query_clean)
            )
            partial = bool(query_clean and query_clean in val_norm)
            if direct or stripped or partial:
                return {
                    "surah_number": item.get("surah_number"),
                    "surah_name_ar": item.get("surah_name_ar", ""),
                    "surah_name_en": item.get("surah_name_en", ""),
                    "surah_name_transliteration": item.get("surah_name_transliteration", ""),
                    "total_verses": item.get("total_verses"),
                    "revelation_type": item.get("revelation_type", ""),
                }
    return None
549
+
550
 
551
  async def detect_analysis_intent(query: str, rewrite: Dict) -> Optional[str]:
552
  """Detect if query is asking for word frequency analysis."""
553
+ # Skip surah metadata queries โ€” those are handled by detect_surah_info
554
+ if (_SURAH_VERSES_AR.search(query) or _SURAH_VERSES_EN.search(query)
555
+ or _SURAH_TYPE_AR.search(query)
556
+ or rewrite.get("intent") == "surah_info"):
557
+ return None
558
+
559
  if rewrite.get("intent") == "count":
560
  kws = rewrite.get("keywords", [])
561
+ # Skip if keywords suggest surah metadata, not word frequency
562
+ kw_text = " ".join(kws)
563
+ if any(w in kw_text for w in ("ุขูŠุงุช", "ุขูŠุฉ", "verses", "ayat")):
564
+ return None
565
  return kws[0] if kws else None
566
 
567
  if not (_COUNT_EN.search(query) or _COUNT_AR.search(query)):
 
684
  seen: set = set()
685
  candidates = []
686
  for dist, idx in zip(distances[0], indices[0]):
687
+ item_idx = int(idx)
688
  if item_idx not in seen and 0 <= item_idx < len(dataset):
689
  seen.add(item_idx)
690
  item = dataset[item_idx]
 
815
  "2. List example occurrences with Surah names.\n"
816
  "3. Comment on significance."
817
  ),
818
+ "surah_info": (
819
+ "The user asks about surah metadata. Steps:\n"
820
+ "1. State the answer from the SURAH INFORMATION block EXACTLY.\n"
821
+ "2. Use the total_verses number precisely โ€” do NOT guess or calculate.\n"
822
+ "3. Mention the revelation type (Meccan/Medinan) if available.\n"
823
+ "4. Optionally add brief scholarly context about the surah."
824
+ ),
825
  "general": (
826
  "The user has a general Islamic question. Steps:\n"
827
  "1. Give a direct answer first.\n"
 
872
  lang: str,
873
  intent: str,
874
  analysis: Optional[dict] = None,
875
+ surah_info: Optional[dict] = None,
876
  ) -> List[dict]:
877
  """Build system and user messages for LLM."""
878
+ if surah_info:
879
+ info_block = (
880
+ f"\n[SURAH INFORMATION]\n"
881
+ f"Surah Name (Arabic): {surah_info['surah_name_ar']}\n"
882
+ f"Surah Name (English): {surah_info['surah_name_en']}\n"
883
+ f"Surah Number: {surah_info['surah_number']}\n"
884
+ f"Total Verses: {surah_info['total_verses']}\n"
885
+ f"Revelation Type: {surah_info['revelation_type']}\n"
886
+ f"Transliteration: {surah_info['surah_name_transliteration']}\n"
887
+ )
888
+ context = info_block + context
889
+
890
  if analysis:
891
  by_surah_str = "\n ".join([
892
  f"Surah {s}: {data['name']} ({data['count']} times)"
 
1148
  rewrite = await rewrite_query(question, state.llm)
1149
  intent = rewrite.get("intent", "general")
1150
 
1151
+ # 2. Surah info detection + analysis intent + hybrid search โ€” concurrently
1152
+ surah_task = detect_surah_info(question, rewrite)
1153
  kw_task, search_task = (
1154
  detect_analysis_intent(question, rewrite),
1155
  hybrid_search(
 
1158
  top_k, source_type, grade_filter,
1159
  ),
1160
  )
1161
+ surah_det, analysis_kw, results = await asyncio.gather(
1162
+ surah_task, kw_task, search_task,
1163
+ )
1164
+
1165
+ # 3a. Surah metadata lookup (if detected)
1166
+ surah_info = None
1167
+ if surah_det:
1168
+ surah_info = await lookup_surah_info(surah_det["surah_query"], state.dataset)
1169
+ if surah_info:
1170
+ intent = "surah_info"
1171
+ logger.info(
1172
+ "Surah info: %s โ†’ %s (%d verses)",
1173
+ surah_det["surah_query"],
1174
+ surah_info["surah_name_en"],
1175
+ surah_info.get("total_verses", 0),
1176
+ )
1177
 
1178
+ # 3b. Keyword frequency count (if needed and NOT a surah info query)
1179
  analysis = None
1180
+ if analysis_kw and not surah_info:
1181
  analysis = await count_occurrences(analysis_kw, state.dataset)
1182
  logger.info("Analysis: kw=%s count=%d", analysis_kw, analysis["total_count"])
1183
 
 
1190
  intent, top_score, cfg.CONFIDENCE_THRESHOLD,
1191
  )
1192
 
1193
+ # 5. Confidence gate โ€” skip for surah_info (metadata is from dataset, not search)
1194
+ if not surah_info and top_score < cfg.CONFIDENCE_THRESHOLD:
1195
  logger.warning(
1196
  "Low confidence (%.3f < %.2f) โ€” returning safe fallback",
1197
  top_score, cfg.CONFIDENCE_THRESHOLD,
 
1208
 
1209
  # 6. Build context + prompt + LLM call
1210
  context = build_context(results)
1211
+ messages = build_messages(context, question, lang, intent, analysis, surah_info)
1212
 
1213
  try:
1214
  answer = await state.llm.chat(