| |
| """ |
| QModel Dataset Builder v2 |
| ========================= |
| Builds metadata.json and QModel.index from scratch using multiple |
| authoritative sources. |
| |
| Data Sources: |
| Quran: |
| - risan/quran-json (Arabic text + English translation + chapter metadata) |
| - semarketir/quranjson (verse transliteration) |
| Tafsir: |
| - Kaggle tafseer dataset (primary tafsir enrichment) |
| - Quran.com API (fallback tafsir enrichment) |
| Hadith: |
| - AhmedBaset/hadith-json (9 books: Arabic + English, chapter structure) |
| - fawazahmed0/hadith-api (grade information from scholars) |
| |
| Usage: |
| python build_index.py # full build from scratch |
| python build_index.py --force-download # re-download all sources |
| python build_index.py --data-only # generate metadata.json, skip index |
| python build_index.py --index-only # build index from existing metadata.json |
| python build_index.py --skip-tafsir # skip tafsir enrichment |
| """ |
|
|
| import json |
| import os |
| import re |
| import time |
| import argparse |
| import zipfile |
| import numpy as np |
| from pathlib import Path |
| from collections import defaultdict |
| from typing import Any, Dict, List, Optional, Tuple |
|
|
| import faiss |
| import requests |
| from sentence_transformers import SentenceTransformer |
| from tqdm import tqdm |
|
|
| |
| BASE_DIR = Path(__file__).resolve().parent |
| CACHE_DIR = BASE_DIR / "data" / "cache" |
| METADATA_PATH = BASE_DIR / "metadata.json" |
| INDEX_PATH = BASE_DIR / "QModel.index" |
|
|
| |
| QURAN_JSON_URL = ( |
| "https://raw.githubusercontent.com/risan/quran-json/main/data/quran.json" |
| ) |
| CHAPTERS_EN_URL = ( |
| "https://raw.githubusercontent.com/risan/quran-json/main/data/chapters/en.json" |
| ) |
| SEMARKETIR_SURAH_URL_TPL = ( |
| "https://raw.githubusercontent.com/semarketir/quranjson" |
| "/master/source/surah/surah_{n}.json" |
| ) |
| SEMARKETIR_TRANSLATION_URL_TPL = ( |
| "https://raw.githubusercontent.com/semarketir/quranjson" |
| "/master/source/translation/en/en_translation_{n}.json" |
| ) |
| |
| CDN_CHAPTER_EN_URL_TPL = ( |
| "https://cdn.jsdelivr.net/npm/quran-json@3.1.2/dist/chapters/en/{n}.json" |
| ) |
|
|
| |
| KAGGLE_TAFSIR_URL = ( |
| "https://www.kaggle.com/api/v1/datasets/download/" |
| "abdelrahmanahmed110/quranic-ayahs-with-tafseer-json-dataset" |
| ) |
| |
| QURAN_API_BASE = "https://api.quran.com/api/v4" |
| TAFSIR_EN_ID = 169 |
| TAFSIR_AR_ID = 16 |
|
|
| |
| AHMEDBASET_BASE_URL = ( |
| "https://raw.githubusercontent.com/AhmedBaset/hadith-json" |
| "/main/db/by_book/the_9_books" |
| ) |
| HADITH_BOOKS = { |
| "ahmed.json": { |
| "collection": "Musnad Ahmad", |
| "id_prefix": "ahmad", |
| "author": "Imam Ahmad ibn Hanbal", |
| }, |
| "bukhari.json": { |
| "collection": "Sahih al-Bukhari", |
| "id_prefix": "bukhari", |
| "author": "Muhammad al-Bukhari", |
| }, |
| "muslim.json": { |
| "collection": "Sahih Muslim", |
| "id_prefix": "muslim", |
| "author": "Muslim ibn al-Hajjaj", |
| }, |
| "abudawud.json": { |
| "collection": "Sunan Abu Dawood", |
| "id_prefix": "abudawud", |
| "author": "Abu Dawood Sulaiman", |
| }, |
| "tirmidhi.json": { |
| "collection": "Jami' at-Tirmidhi", |
| "id_prefix": "tirmidhi", |
| "author": "Al-Tirmidhi", |
| }, |
| "ibnmajah.json": { |
| "collection": "Sunan Ibn Majah", |
| "id_prefix": "ibnmajah", |
| "author": "Ibn Majah al-Qazwini", |
| }, |
| "nasai.json": { |
| "collection": "Sunan an-Nasai", |
| "id_prefix": "nasai", |
| "author": "Ahmad al-Nasai", |
| }, |
| "malik.json": { |
| "collection": "Muwatta Malik", |
| "id_prefix": "malik", |
| "author": "Malik ibn Anas", |
| }, |
| "darimi.json": { |
| "collection": "Sunan al-Darimi", |
| "id_prefix": "darimi", |
| "author": "Al-Darimi", |
| }, |
| } |
|
|
| |
| FAWAZ_CDN_BASE = "https://cdn.jsdelivr.net/gh/fawazahmed0/hadith-api@1" |
| FAWAZ_RAW_BASE = ( |
| "https://raw.githubusercontent.com/fawazahmed0/hadith-api/1" |
| ) |
| FAWAZ_EDITION_MAP = { |
| "bukhari": "eng-bukhari", |
| "muslim": "eng-muslim", |
| "abudawud": "eng-abudawud", |
| "tirmidhi": "eng-tirmidhi", |
| "nasai": "eng-nasai", |
| "ibnmajah": "eng-ibnmajah", |
| "malik": "eng-malik", |
| "ahmad": "eng-ahmed", |
| "darimi": "eng-darimi", |
| } |
|
|
| |
| DEFAULT_EMBED_MODEL = "intfloat/multilingual-e5-large" |
| EMBED_BATCH_SIZE = 32 |
| REQUEST_TIMEOUT = 60 |
| RETRY_ATTEMPTS = 3 |
| RETRY_DELAY = 2 |
|
|
|
|
| |
| |
| |
|
|
| def _ensure_dir(path: Path): |
| path.mkdir(parents=True, exist_ok=True) |
|
|
|
|
def download_json(
    url: str,
    cache_path: Optional[Path] = None,
    force: bool = False,
) -> Any:
    """Fetch JSON from *url* with optional on-disk caching and retries.

    The cache (when given) is consulted first unless *force* is set.  A
    failed request is retried up to RETRY_ATTEMPTS times with a linearly
    growing back-off; the last failure is re-raised.
    """
    if cache_path and cache_path.exists() and not force:
        return json.loads(cache_path.read_text(encoding="utf-8"))

    attempt = 0
    while True:
        attempt += 1
        try:
            resp = requests.get(url, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            payload = resp.json()
            # Cache the parsed payload only after a fully successful fetch.
            if cache_path:
                _ensure_dir(cache_path.parent)
                with open(cache_path, "w", encoding="utf-8") as f:
                    json.dump(payload, f, ensure_ascii=False)
            return payload
        except Exception as exc:
            if attempt >= RETRY_ATTEMPTS:
                raise
            print(f" Retry {attempt}/{RETRY_ATTEMPTS} for {url}: {exc}")
            time.sleep(RETRY_DELAY * attempt)
|
|
|
|
def download_file(
    url: str,
    cache_path: Path,
    force: bool = False,
    auth: Optional[Tuple[str, str]] = None,
) -> Path:
    """Download *url* to *cache_path* (streamed), returning the path.

    A non-empty cached file short-circuits the download unless *force* is
    set.  Failed requests are retried with a linearly growing back-off.
    """
    have_cached = cache_path.exists() and cache_path.stat().st_size > 0
    if have_cached and not force:
        return cache_path

    _ensure_dir(cache_path.parent)
    attempt = 0
    while True:
        attempt += 1
        try:
            resp = requests.get(
                url, timeout=REQUEST_TIMEOUT, stream=True, auth=auth,
            )
            resp.raise_for_status()
            # Stream to disk in 8 KiB chunks to keep memory flat.
            with open(cache_path, "wb") as out:
                for chunk in resp.iter_content(chunk_size=8192):
                    out.write(chunk)
            return cache_path
        except Exception as exc:
            if attempt >= RETRY_ATTEMPTS:
                raise
            print(f" Retry {attempt}/{RETRY_ATTEMPTS}: {exc}")
            time.sleep(RETRY_DELAY * attempt)
|
|
|
|
def strip_html(text: str) -> str:
    """Strip HTML tags from *text* and collapse whitespace to single spaces."""
    # Each tag becomes a space so adjacent words don't fuse together.
    no_tags = re.sub(r"<[^>]+>", " ", text)
    collapsed = re.sub(r"\s+", " ", no_tags)
    return collapsed.strip()
|
|
|
|
| def _kaggle_auth() -> Optional[Tuple[str, str]]: |
| """Return (username, key) from env vars or ~/.kaggle/kaggle.json.""" |
| username = os.environ.get("KAGGLE_USERNAME") |
| key = os.environ.get("KAGGLE_KEY") |
| if username and key: |
| return (username, key) |
| kaggle_json = Path.home() / ".kaggle" / "kaggle.json" |
| if kaggle_json.exists(): |
| with open(kaggle_json, "r") as f: |
| creds = json.load(f) |
| u, k = creds.get("username"), creds.get("key") |
| if u and k: |
| return (u, k) |
| return None |
|
|
|
|
| |
| |
| |
|
|
def fetch_quran_sources(
    force: bool = False,
) -> Tuple[Dict[int, Dict], Dict[int, Dict], Dict[int, Dict], Dict[int, Dict]]:
    """Download Quran data from all sources.

    Returns (cdn_chapters, quran_data, chapter_meta, semarketir_translations).
    cdn_chapters: { surah_num: { "id", "name", "transliteration", "translation",
                   "type", "total_verses", "verses": [{"id", "text",
                   "translation", "transliteration"}] } } (primary)
    quran_data: raw quran.json { "N": [{"chapter", "verse", "text"}] }
    chapter_meta: { surah_num: {"id", "name", "transliteration", "translation",
                   "type", "total_verses"} } (fallback metadata)
    semarketir_translations: { surah_num: { "verse": {"1": "english_text"} } }

    Per-surah downloads tolerate individual failures (printed and skipped),
    so the returned dicts may be missing surahs; build_quran_entries()
    merges them with fallbacks.  *force* bypasses the on-disk cache.
    """
    print("=" * 60)
    print("Step 1: Fetching Quran Sources")
    print("=" * 60)

    # Primary source: one English JSON file per surah from the CDN.
    print(" Downloading per-chapter English data from CDN β¦")
    cdn_chapters: Dict[int, Dict] = {}
    # The Quran has 114 surahs, hence range(1, 115).
    for n in tqdm(range(1, 115), desc=" CDN chapters", leave=True):
        try:
            url = CDN_CHAPTER_EN_URL_TPL.format(n=n)
            data = download_json(
                url,
                cache_path=CACHE_DIR / "quran" / "cdn_en" / f"{n}.json",
                force=force,
            )
            cdn_chapters[n] = data
        except Exception as exc:
            # One bad chapter must not abort the whole build.
            print(f"\n β Chapter {n}: {exc}")
    print(f" β Loaded {len(cdn_chapters)} chapters from CDN")

    # Fallback Arabic text: the full quran.json (all surahs in one file).
    print(" Downloading quran.json from risan/quran-json β¦")
    quran_data = download_json(
        QURAN_JSON_URL,
        cache_path=CACHE_DIR / "quran" / "quran.json",
        force=force,
    )
    print(f" β Loaded {len(quran_data)} surahs")

    # Fallback chapter metadata (names, revelation type, verse counts).
    print(" Downloading chapters/en.json β¦")
    chapters_raw = download_json(
        CHAPTERS_EN_URL,
        cache_path=CACHE_DIR / "quran" / "chapters_en.json",
        force=force,
    )
    chapter_meta: Dict[int, Dict] = {}
    # Normalize both accepted upstream shapes (list of records, or dict
    # keyed by stringified surah number) to { int_surah: record }.
    if isinstance(chapters_raw, list):
        chapter_meta = {ch["id"]: ch for ch in chapters_raw}
    elif isinstance(chapters_raw, dict):
        chapter_meta = {int(k): v for k, v in chapters_raw.items()}
    print(f" β Loaded {len(chapter_meta)} chapter records")

    # Fallback English translation: one file per surah from semarketir.
    print(" Downloading English translations from semarketir/quranjson β¦")
    semarketir_translations: Dict[int, Dict] = {}
    for n in tqdm(range(1, 115), desc=" Semarketir EN", leave=True):
        try:
            url = SEMARKETIR_TRANSLATION_URL_TPL.format(n=n)
            data = download_json(
                url,
                cache_path=CACHE_DIR / "quran" / "semarketir_en" / f"en_translation_{n}.json",
                force=force,
            )
            semarketir_translations[n] = data
        except Exception as exc:
            print(f"\n β Surah {n} translation: {exc}")
    print(f" β Loaded translation for {len(semarketir_translations)} surahs")

    return cdn_chapters, quran_data, chapter_meta, semarketir_translations
|
|
|
|
def build_quran_entries(
    cdn_chapters: Dict[int, Dict],
    quran_data: Dict,
    chapter_meta: Dict[int, Dict],
    semarketir_translations: Dict[int, Dict],
) -> List[Dict]:
    """Merge Quran sources into a list of verse entries.

    Priority:
        Arabic text:      CDN > quran.json
        English:          CDN > semarketir translation
        Transliteration:  CDN
        Chapter metadata: CDN > chapter_meta (chapters/en.json)

    Fixes over the previous version: the 14-key entry dict was duplicated
    across the CDN and fallback branches (now built in one place), and
    metadata fields are guarded with `or ""` so an explicit JSON null in a
    source no longer crashes the `.lower()` call.
    """
    print("\n" + "=" * 60)
    print("Step 2: Building Quran Entries")
    print("=" * 60)

    # Arabic fallback keyed by "surah:verse", taken from raw quran.json.
    arabic_fallback: Dict[str, str] = {}
    for surah_key, verses in quran_data.items():
        if isinstance(verses, list):
            for v in verses:
                vk = f"{v.get('chapter', surah_key)}:{v.get('verse', '')}"
                arabic_fallback[vk] = v.get("text", "")

    # English fallback keyed by "surah:verse", from semarketir translations.
    en_fallback: Dict[str, str] = {}
    for surah_num, sdata in semarketir_translations.items():
        verses = sdata.get("verse", {})
        if isinstance(verses, dict):
            for vnum_str, text in verses.items():
                en_fallback[f"{surah_num}:{vnum_str}"] = text if isinstance(text, str) else ""

    # Union of surahs known to either primary or fallback source.
    all_surahs = sorted(
        set(cdn_chapters.keys())
        | {int(k) for k in quran_data.keys()}
    )

    entries: List[Dict] = []
    for surah_num in all_surahs:
        cdn = cdn_chapters.get(surah_num, {})
        ch = chapter_meta.get(surah_num, {})

        # CDN metadata wins; chapters/en.json fills gaps.  `or ""` guards
        # against explicit nulls in either source.
        surah_name_ar = cdn.get("name", ch.get("name", "")) or ""
        surah_name_en = cdn.get("translation", ch.get("translation", "")) or ""
        surah_translit = cdn.get("transliteration", ch.get("transliteration", "")) or ""
        revelation_type = (cdn.get("type", ch.get("type", "")) or "").lower()
        total_verses = cdn.get("total_verses", ch.get("total_verses", 0))

        def _verse_entry(verse_num, arabic, english, transliteration):
            """Shape one verse record (shared by both source branches)."""
            vk = f"{surah_num}:{verse_num}"
            return {
                "id": vk,
                "arabic": arabic,
                "english": english,
                "source": f"Surah {surah_name_ar} {vk}",
                "surah_number": surah_num,
                "surah_name_en": surah_name_en,
                "surah_name_ar": surah_name_ar,
                "verse_number": verse_num,
                "transliteration": transliteration,
                "type": "quran",
                "surah_name_transliteration": surah_translit,
                "revelation_type": revelation_type,
                "total_verses": total_verses,
            }

        cdn_verses = cdn.get("verses", [])
        if cdn_verses:
            # Primary branch: CDN verse data, falling back per field.
            for verse in cdn_verses:
                verse_num = verse["id"]
                vk = f"{surah_num}:{verse_num}"
                entries.append(_verse_entry(
                    verse_num,
                    verse.get("text", arabic_fallback.get(vk, "")),
                    verse.get("translation", en_fallback.get(vk, "")),
                    verse.get("transliteration", ""),
                ))
        else:
            # Fallback branch: quran.json Arabic + semarketir English.
            raw_verses = quran_data.get(str(surah_num), [])
            if isinstance(raw_verses, list):
                for v in raw_verses:
                    verse_num = v.get("verse", v.get("id", 0))
                    vk = f"{surah_num}:{verse_num}"
                    entries.append(_verse_entry(
                        verse_num,
                        v.get("text", ""),
                        en_fallback.get(vk, ""),
                        "",
                    ))

    print(f" β Built {len(entries):,} Quran verses across {len(all_surahs)} surahs")
    return entries
|
|
|
|
| |
| |
| |
|
|
| def _extract_verse_key(item: Dict) -> Optional[str]: |
| """Try to extract a 'surah:verse' key from a tafsir record.""" |
| surah_fields = [ |
| "sura_no", "surah", "surah_number", "sura", |
| "chapter", "chapter_no", "SuraID", "SurahNumber", |
| ] |
| verse_fields = [ |
| "aya_no", "ayah", "verse_number", "aya", |
| "verse", "ayah_number", "AyaID", "VerseNumber", |
| ] |
|
|
| surah = verse = None |
| for f in surah_fields: |
| if f in item: |
| surah = item[f] |
| break |
| for f in verse_fields: |
| if f in item: |
| verse = item[f] |
| break |
|
|
| if surah is not None and verse is not None: |
| return f"{int(surah)}:{int(verse)}" |
|
|
| if "verse_key" in item: |
| return item["verse_key"] |
| return None |
|
|
|
|
def _extract_tafsir_text(item: Dict) -> Optional[Dict[str, str]]:
    """Try to extract tafsir text from a tafsir record.

    Returns a dict with "tafsir_en" and/or "tafsir_ar" keys (HTML stripped),
    or None when no usable text field was found.
    """
    result: Dict[str, str] = {}

    # Field-name variants observed across tafsir datasets.
    en_fields = [
        "tafseer_en", "tafsir_en", "tafseer_english", "tafsir_english",
        "english_tafsir", "english_tafseer", "interpretation_en",
    ]
    ar_fields = [
        "tafseer_ar", "tafsir_ar", "tafseer_arabic", "tafsir_arabic",
        "arabic_tafsir", "arabic_tafseer", "interpretation_ar",
        "tafseer", "tafsir",
    ]

    # English: first populated candidate field wins.
    for f in en_fields:
        if f in item and item[f]:
            result["tafsir_en"] = strip_html(str(item[f]))
            break

    # Arabic: heuristic language check on the first populated candidate.
    for f in ar_fields:
        if f in item and item[f]:
            val = str(item[f])
            # Treat as Arabic if any character falls in the Arabic Unicode
            # block (U+0600-U+06FF).
            if any("\u0600" <= c <= "\u06ff" for c in val):
                result["tafsir_ar"] = strip_html(val)
            elif "tafsir_en" not in result:
                # Field is labelled Arabic but holds non-Arabic text; use
                # it as English only if nothing English was found above.
                result["tafsir_en"] = strip_html(val)
            # NOTE(review): this break fires on the FIRST populated ar_field
            # even when it contributed nothing (non-Arabic value while
            # tafsir_en is already set), so later ar_fields are never
            # consulted — confirm this is intended before changing.
            break

    # Last resort: nested {"tafseer": {...}} / {"tafsir": {...}} objects.
    if not result:
        for key in ("tafseer", "tafsir"):
            obj = item.get(key)
            if isinstance(obj, dict):
                for _, val in obj.items():
                    if val:
                        result["tafsir_en"] = strip_html(str(val))
                        break
                # NOTE(review): unconditional break — only the first key
                # that maps to a dict is examined, even if it held no
                # truthy value.
                break

    return result if result else None
|
|
|
|
def _load_tafsir_from_records(records: List[Dict]) -> Dict[str, Dict[str, str]]:
    """Build verse-key β tafsir dict from a list of records.

    Records lacking a resolvable verse key or any tafsir text are skipped;
    multiple records for the same verse are merged (later fields win).
    """
    collected: Dict[str, Dict[str, str]] = {}
    for record in records:
        key = _extract_verse_key(record)
        if not key:
            continue
        payload = _extract_tafsir_text(record)
        if payload:
            collected.setdefault(key, {}).update(payload)
    return collected
|
|
|
|
def fetch_kaggle_tafsir(
    force: bool = False,
) -> Optional[Dict[str, Dict[str, str]]]:
    """Download and parse the Kaggle tafsir dataset (ZIP).

    Returns { "surah:verse": {"tafsir_en": β¦, "tafsir_ar": β¦} } or None.
    Every failure path returns None instead of raising, so the caller can
    fall back to the Quran.com API.
    """
    zip_path = CACHE_DIR / "tafsir" / "kaggle_tafsir.zip"
    extract_dir = CACHE_DIR / "tafsir" / "kaggle_extracted"

    # Download the dataset archive (Kaggle requires credentials).
    try:
        print(" Downloading Kaggle tafsir dataset β¦")
        auth = _kaggle_auth()
        download_file(KAGGLE_TAFSIR_URL, zip_path, force=force, auth=auth)
    except Exception as exc:
        print(f" β Kaggle download failed: {exc}")
        print(
            " Tip: set KAGGLE_USERNAME and KAGGLE_KEY env vars, "
            "or place kaggle.json in ~/.kaggle/"
        )
        return None

    # Reject non-ZIP responses (an unauthenticated request can yield an
    # error payload rather than the archive).
    if not zipfile.is_zipfile(zip_path):
        print(" β Downloaded file is not a valid ZIP (may need Kaggle auth)")
        return None

    # Unpack into the cache so repeated runs can reuse the extracted JSON.
    try:
        _ensure_dir(extract_dir)
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(extract_dir)
        print(f" β Extracted to {extract_dir}")
    except Exception as exc:
        print(f" β Failed to extract ZIP: {exc}")
        return None

    # The archive layout is not fixed; scan recursively for JSON files.
    json_files = list(extract_dir.rglob("*.json"))
    if not json_files:
        print(" β No JSON files found in Kaggle archive")
        return None

    print(f" Found {len(json_files)} JSON file(s) in archive")
    tafsir_map: Dict[str, Dict[str, str]] = {}

    for jf in json_files:
        try:
            with open(jf, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception as exc:
            # Skip unparseable files; others may still yield data.
            print(f" β Error parsing {jf.name}: {exc}")
            continue

        # Handle the three shapes seen in the wild: a top-level list of
        # records, or a dict whose values are record lists or single
        # record dicts.
        if isinstance(data, list):
            tafsir_map.update(_load_tafsir_from_records(data))
        elif isinstance(data, dict):
            for _key, value in data.items():
                if isinstance(value, list):
                    tafsir_map.update(_load_tafsir_from_records(value))
                elif isinstance(value, dict):
                    vk = _extract_verse_key(value)
                    if vk:
                        tt = _extract_tafsir_text(value)
                        if tt:
                            tafsir_map.setdefault(vk, {}).update(tt)

    if tafsir_map:
        print(f" β Loaded tafsir for {len(tafsir_map):,} verses from Kaggle")
    return tafsir_map if tafsir_map else None
|
|
|
|
def _fetch_tafsir_chapter_api(
    tafsir_id: int, chapter: int,
) -> Dict[str, str]:
    """Fetch all tafsir entries for a chapter from Quran.com API.

    Follows the API's pagination until "next_page" is null; returns a
    map of verse_key -> HTML-stripped tafsir text.
    """
    collected: Dict[str, str] = {}
    next_page: Optional[int] = 1
    while next_page is not None:
        url = (
            f"{QURAN_API_BASE}/tafsirs/{tafsir_id}/by_chapter/{chapter}"
            f"?per_page=50&page={next_page}"
        )
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        payload = resp.json()

        for record in payload.get("tafsirs", []):
            body = record.get("text", "")
            if body:
                collected[record["verse_key"]] = strip_html(body)

        next_page = payload.get("pagination", {}).get("next_page")
        if next_page is not None:
            # Brief pause between pages to stay polite to the API.
            time.sleep(0.3)
    return collected
|
|
|
|
def fetch_qurancom_tafsir(
    surah_numbers: List[int],
) -> Dict[str, Dict[str, str]]:
    """Fallback: fetch tafsir from Quran.com API.

    Queries English and Arabic tafsir per surah; failures for one surah
    are printed and skipped so the rest of the run can proceed.
    """
    print(" Falling back to Quran.com API for tafsir β¦")
    collected: Dict[str, Dict[str, str]] = {}

    for surah in tqdm(surah_numbers, desc=" Fetching tafsir"):
        try:
            english = _fetch_tafsir_chapter_api(TAFSIR_EN_ID, surah)
            time.sleep(0.3)
            arabic = _fetch_tafsir_chapter_api(TAFSIR_AR_ID, surah)
            time.sleep(0.3)
        except Exception as exc:
            print(f"\n β Surah {surah}: {exc}")
            continue

        for key, text in english.items():
            collected.setdefault(key, {})["tafsir_en"] = text
        for key, text in arabic.items():
            collected.setdefault(key, {})["tafsir_ar"] = text

    return collected
|
|
|
|
def enrich_quran_with_tafsir(
    entries: List[Dict],
    force_download: bool = False,
) -> List[Dict]:
    """Add tafsir fields to Quran entries (Kaggle β Quran.com fallback).

    Mutates entries in place (adding "tafsir_en"/"tafsir_ar") and also
    returns the list for convenience.
    """
    print("\n" + "=" * 60)
    print("Step 3: Enriching Quran with Tafsir")
    print("=" * 60)

    # Preferred source: the Kaggle dataset (a single bulk download).
    tafsir_map = fetch_kaggle_tafsir(force=force_download)

    if not tafsir_map:
        # Kaggle unavailable: query Quran.com for each surah we have.
        wanted_surahs = sorted(
            {e["surah_number"] for e in entries if e.get("type") == "quran"}
        )
        tafsir_map = fetch_qurancom_tafsir(wanted_surahs)

    if not tafsir_map:
        print(" β No tafsir data available")
        return entries

    hit_count = 0
    for entry in entries:
        if entry.get("type") != "quran":
            continue
        key = f"{entry['surah_number']}:{entry['verse_number']}"
        found = tafsir_map.get(key, {})
        entry["tafsir_en"] = found.get("tafsir_en", "")
        entry["tafsir_ar"] = found.get("tafsir_ar", "")
        if entry["tafsir_en"] or entry["tafsir_ar"]:
            hit_count += 1

    print(f" β Enriched {hit_count:,} verses with tafsir")
    return entries
|
|
|
|
| |
| |
| |
|
|
| def _pick_best_grade(grades: List[Dict]) -> str: |
| """Pick the most authoritative grade from a list of scholar grades.""" |
| priority = ["darussalam", "al-albani", "zubair ali zai"] |
| grade_map = {} |
| for g in grades: |
| name = g.get("name", "").lower() |
| grade_text = g.get("grade", "") |
| if grade_text: |
| grade_map[name] = grade_text |
|
|
| for scholar in priority: |
| for name, grade in grade_map.items(): |
| if scholar in name: |
| return grade |
|
|
| for g in grades: |
| if g.get("grade"): |
| return g["grade"] |
| return "" |
|
|
|
|
def _fetch_fawaz_grades(
    edition: str, force: bool = False,
) -> Optional[Dict[int, str]]:
    """Fetch grades for a hadith edition from fawazahmed0.

    Tries the jsDelivr CDN first, then raw GitHub; returns a map of
    hadith number -> best grade, or None when neither URL works.
    """
    cache_path = CACHE_DIR / "hadith" / "fawazahmed0" / f"{edition}.json"

    candidate_urls = (
        f"{FAWAZ_CDN_BASE}/editions/{edition}.json",
        f"{FAWAZ_RAW_BASE}/editions/{edition}.json",
    )

    payload = None
    for candidate in candidate_urls:
        try:
            payload = download_json(candidate, cache_path=cache_path, force=force)
        except Exception:
            # Try the next mirror.
            continue
        break

    if not payload:
        return None

    graded: Dict[int, str] = {}
    for hadith in payload.get("hadiths", []):
        number = hadith.get("hadithnumber")
        scholar_grades = hadith.get("grades", [])
        if number is not None and scholar_grades:
            graded[int(number)] = _pick_best_grade(scholar_grades)
    return graded
|
|
|
|
def fetch_hadith_sources(
    force: bool = False,
) -> Tuple[Dict[str, Dict], Dict[str, Dict[int, str]]]:
    """Download hadith data from AhmedBaset and grades from fawazahmed0.

    Returns (ahmedbaset_books, fawaz_grades):
        ahmedbaset_books: { filename: parsed book JSON } per HADITH_BOOKS
        fawaz_grades:     { id_prefix: { hadith_number: grade } }

    Per-book failures are printed and skipped, so either dict may be
    missing collections.  *force* bypasses the on-disk cache.
    """
    print("\n" + "=" * 60)
    print("Step 4a: Fetching Hadith Sources")
    print("=" * 60)

    print(" Downloading from AhmedBaset/hadith-json β¦")
    ahmedbaset_books: Dict[str, Dict] = {}
    for filename in tqdm(HADITH_BOOKS.keys(), desc=" Books"):
        try:
            # BUG FIX: the URL previously contained a literal placeholder
            # instead of the book filename, so every request hit the same
            # invalid path; build it from the HADITH_BOOKS key.
            url = f"{AHMEDBASET_BASE_URL}/{filename}"
            data = download_json(
                url,
                cache_path=CACHE_DIR / "hadith" / "ahmedbaset" / filename,
                force=force,
            )
            ahmedbaset_books[filename] = data
        except Exception as exc:
            # Report which book failed (was also a placeholder before).
            print(f"\n β {filename}: {exc}")
    print(f" β Loaded {len(ahmedbaset_books)} books")

    print(" Downloading grade data from fawazahmed0/hadith-api β¦")
    fawaz_grades: Dict[str, Dict[int, str]] = {}
    for prefix, edition in tqdm(FAWAZ_EDITION_MAP.items(), desc=" Editions"):
        grades = _fetch_fawaz_grades(edition, force)
        if grades:
            fawaz_grades[prefix] = grades
    print(f" β Loaded grades for {len(fawaz_grades)} collections")

    return ahmedbaset_books, fawaz_grades
|
|
|
|
def build_hadith_entries(
    ahmedbaset_books: Dict[str, Dict],
    fawaz_grades: Dict[str, Dict[int, str]],
) -> List[Dict]:
    """Merge AhmedBaset data with fawazahmed0 grades into hadith entries.

    Args:
        ahmedbaset_books: { filename: parsed book JSON } from fetch_hadith_sources.
        fawaz_grades:     { id_prefix: { hadith_number: grade } }.

    Returns:
        A list of entry dicts with type == "hadith".
    """
    print("\n" + "=" * 60)
    print("Step 4b: Building Hadith Entries")
    print("=" * 60)

    entries: List[Dict] = []
    stats: Dict[str, int] = defaultdict(int)

    for filename, book_config in HADITH_BOOKS.items():
        book_data = ahmedbaset_books.get(filename)
        if not book_data:
            # BUG FIX: previously printed a literal placeholder instead
            # of the skipped book's filename.
            print(f" β Skipping {filename} (not downloaded)")
            continue

        prefix = book_config["id_prefix"]
        grades = fawaz_grades.get(prefix, {})
        hadiths = book_data.get("hadiths", [])
        # Chapter id -> Arabic chapter title, for fast lookup per hadith.
        chapter_map = {
            ch.get("id"): ch.get("arabic", "")
            for ch in book_data.get("chapters", [])
        }

        for hadith in hadiths:
            hadith_num = hadith.get("idInBook", hadith.get("id", ""))

            # English text arrives either as {"narrator": ..., "text": ...}
            # or as a plain string; normalize to one string.
            if isinstance(hadith.get("english"), dict):
                parts = []
                if hadith["english"].get("narrator"):
                    parts.append(hadith["english"]["narrator"])
                if hadith["english"].get("text"):
                    parts.append(hadith["english"]["text"])
                english = " ".join(parts)
            else:
                english = str(hadith.get("english", ""))

            # Resolve the chapter title when the hadith references one.
            chapter_name = ""
            if "chapterId" in hadith:
                chapter_name = chapter_map.get(hadith["chapterId"], "")

            # Grades are keyed by integer hadith number; tolerate ids that
            # are not parseable as int instead of aborting the build.
            grade = ""
            if hadith_num:
                try:
                    grade = grades.get(int(hadith_num), "")
                except (TypeError, ValueError):
                    grade = ""

            entries.append(
                {
                    "id": f"{prefix}_{hadith_num}",
                    "arabic": hadith.get("arabic", ""),
                    "english": english,
                    "reference": f"{book_config['collection']} {hadith_num}",
                    "hadith_number": hadith_num,
                    "collection": book_config["collection"],
                    "chapter": chapter_name,
                    "grade": grade,
                    "type": "hadith",
                    "author": book_config["author"],
                }
            )
            stats[book_config["collection"]] += 1

    print(f" β Built {len(entries):,} hadith entries")
    print("\n Breakdown:")
    for collection, count in sorted(stats.items()):
        print(f" {collection}: {count:,}")

    graded = sum(1 for e in entries if e.get("grade"))
    print(f"\n Hadiths with grades: {graded:,} / {len(entries):,}")
    return entries
|
|
|
|
| |
| |
| |
|
|
def generate_metadata(
    quran_entries: List[Dict],
    hadith_entries: List[Dict],
) -> List[Dict]:
    """Combine all entries and write metadata.json.

    Returns the combined document list so the caller can feed it straight
    into the index builder.
    """
    print("\n" + "=" * 60)
    print("Step 5: Generating metadata.json")
    print("=" * 60)

    documents = quran_entries + hadith_entries

    print(f" Quran entries: {len(quran_entries):,}")
    print(f" Hadith entries: {len(hadith_entries):,}")
    print(f" Total: {len(documents):,}")

    # Duplicate IDs are tolerated but reported, since they would make
    # index hits ambiguous.
    seen_ids = set()
    dupes = 0
    for doc in documents:
        if doc["id"] in seen_ids:
            dupes += 1
        else:
            seen_ids.add(doc["id"])
    if dupes:
        print(f" β Warning: {dupes} duplicate IDs found")

    print(f" Writing to {METADATA_PATH} β¦")
    with open(METADATA_PATH, "w", encoding="utf-8") as f:
        json.dump(documents, f, ensure_ascii=False, indent=2)

    size_mb = METADATA_PATH.stat().st_size / (1024 * 1024)
    print(f" β File size: {size_mb:.2f} MB")
    return documents
|
|
|
|
| |
| |
| |
|
|
def build_faiss_index(
    documents: List[Dict],
    model_name: str = DEFAULT_EMBED_MODEL,
):
    """Generate embeddings and build FAISS index.

    Embeds every document with a sentence-transformer, L2-normalizes the
    vectors and stores them in an inner-product flat index (so inner
    product behaves as cosine similarity), then writes it to INDEX_PATH.
    """
    print("\n" + "=" * 60)
    print("Step 6: Building FAISS Index")
    print("=" * 60)

    print(f" Loading embedding model: {model_name}")
    model = SentenceTransformer(model_name)
    embedding_dim = model.get_sentence_embedding_dimension()
    print(f" Embedding dimension: {embedding_dim}")

    def _document_text(doc: Dict) -> str:
        """Compose the text that represents one document in the index."""
        if doc.get("type") == "quran":
            # Cap the tafsir snippet so commentary informs retrieval
            # without dominating the embedding.
            snippet = doc.get("tafsir_en", "")[:500]
            composed = (
                f"{doc.get('arabic', '')} {doc.get('english', '')} "
                f"{snippet}"
            )
        else:
            composed = (
                f"{doc.get('collection', '')} "
                f"{doc.get('arabic', '')} "
                f"{doc.get('english', '')}"
            )
        return composed.strip()

    all_texts = [_document_text(doc) for doc in documents]

    print(f"\n Generating embeddings for {len(all_texts):,} documents β¦")
    rows: List = []
    for start in tqdm(
        range(0, len(all_texts), EMBED_BATCH_SIZE),
        desc=" Embedding batches",
    ):
        chunk = all_texts[start : start + EMBED_BATCH_SIZE]
        rows.extend(model.encode(chunk, convert_to_numpy=True))

    embeddings = np.array(rows, dtype=np.float32)
    print(f" Embeddings shape: {embeddings.shape}")

    print("\n Creating FAISS index (IndexFlatIP + L2 normalization) β¦")
    index = faiss.IndexFlatIP(embedding_dim)
    faiss.normalize_L2(embeddings)  # in-place; makes IP == cosine
    index.add(embeddings)

    print(f" Saving to {INDEX_PATH}")
    faiss.write_index(index, str(INDEX_PATH))

    size_mb = INDEX_PATH.stat().st_size / (1024 * 1024)
    print(f"\n {'=' * 50}")
    print(f" Index Build Complete")
    print(f" {'=' * 50}")
    print(f" Documents indexed: {index.ntotal:,}")
    print(f" Index file size: {size_mb:.2f} MB")
|
|
|
|
| |
| |
| |
|
|
def main():
    """CLI entry point: parse flags and run the build pipeline.

    Pipeline: fetch Quran sources -> build verse entries -> optional tafsir
    enrichment -> fetch/build hadith entries -> write metadata.json ->
    optional FAISS index build.
    """
    parser = argparse.ArgumentParser(
        description=(
            "QModel Dataset Builder v2 β builds metadata.json and "
            "QModel.index from scratch using multiple authoritative sources"
        ),
    )
    parser.add_argument(
        "--index-only",
        action="store_true",
        help="Only build FAISS index from existing metadata.json",
    )
    parser.add_argument(
        "--data-only",
        action="store_true",
        help="Only generate metadata.json, skip index building",
    )
    parser.add_argument(
        "--skip-tafsir",
        action="store_true",
        help="Skip tafsir enrichment",
    )
    parser.add_argument(
        "--force-download",
        action="store_true",
        help="Re-download all sources even if cached",
    )
    parser.add_argument(
        "--model",
        default=DEFAULT_EMBED_MODEL,
        help=f"Sentence-transformer model for embeddings (default: {DEFAULT_EMBED_MODEL})",
    )
    args = parser.parse_args()

    # --index-only short-circuits the whole data pipeline: reuse the
    # previously generated metadata.json and just rebuild the index.
    if args.index_only:
        print("Loading existing metadata.json β¦")
        with open(METADATA_PATH, "r", encoding="utf-8") as f:
            documents = json.load(f)
        build_faiss_index(documents, model_name=args.model)
        print("\nβ Done!")
        return

    force = args.force_download

    # Step 1: download all Quran sources (primary CDN + fallbacks).
    cdn_chapters, quran_data, chapter_meta, sem_translations = fetch_quran_sources(force=force)

    # Step 2: merge the sources into one verse entry per ayah.
    quran_entries = build_quran_entries(cdn_chapters, quran_data, chapter_meta, sem_translations)

    # Step 3 (optional): attach tafsir text to each verse.
    if not args.skip_tafsir:
        quran_entries = enrich_quran_with_tafsir(
            quran_entries, force_download=force,
        )
    else:
        print("\nSkipping tafsir enrichment (--skip-tafsir)")

    # Step 4: download hadith books and grades, then build entries.
    ahmedbaset_books, fawaz_grades = fetch_hadith_sources(force=force)
    hadith_entries = build_hadith_entries(ahmedbaset_books, fawaz_grades)

    # Step 5: combine everything and persist metadata.json.
    documents = generate_metadata(quran_entries, hadith_entries)

    # Step 6 (optional): embed the documents and write the FAISS index.
    if not args.data_only:
        build_faiss_index(documents, model_name=args.model)
    else:
        print("\nSkipping index build (--data-only)")

    print("\nβ Done!")
|
|
|
# Script entry point: run the full build when executed directly.
if __name__ == "__main__":
    main()
|
|