#!/usr/bin/env python3
"""
QModel Dataset Builder v2
=========================
Builds metadata.json and QModel.index from scratch using multiple
authoritative sources.
Data Sources:
Quran:
- risan/quran-json (Arabic text + English translation + chapter metadata)
- semarketir/quranjson (verse transliteration)
Tafsir:
- Kaggle tafseer dataset (primary tafsir enrichment)
- Quran.com API (fallback tafsir enrichment)
Hadith:
- AhmedBaset/hadith-json (9 books: Arabic + English, chapter structure)
- fawazahmed0/hadith-api (grade information from scholars)
Usage:
python build_index.py # full build from scratch
python build_index.py --force-download # re-download all sources
python build_index.py --data-only # generate metadata.json, skip index
python build_index.py --index-only # build index from existing metadata.json
python build_index.py --skip-tafsir # skip tafsir enrichment
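
Note:
    Tafsir enrichment first tries the Kaggle dataset, which requires
    credentials: set KAGGLE_USERNAME and KAGGLE_KEY, or place kaggle.json
    in ~/.kaggle/. Without credentials the builder falls back to the
    Quran.com API.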
"""
import json
import os
import re
import time
import argparse
import zipfile
import numpy as np
from pathlib import Path
from collections import defaultdict
from typing import Any, Dict, List, Optional, Tuple
import faiss
import requests
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
# ── Paths ──────────────────────────────────────────────────────────────
BASE_DIR = Path(__file__).resolve().parent
CACHE_DIR = BASE_DIR / "data" / "cache"
METADATA_PATH = BASE_DIR / "metadata.json"
INDEX_PATH = BASE_DIR / "QModel.index"
# ── Quran source URLs ─────────────────────────────────────────────────
QURAN_JSON_URL = (
"https://raw.githubusercontent.com/risan/quran-json/main/data/quran.json"
)
CHAPTERS_EN_URL = (
"https://raw.githubusercontent.com/risan/quran-json/main/data/chapters/en.json"
)
SEMARKETIR_SURAH_URL_TPL = (
"https://raw.githubusercontent.com/semarketir/quranjson"
"/master/source/surah/surah_{n}.json"
)
SEMARKETIR_TRANSLATION_URL_TPL = (
"https://raw.githubusercontent.com/semarketir/quranjson"
"/master/source/translation/en/en_translation_{n}.json"
)
# CDN dist per-chapter English (Arabic + English + transliteration)
CDN_CHAPTER_EN_URL_TPL = (
"https://cdn.jsdelivr.net/npm/quran-json@3.1.2/dist/chapters/en/{n}.json"
)
# ── Tafsir sources ────────────────────────────────────────────────────
KAGGLE_TAFSIR_URL = (
"https://www.kaggle.com/api/v1/datasets/download/"
"abdelrahmanahmed110/quranic-ayahs-with-tafseer-json-dataset"
)
# Fallback: Quran.com API
QURAN_API_BASE = "https://api.quran.com/api/v4"
TAFSIR_EN_ID = 169  # Ibn Kathir (Abridged) – English
TAFSIR_AR_ID = 16   # Al-Muyassar – Arabic
# ── Hadith source: AhmedBaset ─────────────────────────────────────────
AHMEDBASET_BASE_URL = (
"https://raw.githubusercontent.com/AhmedBaset/hadith-json"
"/main/db/by_book/the_9_books"
)
HADITH_BOOKS = {
"ahmed.json": {
"collection": "Musnad Ahmad",
"id_prefix": "ahmad",
"author": "Imam Ahmad ibn Hanbal",
},
"bukhari.json": {
"collection": "Sahih al-Bukhari",
"id_prefix": "bukhari",
"author": "Muhammad al-Bukhari",
},
"muslim.json": {
"collection": "Sahih Muslim",
"id_prefix": "muslim",
"author": "Muslim ibn al-Hajjaj",
},
"abudawud.json": {
"collection": "Sunan Abu Dawood",
"id_prefix": "abudawud",
"author": "Abu Dawood Sulaiman",
},
"tirmidhi.json": {
"collection": "Jami' at-Tirmidhi",
"id_prefix": "tirmidhi",
"author": "Al-Tirmidhi",
},
"ibnmajah.json": {
"collection": "Sunan Ibn Majah",
"id_prefix": "ibnmajah",
"author": "Ibn Majah al-Qazwini",
},
"nasai.json": {
"collection": "Sunan an-Nasai",
"id_prefix": "nasai",
"author": "Ahmad al-Nasai",
},
"malik.json": {
"collection": "Muwatta Malik",
"id_prefix": "malik",
"author": "Malik ibn Anas",
},
"darimi.json": {
"collection": "Sunan al-Darimi",
"id_prefix": "darimi",
"author": "Al-Darimi",
},
}
# ── Hadith source: fawazahmed0 (for grades) ───────────────────────────
FAWAZ_CDN_BASE = "https://cdn.jsdelivr.net/gh/fawazahmed0/hadith-api@1"
FAWAZ_RAW_BASE = (
"https://raw.githubusercontent.com/fawazahmed0/hadith-api/1"
)
FAWAZ_EDITION_MAP = {
"bukhari": "eng-bukhari",
"muslim": "eng-muslim",
"abudawud": "eng-abudawud",
"tirmidhi": "eng-tirmidhi",
"nasai": "eng-nasai",
"ibnmajah": "eng-ibnmajah",
"malik": "eng-malik",
"ahmad": "eng-ahmed",
"darimi": "eng-darimi",
}
# ── Embedding / network config ────────────────────────────────────────
DEFAULT_EMBED_MODEL = "intfloat/multilingual-e5-large"
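# Note: intfloat/multilingual-e5-large embeds into 1024 dimensions; the exact
# dimension is queried from the loaded model at build time (see Step 6).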
EMBED_BATCH_SIZE = 32
REQUEST_TIMEOUT = 60
RETRY_ATTEMPTS = 3
RETRY_DELAY = 2
# ══════════════════════════════════════════════════════════════════════
# UTILITIES
# ══════════════════════════════════════════════════════════════════════
def _ensure_dir(path: Path):
path.mkdir(parents=True, exist_ok=True)
def download_json(
url: str,
cache_path: Optional[Path] = None,
force: bool = False,
) -> Any:
"""Download JSON with optional file caching and retries."""
if cache_path and cache_path.exists() and not force:
with open(cache_path, "r", encoding="utf-8") as f:
return json.load(f)
for attempt in range(1, RETRY_ATTEMPTS + 1):
try:
resp = requests.get(url, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
data = resp.json()
if cache_path:
_ensure_dir(cache_path.parent)
with open(cache_path, "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False)
return data
except Exception as exc:
if attempt == RETRY_ATTEMPTS:
raise
print(f" Retry {attempt}/{RETRY_ATTEMPTS} for {url}: {exc}")
time.sleep(RETRY_DELAY * attempt)
def download_file(
url: str,
cache_path: Path,
force: bool = False,
auth: Optional[Tuple[str, str]] = None,
) -> Path:
"""Download a binary file with caching."""
if cache_path.exists() and cache_path.stat().st_size > 0 and not force:
return cache_path
_ensure_dir(cache_path.parent)
for attempt in range(1, RETRY_ATTEMPTS + 1):
try:
resp = requests.get(
url, timeout=REQUEST_TIMEOUT, stream=True, auth=auth,
)
resp.raise_for_status()
with open(cache_path, "wb") as f:
for chunk in resp.iter_content(chunk_size=8192):
f.write(chunk)
return cache_path
except Exception as exc:
if attempt == RETRY_ATTEMPTS:
raise
print(f" Retry {attempt}/{RETRY_ATTEMPTS}: {exc}")
time.sleep(RETRY_DELAY * attempt)
def strip_html(text: str) -> str:
"""Remove HTML tags and collapse whitespace."""
clean = re.sub(r"<[^>]+>", " ", text)
return re.sub(r"\s+", " ", clean).strip()
def _kaggle_auth() -> Optional[Tuple[str, str]]:
"""Return (username, key) from env vars or ~/.kaggle/kaggle.json."""
username = os.environ.get("KAGGLE_USERNAME")
key = os.environ.get("KAGGLE_KEY")
if username and key:
return (username, key)
kaggle_json = Path.home() / ".kaggle" / "kaggle.json"
if kaggle_json.exists():
with open(kaggle_json, "r") as f:
creds = json.load(f)
u, k = creds.get("username"), creds.get("key")
if u and k:
return (u, k)
return None
# ══════════════════════════════════════════════════════════════════════
# STEP 1: FETCH & BUILD QURAN ENTRIES
# ══════════════════════════════════════════════════════════════════════
def fetch_quran_sources(
force: bool = False,
) -> Tuple[Dict[int, Dict], Dict[int, Dict], Dict[int, Dict], Dict[int, Dict]]:
"""Download Quran data from all sources.
Returns (cdn_chapters, quran_data, chapter_meta, semarketir_translations).
cdn_chapters: { surah_num: { "id", "name", "transliteration", "translation",
"type", "total_verses", "verses": [{"id", "text",
"translation", "transliteration"}] } } (primary)
quran_data: raw quran.json { "N": [{"chapter", "verse", "text"}] }
chapter_meta: { surah_num: {"id", "name", "transliteration", "translation",
"type", "total_verses"} } (fallback metadata)
semarketir_translations: { surah_num: { "verse": {"1": "english_text"} } }
"""
print("=" * 60)
print("Step 1: Fetching Quran Sources")
print("=" * 60)
    # 1a. CDN per-chapter English (primary – has Arabic + English + transliteration)
    print(" Downloading per-chapter English data from CDN …")
cdn_chapters: Dict[int, Dict] = {}
for n in tqdm(range(1, 115), desc=" CDN chapters", leave=True):
try:
url = CDN_CHAPTER_EN_URL_TPL.format(n=n)
data = download_json(
url,
cache_path=CACHE_DIR / "quran" / "cdn_en" / f"{n}.json",
force=force,
)
cdn_chapters[n] = data
except Exception as exc:
print(f"\n βœ— Chapter {n}: {exc}")
print(f" βœ“ Loaded {len(cdn_chapters)} chapters from CDN")
# 1b. risan/quran-json – full Quran text (fallback Arabic)
print(" Downloading quran.json from risan/quran-json …")
quran_data = download_json(
QURAN_JSON_URL,
cache_path=CACHE_DIR / "quran" / "quran.json",
force=force,
)
print(f" βœ“ Loaded {len(quran_data)} surahs")
# 1c. risan/quran-json – chapter metadata (fallback)
print(" Downloading chapters/en.json …")
chapters_raw = download_json(
CHAPTERS_EN_URL,
cache_path=CACHE_DIR / "quran" / "chapters_en.json",
force=force,
)
chapter_meta: Dict[int, Dict] = {}
if isinstance(chapters_raw, list):
chapter_meta = {ch["id"]: ch for ch in chapters_raw}
elif isinstance(chapters_raw, dict):
chapter_meta = {int(k): v for k, v in chapters_raw.items()}
print(f" βœ“ Loaded {len(chapter_meta)} chapter records")
# 1d. semarketir English translations (additional fallback)
print(" Downloading English translations from semarketir/quranjson …")
semarketir_translations: Dict[int, Dict] = {}
for n in tqdm(range(1, 115), desc=" Semarketir EN", leave=True):
try:
url = SEMARKETIR_TRANSLATION_URL_TPL.format(n=n)
data = download_json(
url,
cache_path=CACHE_DIR / "quran" / "semarketir_en" / f"en_translation_{n}.json",
force=force,
)
semarketir_translations[n] = data
except Exception as exc:
print(f"\n βœ— Surah {n} translation: {exc}")
print(f" βœ“ Loaded translation for {len(semarketir_translations)} surahs")
return cdn_chapters, quran_data, chapter_meta, semarketir_translations
def build_quran_entries(
cdn_chapters: Dict[int, Dict],
quran_data: Dict,
chapter_meta: Dict[int, Dict],
semarketir_translations: Dict[int, Dict],
) -> List[Dict]:
"""Merge Quran sources into a list of verse entries.
Priority:
Arabic text: CDN > quran.json
English: CDN > semarketir translation
Transliteration: CDN
Chapter metadata: CDN > chapter_meta (chapters/en.json)
"""
print("\n" + "=" * 60)
print("Step 2: Building Quran Entries")
print("=" * 60)
# Build a fallback Arabic lookup from quran.json
# quran.json: { "N": [{"chapter": int, "verse": int, "text": str}] }
arabic_fallback: Dict[str, str] = {}
for surah_key, verses in quran_data.items():
if isinstance(verses, list):
for v in verses:
vk = f"{v.get('chapter', surah_key)}:{v.get('verse', '')}"
arabic_fallback[vk] = v.get("text", "")
# Build semarketir English fallback
# semarketir_translations: { surah_num: {"verse": {"1": "english_text"}} }
en_fallback: Dict[str, str] = {}
for surah_num, sdata in semarketir_translations.items():
verses = sdata.get("verse", {})
if isinstance(verses, dict):
for vnum_str, text in verses.items():
en_fallback[f"{surah_num}:{vnum_str}"] = text if isinstance(text, str) else ""
# Determine surah numbers to iterate
all_surahs = sorted(
set(cdn_chapters.keys())
| {int(k) for k in quran_data.keys()}
)
entries: List[Dict] = []
for surah_num in all_surahs:
cdn = cdn_chapters.get(surah_num, {})
ch = chapter_meta.get(surah_num, {})
# Chapter metadata – prefer CDN, fallback to chapters_en.json
surah_name_ar = cdn.get("name", ch.get("name", ""))
surah_name_en = cdn.get("translation", ch.get("translation", ""))
surah_translit = cdn.get("transliteration", ch.get("transliteration", ""))
revelation_type = cdn.get("type", ch.get("type", "")).lower()
total_verses = cdn.get("total_verses", ch.get("total_verses", 0))
# Verses from CDN (primary)
cdn_verses = cdn.get("verses", [])
if cdn_verses:
for verse in cdn_verses:
verse_num = verse["id"]
vk = f"{surah_num}:{verse_num}"
entries.append({
"id": vk,
"arabic": verse.get("text", arabic_fallback.get(vk, "")),
"english": verse.get("translation", en_fallback.get(vk, "")),
"source": f"Surah {surah_name_ar} {vk}",
"surah_number": surah_num,
"surah_name_en": surah_name_en,
"surah_name_ar": surah_name_ar,
"verse_number": verse_num,
"transliteration": verse.get("transliteration", ""),
"type": "quran",
"surah_name_transliteration": surah_translit,
"revelation_type": revelation_type,
"total_verses": total_verses,
})
else:
# Fallback: build from quran.json verses
raw_verses = quran_data.get(str(surah_num), [])
if isinstance(raw_verses, list):
for v in raw_verses:
verse_num = v.get("verse", v.get("id", 0))
vk = f"{surah_num}:{verse_num}"
entries.append({
"id": vk,
"arabic": v.get("text", ""),
"english": en_fallback.get(vk, ""),
"source": f"Surah {surah_name_ar} {vk}",
"surah_number": surah_num,
"surah_name_en": surah_name_en,
"surah_name_ar": surah_name_ar,
"verse_number": verse_num,
"transliteration": "",
"type": "quran",
"surah_name_transliteration": surah_translit,
"revelation_type": revelation_type,
"total_verses": total_verses,
})
print(f" βœ“ Built {len(entries):,} Quran verses across {len(all_surahs)} surahs")
return entries
# ══════════════════════════════════════════════════════════════════════
# STEP 3: ENRICH QURAN WITH TAFSIR
# ══════════════════════════════════════════════════════════════════════
def _extract_verse_key(item: Dict) -> Optional[str]:
"""Try to extract a 'surah:verse' key from a tafsir record."""
surah_fields = [
"sura_no", "surah", "surah_number", "sura",
"chapter", "chapter_no", "SuraID", "SurahNumber",
]
verse_fields = [
"aya_no", "ayah", "verse_number", "aya",
"verse", "ayah_number", "AyaID", "VerseNumber",
]
surah = verse = None
for f in surah_fields:
if f in item:
surah = item[f]
break
for f in verse_fields:
if f in item:
verse = item[f]
break
if surah is not None and verse is not None:
return f"{int(surah)}:{int(verse)}"
if "verse_key" in item:
return item["verse_key"]
return None
def _extract_tafsir_text(item: Dict) -> Optional[Dict[str, str]]:
"""Try to extract tafsir text from a tafsir record."""
result: Dict[str, str] = {}
en_fields = [
"tafseer_en", "tafsir_en", "tafseer_english", "tafsir_english",
"english_tafsir", "english_tafseer", "interpretation_en",
]
ar_fields = [
"tafseer_ar", "tafsir_ar", "tafseer_arabic", "tafsir_arabic",
"arabic_tafsir", "arabic_tafseer", "interpretation_ar",
"tafseer", "tafsir",
]
for f in en_fields:
if f in item and item[f]:
result["tafsir_en"] = strip_html(str(item[f]))
break
for f in ar_fields:
if f in item and item[f]:
val = str(item[f])
if any("\u0600" <= c <= "\u06ff" for c in val):
result["tafsir_ar"] = strip_html(val)
elif "tafsir_en" not in result:
# Treat as English if no Arabic characters detected
result["tafsir_en"] = strip_html(val)
break
# Handle nested tafsir object (e.g. {"1": "...", "2": "..."})
if not result:
for key in ("tafseer", "tafsir"):
obj = item.get(key)
if isinstance(obj, dict):
for _, val in obj.items():
if val:
result["tafsir_en"] = strip_html(str(val))
break
break
return result if result else None
def _load_tafsir_from_records(records: List[Dict]) -> Dict[str, Dict[str, str]]:
"""Build verse-key β†’ tafsir dict from a list of records."""
tafsir_map: Dict[str, Dict[str, str]] = {}
for item in records:
verse_key = _extract_verse_key(item)
if not verse_key:
continue
text = _extract_tafsir_text(item)
if text:
tafsir_map.setdefault(verse_key, {}).update(text)
return tafsir_map
def fetch_kaggle_tafsir(
force: bool = False,
) -> Optional[Dict[str, Dict[str, str]]]:
"""Download and parse the Kaggle tafsir dataset (ZIP).
Returns { "surah:verse": {"tafsir_en": …, "tafsir_ar": …} } or None.
"""
zip_path = CACHE_DIR / "tafsir" / "kaggle_tafsir.zip"
extract_dir = CACHE_DIR / "tafsir" / "kaggle_extracted"
# Download
try:
print(" Downloading Kaggle tafsir dataset …")
auth = _kaggle_auth()
download_file(KAGGLE_TAFSIR_URL, zip_path, force=force, auth=auth)
except Exception as exc:
print(f" βœ— Kaggle download failed: {exc}")
print(
" Tip: set KAGGLE_USERNAME and KAGGLE_KEY env vars, "
"or place kaggle.json in ~/.kaggle/"
)
return None
# Verify it's actually a ZIP
if not zipfile.is_zipfile(zip_path):
print(" βœ— Downloaded file is not a valid ZIP (may need Kaggle auth)")
return None
# Extract
try:
_ensure_dir(extract_dir)
with zipfile.ZipFile(zip_path, "r") as zf:
zf.extractall(extract_dir)
print(f" βœ“ Extracted to {extract_dir}")
except Exception as exc:
print(f" βœ— Failed to extract ZIP: {exc}")
return None
# Parse JSON files inside the archive
json_files = list(extract_dir.rglob("*.json"))
if not json_files:
print(" βœ— No JSON files found in Kaggle archive")
return None
print(f" Found {len(json_files)} JSON file(s) in archive")
tafsir_map: Dict[str, Dict[str, str]] = {}
for jf in json_files:
try:
with open(jf, "r", encoding="utf-8") as f:
data = json.load(f)
except Exception as exc:
print(f" βœ— Error parsing {jf.name}: {exc}")
continue
if isinstance(data, list):
tafsir_map.update(_load_tafsir_from_records(data))
elif isinstance(data, dict):
# Might be keyed by surah number or some other grouping
for _key, value in data.items():
if isinstance(value, list):
tafsir_map.update(_load_tafsir_from_records(value))
elif isinstance(value, dict):
vk = _extract_verse_key(value)
if vk:
tt = _extract_tafsir_text(value)
if tt:
tafsir_map.setdefault(vk, {}).update(tt)
if tafsir_map:
print(f" βœ“ Loaded tafsir for {len(tafsir_map):,} verses from Kaggle")
return tafsir_map if tafsir_map else None
def _fetch_tafsir_chapter_api(
tafsir_id: int, chapter: int,
) -> Dict[str, str]:
"""Fetch all tafsir entries for a chapter from Quran.com API."""
result: Dict[str, str] = {}
page = 1
while True:
url = (
f"{QURAN_API_BASE}/tafsirs/{tafsir_id}/by_chapter/{chapter}"
f"?per_page=50&page={page}"
)
resp = requests.get(url, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
data = resp.json()
for entry in data.get("tafsirs", []):
raw = entry.get("text", "")
if raw:
result[entry["verse_key"]] = strip_html(raw)
pagination = data.get("pagination", {})
if pagination.get("next_page") is None:
break
page = pagination["next_page"]
time.sleep(0.3)
return result
def fetch_qurancom_tafsir(
surah_numbers: List[int],
) -> Dict[str, Dict[str, str]]:
"""Fallback: fetch tafsir from Quran.com API."""
print(" Falling back to Quran.com API for tafsir …")
tafsir_map: Dict[str, Dict[str, str]] = {}
for surah_num in tqdm(surah_numbers, desc=" Fetching tafsir"):
try:
en_entries = _fetch_tafsir_chapter_api(TAFSIR_EN_ID, surah_num)
time.sleep(0.3)
ar_entries = _fetch_tafsir_chapter_api(TAFSIR_AR_ID, surah_num)
time.sleep(0.3)
for vk, text in en_entries.items():
tafsir_map.setdefault(vk, {})["tafsir_en"] = text
for vk, text in ar_entries.items():
tafsir_map.setdefault(vk, {})["tafsir_ar"] = text
except Exception as exc:
print(f"\n βœ— Surah {surah_num}: {exc}")
return tafsir_map
def enrich_quran_with_tafsir(
entries: List[Dict],
force_download: bool = False,
) -> List[Dict]:
"""Add tafsir fields to Quran entries (Kaggle β†’ Quran.com fallback)."""
print("\n" + "=" * 60)
print("Step 3: Enriching Quran with Tafsir")
print("=" * 60)
tafsir_map = fetch_kaggle_tafsir(force=force_download)
if not tafsir_map:
surah_numbers = sorted(
{e["surah_number"] for e in entries if e.get("type") == "quran"}
)
tafsir_map = fetch_qurancom_tafsir(surah_numbers)
if not tafsir_map:
print(" βœ— No tafsir data available")
return entries
enriched = 0
for entry in entries:
if entry.get("type") != "quran":
continue
verse_key = f"{entry['surah_number']}:{entry['verse_number']}"
tafsir = tafsir_map.get(verse_key, {})
entry["tafsir_en"] = tafsir.get("tafsir_en", "")
entry["tafsir_ar"] = tafsir.get("tafsir_ar", "")
if entry["tafsir_en"] or entry["tafsir_ar"]:
enriched += 1
print(f" βœ“ Enriched {enriched:,} verses with tafsir")
return entries
# ══════════════════════════════════════════════════════════════════════
# STEP 4: FETCH & BUILD HADITH ENTRIES
# ══════════════════════════════════════════════════════════════════════
def _pick_best_grade(grades: List[Dict]) -> str:
"""Pick the most authoritative grade from a list of scholar grades."""
priority = ["darussalam", "al-albani", "zubair ali zai"]
grade_map = {}
for g in grades:
name = g.get("name", "").lower()
grade_text = g.get("grade", "")
if grade_text:
grade_map[name] = grade_text
for scholar in priority:
for name, grade in grade_map.items():
if scholar in name:
return grade
for g in grades:
if g.get("grade"):
return g["grade"]
return ""
def _fetch_fawaz_grades(
edition: str, force: bool = False,
) -> Optional[Dict[int, str]]:
"""Fetch grades for a hadith edition from fawazahmed0."""
cache_path = CACHE_DIR / "hadith" / "fawazahmed0" / f"{edition}.json"
urls = [
f"{FAWAZ_CDN_BASE}/editions/{edition}.json",
f"{FAWAZ_RAW_BASE}/editions/{edition}.json",
]
data = None
for url in urls:
try:
data = download_json(url, cache_path=cache_path, force=force)
break
except Exception:
continue
if not data:
return None
grades: Dict[int, str] = {}
for hadith in data.get("hadiths", []):
hnum = hadith.get("hadithnumber")
if hnum is None:
continue
grade_list = hadith.get("grades", [])
if grade_list:
grades[int(hnum)] = _pick_best_grade(grade_list)
return grades
def fetch_hadith_sources(
force: bool = False,
) -> Tuple[Dict[str, Dict], Dict[str, Dict[int, str]]]:
"""Download hadith data from AhmedBaset and grades from fawazahmed0.
Returns (ahmedbaset_books, fawaz_grades).
"""
print("\n" + "=" * 60)
print("Step 4a: Fetching Hadith Sources")
print("=" * 60)
# AhmedBaset hadith books
print(" Downloading from AhmedBaset/hadith-json …")
ahmedbaset_books: Dict[str, Dict] = {}
for filename in tqdm(HADITH_BOOKS.keys(), desc=" Books"):
try:
url = f"{AHMEDBASET_BASE_URL}/{filename}"
data = download_json(
url,
cache_path=CACHE_DIR / "hadith" / "ahmedbaset" / filename,
force=force,
)
ahmedbaset_books[filename] = data
except Exception as exc:
print(f"\n βœ— {filename}: {exc}")
print(f" βœ“ Loaded {len(ahmedbaset_books)} books")
# fawazahmed0 editions (for grades)
print(" Downloading grade data from fawazahmed0/hadith-api …")
fawaz_grades: Dict[str, Dict[int, str]] = {}
for prefix, edition in tqdm(FAWAZ_EDITION_MAP.items(), desc=" Editions"):
grades = _fetch_fawaz_grades(edition, force)
if grades:
fawaz_grades[prefix] = grades
print(f" βœ“ Loaded grades for {len(fawaz_grades)} collections")
return ahmedbaset_books, fawaz_grades
def build_hadith_entries(
ahmedbaset_books: Dict[str, Dict],
fawaz_grades: Dict[str, Dict[int, str]],
) -> List[Dict]:
"""Merge AhmedBaset data with fawazahmed0 grades into hadith entries."""
print("\n" + "=" * 60)
print("Step 4b: Building Hadith Entries")
print("=" * 60)
entries: List[Dict] = []
stats: Dict[str, int] = defaultdict(int)
for filename, book_config in HADITH_BOOKS.items():
book_data = ahmedbaset_books.get(filename)
if not book_data:
print(f" βœ— Skipping {filename} (not downloaded)")
continue
prefix = book_config["id_prefix"]
grades = fawaz_grades.get(prefix, {})
hadiths = book_data.get("hadiths", [])
chapter_map = {
ch.get("id"): ch.get("arabic", "")
for ch in book_data.get("chapters", [])
}
for hadith in hadiths:
hadith_num = hadith.get("idInBook", hadith.get("id", ""))
# English text
if isinstance(hadith.get("english"), dict):
parts = []
if hadith["english"].get("narrator"):
parts.append(hadith["english"]["narrator"])
if hadith["english"].get("text"):
parts.append(hadith["english"]["text"])
english = " ".join(parts)
else:
english = str(hadith.get("english", ""))
# Chapter name
chapter_name = ""
if "chapterId" in hadith:
chapter_name = chapter_map.get(hadith["chapterId"], "")
# Grade from fawazahmed0
grade = ""
if hadith_num:
grade = grades.get(int(hadith_num), "")
entries.append(
{
"id": f"{prefix}_{hadith_num}",
"arabic": hadith.get("arabic", ""),
"english": english,
"reference": f"{book_config['collection']} {hadith_num}",
"hadith_number": hadith_num,
"collection": book_config["collection"],
"chapter": chapter_name,
"grade": grade,
"type": "hadith",
"author": book_config["author"],
}
)
stats[book_config["collection"]] += 1
print(f" βœ“ Built {len(entries):,} hadith entries")
print("\n Breakdown:")
for collection, count in sorted(stats.items()):
print(f" {collection}: {count:,}")
graded = sum(1 for e in entries if e.get("grade"))
print(f"\n Hadiths with grades: {graded:,} / {len(entries):,}")
return entries
# ══════════════════════════════════════════════════════════════════════
# STEP 5: GENERATE METADATA
# ══════════════════════════════════════════════════════════════════════
def generate_metadata(
quran_entries: List[Dict],
hadith_entries: List[Dict],
) -> List[Dict]:
"""Combine all entries and write metadata.json."""
print("\n" + "=" * 60)
print("Step 5: Generating metadata.json")
print("=" * 60)
documents = quran_entries + hadith_entries
print(f" Quran entries: {len(quran_entries):,}")
print(f" Hadith entries: {len(hadith_entries):,}")
print(f" Total: {len(documents):,}")
# Check for duplicate IDs
ids = [d["id"] for d in documents]
if len(ids) != len(set(ids)):
dupes = len(ids) - len(set(ids))
print(f" ⚠ Warning: {dupes} duplicate IDs found")
print(f" Writing to {METADATA_PATH} …")
with open(METADATA_PATH, "w", encoding="utf-8") as f:
json.dump(documents, f, ensure_ascii=False, indent=2)
size_mb = METADATA_PATH.stat().st_size / (1024 * 1024)
print(f" βœ“ File size: {size_mb:.2f} MB")
return documents
# ══════════════════════════════════════════════════════════════════════
# STEP 6: BUILD FAISS INDEX
# ══════════════════════════════════════════════════════════════════════
def build_faiss_index(
documents: List[Dict],
model_name: str = DEFAULT_EMBED_MODEL,
):
"""Generate embeddings and build FAISS index."""
print("\n" + "=" * 60)
print("Step 6: Building FAISS Index")
print("=" * 60)
print(f" Loading embedding model: {model_name}")
model = SentenceTransformer(model_name)
embedding_dim = model.get_sentence_embedding_dimension()
print(f" Embedding dimension: {embedding_dim}")
# Build text for each document
all_texts: List[str] = []
for doc in documents:
if doc.get("type") == "quran":
# Include truncated tafsir for richer semantic matching
tafsir_snippet = doc.get("tafsir_en", "")[:500]
text = (
f"{doc.get('arabic', '')} {doc.get('english', '')} "
f"{tafsir_snippet}"
)
else: # hadith
text = (
f"{doc.get('collection', '')} "
f"{doc.get('arabic', '')} "
f"{doc.get('english', '')}"
)
all_texts.append(text.strip())
print(f"\n Generating embeddings for {len(all_texts):,} documents …")
all_embeddings = []
for i in tqdm(
range(0, len(all_texts), EMBED_BATCH_SIZE),
desc=" Embedding batches",
):
batch = all_texts[i : i + EMBED_BATCH_SIZE]
batch_emb = model.encode(batch, convert_to_numpy=True)
all_embeddings.extend(batch_emb)
embeddings = np.array(all_embeddings, dtype=np.float32)
print(f" Embeddings shape: {embeddings.shape}")
print("\n Creating FAISS index (IndexFlatIP + L2 normalization) …")
index = faiss.IndexFlatIP(embedding_dim)
faiss.normalize_L2(embeddings)
index.add(embeddings)
print(f" Saving to {INDEX_PATH}")
faiss.write_index(index, str(INDEX_PATH))
size_mb = INDEX_PATH.stat().st_size / (1024 * 1024)
print(f"\n {'=' * 50}")
print(f" Index Build Complete")
print(f" {'=' * 50}")
print(f" Documents indexed: {index.ntotal:,}")
print(f" Index file size: {size_mb:.2f} MB")
# ══════════════════════════════════════════════════════════════════════
# CLI
# ══════════════════════════════════════════════════════════════════════
def main():
parser = argparse.ArgumentParser(
description=(
"QModel Dataset Builder v2 β€” builds metadata.json and "
"QModel.index from scratch using multiple authoritative sources"
),
)
parser.add_argument(
"--index-only",
action="store_true",
help="Only build FAISS index from existing metadata.json",
)
parser.add_argument(
"--data-only",
action="store_true",
help="Only generate metadata.json, skip index building",
)
parser.add_argument(
"--skip-tafsir",
action="store_true",
help="Skip tafsir enrichment",
)
parser.add_argument(
"--force-download",
action="store_true",
help="Re-download all sources even if cached",
)
parser.add_argument(
"--model",
default=DEFAULT_EMBED_MODEL,
help=f"Sentence-transformer model for embeddings (default: {DEFAULT_EMBED_MODEL})",
)
args = parser.parse_args()
    # ── index-only: skip all data fetching ─────────────────────────
if args.index_only:
print("Loading existing metadata.json …")
with open(METADATA_PATH, "r", encoding="utf-8") as f:
documents = json.load(f)
build_faiss_index(documents, model_name=args.model)
print("\nβœ“ Done!")
return
force = args.force_download
# Step 1: Fetch Quran sources
cdn_chapters, quran_data, chapter_meta, sem_translations = fetch_quran_sources(force=force)
# Step 2: Build Quran entries
quran_entries = build_quran_entries(cdn_chapters, quran_data, chapter_meta, sem_translations)
# Step 3: Enrich with tafsir
if not args.skip_tafsir:
quran_entries = enrich_quran_with_tafsir(
quran_entries, force_download=force,
)
else:
print("\nSkipping tafsir enrichment (--skip-tafsir)")
# Step 4: Fetch and build hadith entries
ahmedbaset_books, fawaz_grades = fetch_hadith_sources(force=force)
hadith_entries = build_hadith_entries(ahmedbaset_books, fawaz_grades)
# Step 5: Generate metadata.json
documents = generate_metadata(quran_entries, hadith_entries)
# Step 6: Build FAISS index
if not args.data_only:
build_faiss_index(documents, model_name=args.model)
else:
print("\nSkipping index build (--data-only)")
print("\nβœ“ Done!")
if __name__ == "__main__":
main()