# QModel — app/analysis.py
# Uploaded via huggingface_hub by aelgendy (revision 6ab1c8e).
"""Quran / Hadith analytics β€” occurrence counting, surah metadata, dataset stats."""
from __future__ import annotations
import re
from typing import Dict, List, Literal, Optional
from app.arabic_nlp import light_stem, normalize_arabic, tokenize_ar
from app.cache import analysis_cache
from app.config import cfg
# ═══════════════════════════════════════════════════════════════════════
# INTENT DETECTION β€” frequency / surah info queries
# ═══════════════════════════════════════════════════════════════════════
# English frequency-question triggers: "how many", "count", "number of",
# "frequency", "occurrences of", "how often", "times does/is/appears".
_COUNT_EN = re.compile(
    r"\b(how many|count|number of|frequency|occurrences? of|how often|"
    r"times? (does|is|appears?))\b",
    re.I,
)
# Arabic frequency-question triggers (e.g. "ΩƒΩ… Ω…Ψ±Ψ©" β‰ˆ "how many times",
# "ΩƒΩ… ذُكر" β‰ˆ "how many times was it mentioned").
_COUNT_AR = re.compile(
    r"(ΩƒΩ… Ω…Ψ±Ψ©|ΩƒΩ… ΨΉΨ―Ψ―|ΩƒΩ… ΨͺΩƒΨ±Ψ±|ΨΉΨ―Ψ― Ω…Ψ±Ψ§Ψͺ|ΨͺΩƒΨ±Ψ§Ψ±|ΩƒΩ… ذُكر|ΩƒΩ… وردΨͺ?)"
)
# Arabic "how many verses in surah X" phrasings, covering several word orders
# (including the surah name appearing before the question words).
_SURAH_VERSES_AR = re.compile(
    r"ΩƒΩ…\s+(?:ΨΉΨ―Ψ―\s+)?ؒياΨͺ?\s*(?:في\s+|فى\s+)?(?:سورة|Ψ³ΩˆΨ±Ω‡)"
    r"|ΨΉΨ―Ψ―\s+ؒياΨͺ?\s+(?:سورة|Ψ³ΩˆΨ±Ω‡)"
    r"|ΩƒΩ…\s+ؒية\s+(?:في|فى)\s+(?:سورة|Ψ³ΩˆΨ±Ω‡)"
    r"|(?:سورة|Ψ³ΩˆΨ±Ω‡)\s+[\u0600-\u06FF\s]+\s+(?:ΩƒΩ…\s+ؒية|ΨΉΨ―Ψ―\s+ؒياΨͺ?)"
)
# English "how many verses in/of surah ..." phrasings, in either order.
_SURAH_VERSES_EN = re.compile(
    r"(?:how many|number of)\s+(?:verses?|ayat|ayahs?)\s+(?:in|of|does)\b"
    r"|\bsurah?\b.*\b(?:how many|number of)\s+(?:verses?|ayat|ayahs?)",
    re.I,
)
# Arabic "is surah X Meccan or Medinan" phrasings.
_SURAH_TYPE_AR = re.compile(
    r"(?:سورة|Ψ³ΩˆΨ±Ω‡)\s+[\u0600-\u06FF\s]+\s+(?:Ω…ΩƒΩŠΨ©|Ω…Ψ―Ω†ΩŠΨ©|Ω…ΩƒΩŠ|Ω…Ψ―Ω†ΩŠ)"
    r"|(?:Ω‡Ω„|Ω…Ψ§\s+Ω†ΩˆΨΉ)\s+(?:سورة|Ψ³ΩˆΨ±Ω‡)\s+[\u0600-\u06FF\s]+\s+(?:Ω…ΩƒΩŠΨ©|Ω…Ψ―Ω†ΩŠΨ©)"
)
# Captures the surah name following the Arabic word "سورة" ("surah").
_SURAH_NAME_AR = re.compile(
    r"(?:سورة|Ψ³ΩˆΨ±Ω‡)\s+([\u0600-\u06FF\u0750-\u077F\s]+)"
)
# Captures the surah name following English "sura"/"surah"
# (multi-word names joined by spaces or hyphens are allowed).
_SURAH_NAME_EN = re.compile(
    r"\bsurah?\s+([a-zA-Z'\-]+(?:[\s\-][a-zA-Z'\-]+)*)",
    re.I,
)
def _extract_surah_name(query: str) -> Optional[str]:
    """Pull a surah name out of *query*, trying the Arabic pattern first.

    The captured name is stripped of trailing punctuation and of a trailing
    Arabic question/stop word. Returns ``None`` when nothing usable remains.
    """
    for pattern in (_SURAH_NAME_AR, _SURAH_NAME_EN):
        match = pattern.search(query)
        if not match:
            continue
        candidate = match.group(1).strip()
        # Trim trailing punctuation, then a dangling Arabic function word.
        candidate = re.sub(r'[\s؟?!]+$', '', candidate)
        candidate = re.sub(r'\s+(ΩƒΩ…|ΨΉΨ―Ψ―|Ω‡Ω„|Ω…Ψ§|في|فى)$', '', candidate)
        if candidate:
            return candidate
    return None
# ═══════════════════════════════════════════════════════════════════════
# SURAH INFO DETECTION & LOOKUP
# ═══════════════════════════════════════════════════════════════════════
async def detect_surah_info(query: str, rewrite: dict) -> Optional[dict]:
    """Decide whether *query* asks for surah metadata (verse count or type).

    Falls back on the rewriter's intent classification when no regex matches.
    Returns ``{"surah_query": ..., "query_type": ...}`` or ``None``.
    """
    asks_verses = bool(
        _SURAH_VERSES_AR.search(query) or _SURAH_VERSES_EN.search(query)
    )
    asks_type = _SURAH_TYPE_AR.search(query) is not None
    if not asks_verses and not asks_type:
        intent = rewrite.get("intent")
        if intent == "surah_info":
            asks_verses = True
        elif intent == "count":
            joined = " ".join(rewrite.get("keywords", []))
            # A "count" intent only qualifies when it mentions verses/ayat.
            if not any(w in joined for w in ("ؒياΨͺ", "ؒية", "verses", "ayat")):
                return None
            asks_verses = True
        else:
            return None
    name = _extract_surah_name(query)
    if not name:
        return None
    return {
        "surah_query": name,
        "query_type": "verses" if asks_verses else "type",
    }
async def lookup_surah_info(surah_query: str, dataset: list) -> Optional[dict]:
    """Return metadata of the first Quran entry whose surah name matches.

    Matching is fuzzy: both sides are normalized and lowercased, and tried
    again with a leading definite article ("Ψ§Ω„" / "al-") stripped.
    """
    wanted = normalize_arabic(surah_query, aggressive=True).lower()
    wanted_bare = re.sub(r"^(Ψ§Ω„|al[\-\s']*)", "", wanted, flags=re.I).strip()
    name_fields = ("surah_name_ar", "surah_name_en", "surah_name_transliteration")
    for entry in dataset:
        if entry.get("type") != "quran":
            continue
        for field in name_fields:
            raw = entry.get(field, "")
            if not raw:
                continue
            candidate = normalize_arabic(raw, aggressive=True).lower()
            candidate_bare = re.sub(
                r"^(Ψ§Ω„|al[\-\s']*)", "", candidate, flags=re.I
            ).strip()
            matched = (
                wanted in candidate
                or candidate in wanted
                or (
                    wanted_bare
                    and candidate_bare
                    and (wanted_bare in candidate_bare or candidate_bare in wanted_bare)
                )
                or (wanted_bare and wanted_bare in candidate)
            )
            if not matched:
                continue
            return {
                "surah_number": entry.get("surah_number"),
                "surah_name_ar": entry.get("surah_name_ar", ""),
                "surah_name_en": entry.get("surah_name_en", ""),
                "surah_name_transliteration": entry.get("surah_name_transliteration", ""),
                "total_verses": entry.get("total_verses"),
                "revelation_type": entry.get("revelation_type", ""),
            }
    return None
# ═══════════════════════════════════════════════════════════════════════
# ANALYSIS INTENT (word frequency detection)
# ═══════════════════════════════════════════════════════════════════════
async def detect_analysis_intent(query: str, rewrite: dict) -> Optional[str]:
    """Return the word to count when *query* is a word-frequency question.

    Surah-metadata questions are excluded first so they are never treated
    as frequency requests. Returns the target word or ``None``.
    """
    is_surah_metadata = (
        _SURAH_VERSES_AR.search(query)
        or _SURAH_VERSES_EN.search(query)
        or _SURAH_TYPE_AR.search(query)
        or rewrite.get("intent") == "surah_info"
    )
    if is_surah_metadata:
        return None
    if rewrite.get("intent") == "count":
        keywords = rewrite.get("keywords", [])
        if any(w in " ".join(keywords) for w in ("ؒياΨͺ", "ؒية", "verses", "ayat")):
            # "How many verses"-style question, not a word-frequency one.
            return None
        # The rewriter puts the target word first; otherwise parse the query.
        return keywords[0] if keywords else _extract_count_keyword(query)
    if _COUNT_EN.search(query) or _COUNT_AR.search(query):
        return _extract_count_keyword(query)
    return None
def _extract_count_keyword(query: str) -> Optional[str]:
"""Extract the keyword being counted from various question patterns."""
# Arabic patterns: ΩƒΩ… Ω…Ψ±Ψ© Ψ°ΩƒΨ±Ψͺ ΩƒΩ„Ω…Ψ© X / ΩƒΩ… Ω…Ψ±Ψ© وردΨͺ X / ΨΉΨ―Ψ― Ω…Ψ±Ψ§Ψͺ Ψ°ΩƒΨ± X
ar_patterns = [
re.compile(r"(?:ΩƒΩ„Ω…Ψ©|لفظ|لفظة)\s+([\u0600-\u06FF\u0750-\u077F]+)"),
re.compile(r"(?:Ψ°[ُ]?ΩƒΨ±(?:Ψͺ|)|وردΨͺ?|ΨͺΩƒΨ±Ψ±(?:Ψͺ|))\s+(?:ΩƒΩ„Ω…Ψ©\s+)?([\u0600-\u06FF\u0750-\u077F]+)"),
re.compile(r"(?:ΨΉΨ―Ψ―\s+Ω…Ψ±Ψ§Ψͺ\s+(?:Ψ°ΩƒΨ±|ورود))\s+(?:ΩƒΩ„Ω…Ψ©\s+)?([\u0600-\u06FF\u0750-\u077F]+)"),
re.compile(r"ΩƒΩ…\s+(?:Ω…Ψ±Ψ©|Ω…Ψ±Ω‡)\s+(?:Ψ°[ُ]?ΩƒΨ±(?:Ψͺ|)|وردΨͺ?)\s+(?:ΩƒΩ„Ω…Ψ©\s+)?([\u0600-\u06FF\u0750-\u077F]+)"),
]
for pat in ar_patterns:
m = pat.search(query)
if m:
word = m.group(1).strip()
# Skip common non-keyword words
if word not in ("في", "Ω…Ω†", "ΨΉΩ†", "Ψ₯Ω„Ω‰", "ΨΉΩ„Ω‰", "Ω‡Ω„", "Ω…Ψ§", "ΩƒΩ…"):
return word
# English patterns: how many times is X mentioned / count of X / occurrences of X
en_patterns = [
re.compile(r"(?:word|term)\s+['\"]?(\w+)['\"]?", re.I),
re.compile(r"(?:times?\s+(?:is|does|has)\s+)(\w+)", re.I),
re.compile(r"(?:occurrences?\s+of|frequency\s+of|count\s+of)\s+['\"]?(\w+)['\"]?", re.I),
re.compile(r"(?:mentioned|appear[s]?|occur[s]?)\s+.*?['\"]?(\w+)['\"]?\s+(?:in|throughout)", re.I),
re.compile(r"(?:how many times)\s+(?:is\s+)?['\"]?(\w+)['\"]?", re.I),
]
for pat in en_patterns:
m = pat.search(query)
if m:
word = m.group(1).strip()
if word.lower() not in ("the", "a", "an", "in", "of", "is", "are", "was", "how", "many", "quran"):
return word
# Last resort: find the first meaningful word after count-related keywords
for pat in (_COUNT_EN, _COUNT_AR):
m = pat.search(query)
if m:
tail = query[m.end():].strip().split()
for word in tail:
clean = re.sub(r"[؟?!.,،]", "", word).strip()
if clean and clean.lower() not in (
"في", "Ω…Ω†", "ΨΉΩ†", "Ψ§Ω„Ω‚Ψ±Ψ’Ω†", "Ψ§Ω„ΩƒΨ±ΩŠΩ…", "the", "quran", "in", "of",
"ΩƒΩ„Ω…Ψ©", "لفظ", "word", "term",
):
return clean
return None
# ═══════════════════════════════════════════════════════════════════════
# OCCURRENCE COUNTING
# ═══════════════════════════════════════════════════════════════════════
async def count_occurrences(
    keyword: str,
    dataset: list,
    source_type: Optional[str] = "quran",
) -> dict:
    """Count how often *keyword* appears in *dataset*, grouped per surah.

    Counts the exact normalized form plus additional light-stem matches,
    collects up to ``cfg.MAX_EXAMPLES`` sample passages, and caches the
    result keyed by keyword and source type.
    """
    cache_scope = source_type or "all"
    cached_result = await analysis_cache.get(keyword, cache_scope)
    if cached_result:
        return cached_result
    normalized = normalize_arabic(keyword, aggressive=True).lower()
    stem = light_stem(normalized)
    total = 0
    per_surah: Dict[int, Dict] = {}
    samples: list = []
    for entry in dataset:
        if source_type and entry.get("type") != source_type:
            continue
        arabic_norm = normalize_arabic(entry.get("arabic", ""), aggressive=True).lower()
        haystack = f"{arabic_norm} {entry.get('english', '')}".lower()
        exact_hits = haystack.count(normalized)
        # When the stem differs, add stem matches beyond the exact ones.
        extra_hits = haystack.count(stem) - exact_hits if stem != normalized else 0
        hits = exact_hits + extra_hits
        if hits <= 0:
            continue
        total += hits
        surah_no = entry.get("surah_number", 0)
        bucket = per_surah.setdefault(
            surah_no,
            {"name": entry.get("surah_name_en", f"Surah {surah_no}"), "count": 0},
        )
        bucket["count"] += hits
        if len(samples) < cfg.MAX_EXAMPLES:
            samples.append({
                "reference": entry.get("source", ""),
                "arabic": entry.get("arabic", ""),
                "english": entry.get("english", ""),
            })
    result = {
        "keyword": keyword,
        "kw_stemmed": stem,
        "total_count": total,
        "by_surah": dict(sorted(per_surah.items())),
        "examples": samples,
    }
    await analysis_cache.set(result, keyword, cache_scope)
    return result
# ═══════════════════════════════════════════════════════════════════════
# DATASET ANALYTICS β€” aggregate statistics
# ═══════════════════════════════════════════════════════════════════════
def get_quran_analytics(dataset: list) -> dict:
    """Summarize the Quran portion of *dataset*.

    Returns dataset-wide verse and surah totals, Meccan/Medinan surah
    counts, and a per-surah breakdown ordered by surah number.
    """
    per_surah: Dict[int, dict] = {}
    verse_total = 0
    for item in dataset:
        if item.get("type") != "quran":
            continue
        verse_total += 1
        number = item.get("surah_number", 0)
        entry = per_surah.get(number)
        if entry is None:
            # First verse seen for this surah — record its metadata.
            entry = per_surah[number] = {
                "surah_number": number,
                "surah_name_ar": item.get("surah_name_ar", ""),
                "surah_name_en": item.get("surah_name_en", ""),
                "surah_name_transliteration": item.get("surah_name_transliteration", ""),
                "revelation_type": item.get("revelation_type", ""),
                "total_verses": item.get("total_verses", 0),
                "verses_in_dataset": 0,
            }
        entry["verses_in_dataset"] += 1
    revelation_kinds = [s.get("revelation_type", "").lower() for s in per_surah.values()]
    return {
        "total_verses_in_dataset": verse_total,
        "total_surahs": len(per_surah),
        "meccan_surahs": revelation_kinds.count("meccan"),
        "medinan_surahs": revelation_kinds.count("medinan"),
        "surahs": [per_surah[n] for n in sorted(per_surah)],
    }
def get_hadith_analytics(dataset: list) -> dict:
    """Compute aggregate Hadith statistics from *dataset*.

    Returns the total hadith count, per-collection counts with grade
    breakdowns (largest collections first), and an overall grade summary
    sorted by frequency, descending.
    """
    # Renamed from `collections`, which shadowed the stdlib module name.
    by_collection: Dict[str, dict] = {}
    grade_totals: Dict[str, int] = {}
    total = 0
    for item in dataset:
        if item.get("type") != "hadith":
            continue
        total += 1
        name = item.get("collection", "Unknown")
        bucket = by_collection.setdefault(
            name, {"collection": name, "count": 0, "grades": {}}
        )
        bucket["count"] += 1
        grade = item.get("grade", "Ungraded")
        grade_totals[grade] = grade_totals.get(grade, 0) + 1
        bucket["grades"][grade] = bucket["grades"].get(grade, 0) + 1
    return {
        "total_hadiths": total,
        "collections": sorted(
            by_collection.values(), key=lambda c: c["count"], reverse=True
        ),
        "grade_summary": dict(
            sorted(grade_totals.items(), key=lambda x: x[1], reverse=True)
        ),
    }
def get_chapter_info(chapter_number: int, dataset: list) -> Optional[dict]:
    """Collect metadata and all available verses for one surah.

    Metadata comes from the first matching dataset entry; verses are
    returned sorted by ayah number. Returns ``None`` if the surah has no
    entries in *dataset*.
    """
    collected: list = []
    header: Optional[dict] = None
    for entry in dataset:
        if entry.get("type") != "quran" or entry.get("surah_number") != chapter_number:
            continue
        if header is None:
            header = {
                "surah_number": entry.get("surah_number"),
                "surah_name_ar": entry.get("surah_name_ar", ""),
                "surah_name_en": entry.get("surah_name_en", ""),
                "surah_name_transliteration": entry.get("surah_name_transliteration", ""),
                "revelation_type": entry.get("revelation_type", ""),
                "total_verses": entry.get("total_verses", 0),
            }
        collected.append({
            "ayah": entry.get("ayah_number") or entry.get("verse_number"),
            "arabic": entry.get("arabic", ""),
            "english": entry.get("english", ""),
            "source": entry.get("source", ""),
        })
    if header is None:
        return None
    # Missing ayah numbers sort first (treated as 0).
    collected.sort(key=lambda v: v.get("ayah") or 0)
    return {**header, "verses": collected}
def get_verse(surah: int, ayah: int, dataset: list) -> Optional[dict]:
    """Fetch one verse addressed by surah and ayah number.

    Entries may carry the ayah under either ``ayah_number`` or
    ``verse_number``. Returns the full verse record or ``None``.
    """
    for entry in dataset:
        if entry.get("type") != "quran" or entry.get("surah_number") != surah:
            continue
        entry_ayah = entry.get("ayah_number") or entry.get("verse_number")
        if entry_ayah != ayah:
            continue
        return {
            "surah_number": entry.get("surah_number"),
            "surah_name_ar": entry.get("surah_name_ar", ""),
            "surah_name_en": entry.get("surah_name_en", ""),
            "surah_name_transliteration": entry.get("surah_name_transliteration", ""),
            "ayah": entry_ayah,
            "arabic": entry.get("arabic", ""),
            "english": entry.get("english", ""),
            "transliteration": entry.get("transliteration", ""),
            "tafsir_en": entry.get("tafsir_en", ""),
            "tafsir_ar": entry.get("tafsir_ar", ""),
            "source": entry.get("source", ""),
            "revelation_type": entry.get("revelation_type", ""),
        }
    return None