| """ | |
| Failure diagnosis for RAG pipeline. | |
| Identifies root causes when answers are wrong or missing. | |
| """ | |
| from dataclasses import dataclass | |
| from typing import List, Dict, Any, Optional | |
@dataclass
class DiagnosisResult:
    """Result of failure diagnosis."""
    root_cause: str
    stage_failed: str
    confidence: float
    details: str
    suggestions: List[str]
    alternative_queries: List[str]


def diagnose_failure(
    query: str,
    chunks: List[Dict[str, Any]],
    answer: str,
    expected_content: Optional[str] = None
) -> DiagnosisResult:
| """ | |
| Diagnose why a RAG query failed or gave poor results. | |
| Args: | |
| query: Original query | |
| chunks: Retrieved chunks | |
| answer: Generated answer | |
| expected_content: What the answer should contain (optional) | |
| Returns: | |
| DiagnosisResult with root cause and suggestions | |
| """ | |
    # Case 1: No chunks retrieved
    if not chunks:
        return DiagnosisResult(
            root_cause="retrieval_failure",
            stage_failed="retrieval",
            confidence=0.9,
            details="No chunks were retrieved for this query",
            suggestions=[
                "Check if documents are indexed",
                "Try broader search terms",
                "Use keyword search for exact matches"
            ],
            alternative_queries=_generate_alternative_queries(query)
        )
    # Case 2: Low relevance scores
    scores = [c.get("score", 0) for c in chunks]
    avg_score = sum(scores) / len(scores) if scores else 0
    if avg_score < 0.4:
        return DiagnosisResult(
            root_cause="low_relevance",
            stage_failed="retrieval",
            confidence=0.8,
            details=f"Retrieved chunks have low relevance (avg score: {avg_score:.2f})",
            suggestions=[
                "Query terms may not match document vocabulary",
                "Try rephrasing the query",
                "Use query expansion or synonyms"
            ],
            alternative_queries=_generate_alternative_queries(query)
        )
    # Case 3: Abstention (model refused to answer)
    abstention_phrases = [
        "don't have enough information",
        "cannot answer",
        "no information",
        "not mentioned",
        "not enough"
    ]
    is_abstention = any(phrase in answer.lower() for phrase in abstention_phrases)
    if is_abstention:
        # Check if chunks actually contain relevant info (simple word overlap
        # between the query and the combined chunk text)
        combined_text = " ".join(c.get("text", "") for c in chunks)
        query_words = set(query.lower().split())
        chunk_words = set(combined_text.lower().split())
        overlap = len(query_words & chunk_words) / len(query_words) if query_words else 0
        if overlap > 0.5:
            return DiagnosisResult(
                root_cause="context_interpretation",
                stage_failed="generation",
                confidence=0.7,
                details="Chunks contain relevant terms but LLM couldn't extract answer",
                suggestions=[
                    "Context may be fragmented across chunks",
                    "Try retrieving more chunks",
                    "Consider using reasoning-aware prompts"
                ],
                alternative_queries=[]
            )
        else:
            return DiagnosisResult(
                root_cause="topic_mismatch",
                stage_failed="retrieval",
                confidence=0.8,
                details="Retrieved chunks don't appear to cover the query topic",
                suggestions=[
                    "Query topic may not be in the document corpus",
                    "Try different terminology",
                    "Check if relevant documents are indexed"
                ],
                alternative_queries=_generate_alternative_queries(query)
            )
    # Case 4: Expected content not in answer
    if expected_content:
        expected_words = set(expected_content.lower().split())
        answer_words = set(answer.lower().split())
        coverage = len(expected_words & answer_words) / len(expected_words) if expected_words else 1
        if coverage < 0.3:
            # Check if the expected content is at least present in the chunks
            combined_chunks = " ".join(c.get("text", "").lower() for c in chunks)
            in_chunks = any(word in combined_chunks for word in expected_words)
            if in_chunks:
                return DiagnosisResult(
                    root_cause="generation_miss",
                    stage_failed="generation",
                    confidence=0.7,
                    details="Expected information is in chunks but not in answer",
                    suggestions=[
                        "LLM may have focused on wrong parts of context",
                        "Try more specific prompting",
                        "Increase context relevance through reranking"
                    ],
                    alternative_queries=[]
                )
            else:
                return DiagnosisResult(
                    root_cause="retrieval_miss",
                    stage_failed="retrieval",
                    confidence=0.8,
                    details="Expected information not found in retrieved chunks",
                    suggestions=[
                        "Relevant chunks may not have been retrieved",
                        "Try different query formulation",
                        "Increase top_k for more coverage"
                    ],
                    alternative_queries=_generate_alternative_queries(query)
                )
    # Case 5: Default - unclear failure
    return DiagnosisResult(
        root_cause="unknown",
        stage_failed="unknown",
        confidence=0.5,
        details="Unable to determine specific failure cause",
        suggestions=[
            "Review the query for clarity",
            "Check chunk quality manually",
            "Try with different retrieval settings"
        ],
        alternative_queries=_generate_alternative_queries(query)
    )

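
# Example (hypothetical call): with no retrieved chunks, diagnose_failure
# short-circuits to a retrieval failure:
#   diagnose_failure("refund policy?", [], "")
#   -> DiagnosisResult(root_cause="retrieval_failure", stage_failed="retrieval", ...)
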
def _generate_alternative_queries(query: str) -> List[str]:
    """Generate alternative query formulations."""
    alternatives = []

    # Remove question words token by token (a plain substring replace would
    # also mangle words that merely contain them, e.g. "somewhat")
    question_words = {
        "what", "how", "why", "when", "where", "who", "which",
        "what's", "how's", "who's", "where's", "when's"
    }
    tokens = [w for w in query.lower().split() if w not in question_words]

    # Extract key terms, skipping short stopword-like tokens
    words = [w for w in tokens if len(w) > 3]
    if words:
        # Just key terms
        alternatives.append(" ".join(words[:5]))
        # With "about"
        if len(words) >= 2:
            alternatives.append(f"about {words[0]} {words[1]}")
    return alternatives[:3]

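
# Illustrative output (hypothetical query; note trailing punctuation is kept):
#   _generate_alternative_queries("What is semantic chunking?")
#   -> ["semantic chunking?", "about semantic chunking?"]
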
def run_diagnostics_suite(
    query: str,
    chunks: List[Dict[str, Any]],
    answer: str
) -> Dict[str, Any]:
    """
    Run comprehensive diagnostics on a query result.

    Returns a detailed report for debugging.
    """
    diagnosis = diagnose_failure(query, chunks, answer)

    # Additional checks on the retrieved chunks
    chunk_analysis = {
        "count": len(chunks),
        "avg_length": sum(len(c.get("text", "")) for c in chunks) / len(chunks) if chunks else 0,
        # Chunk ids are expected to look like "<source>::<n>"
        "sources": list(set(c.get("id", "").split("::")[0] for c in chunks if c.get("id"))),
        "score_range": (
            min(c.get("score", 0) for c in chunks) if chunks else 0,
            max(c.get("score", 0) for c in chunks) if chunks else 0
        )
    }

    answer_analysis = {
        "length": len(answer),
        "word_count": len(answer.split()),
        "has_citations": "[ID:" in answer,
        "is_abstention": any(p in answer.lower() for p in ["don't have", "cannot answer"])
    }

    return {
        "diagnosis": {
            "root_cause": diagnosis.root_cause,
            "stage_failed": diagnosis.stage_failed,
            "confidence": diagnosis.confidence,
            "details": diagnosis.details
        },
        "suggestions": diagnosis.suggestions,
        "alternative_queries": diagnosis.alternative_queries,
        "chunk_analysis": chunk_analysis,
        "answer_analysis": answer_analysis
    }
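

# Minimal usage sketch (hypothetical data): chunk dicts are assumed to carry
# "id", "text", and "score" keys, matching what the functions above read.
if __name__ == "__main__":
    import json

    sample_chunks = [
        {"id": "handbook.md::3", "text": "Refunds are processed within 14 days.", "score": 0.72},
        {"id": "handbook.md::7", "text": "Contact support to start a refund.", "score": 0.55},
    ]
    report = run_diagnostics_suite(
        query="How long do refunds take?",
        chunks=sample_chunks,
        answer="Refunds are processed within 14 days.",
    )
    print(json.dumps(report, indent=2))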