"""
Failure diagnosis for RAG pipeline.
Identifies root causes when answers are wrong or missing.
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
@dataclass
class DiagnosisResult:
"""Result of failure diagnosis."""
root_cause: str
stage_failed: str
confidence: float
details: str
suggestions: List[str]
alternative_queries: List[str]


def diagnose_failure(
    query: str,
    chunks: List[Dict[str, Any]],
    answer: str,
    expected_content: Optional[str] = None
) -> DiagnosisResult:
    """
    Diagnose why a RAG query failed or gave poor results.

    Args:
        query: Original query
        chunks: Retrieved chunks
        answer: Generated answer
        expected_content: What the answer should contain (optional)

    Returns:
        DiagnosisResult with root cause and suggestions
    """
    # Case 1: No chunks retrieved
    if not chunks:
        return DiagnosisResult(
            root_cause="retrieval_failure",
            stage_failed="retrieval",
            confidence=0.9,
            details="No chunks were retrieved for this query",
            suggestions=[
                "Check if documents are indexed",
                "Try broader search terms",
                "Use keyword search for exact matches"
            ],
            alternative_queries=_generate_alternative_queries(query)
        )

    # Case 2: Low relevance scores
    scores = [c.get("score", 0) for c in chunks]
    avg_score = sum(scores) / len(scores) if scores else 0

    if avg_score < 0.4:
        return DiagnosisResult(
            root_cause="low_relevance",
            stage_failed="retrieval",
            confidence=0.8,
            details=f"Retrieved chunks have low relevance (avg score: {avg_score:.2f})",
            suggestions=[
                "Query terms may not match document vocabulary",
                "Try rephrasing the query",
                "Use query expansion or synonyms"
            ],
            alternative_queries=_generate_alternative_queries(query)
        )

    # Case 3: Abstention (model refused to answer)
    abstention_phrases = [
        "don't have enough information",
        "cannot answer",
        "no information",
        "not mentioned",
        "not enough"
    ]
    is_abstention = any(phrase in answer.lower() for phrase in abstention_phrases)

    if is_abstention:
        # Check if chunks actually contain relevant info
        combined_text = " ".join(c.get("text", "") for c in chunks)
        query_words = set(query.lower().split())
        chunk_words = set(combined_text.lower().split())
        overlap = len(query_words & chunk_words) / len(query_words) if query_words else 0

        if overlap > 0.5:
            return DiagnosisResult(
                root_cause="context_interpretation",
                stage_failed="generation",
                confidence=0.7,
                details="Chunks contain relevant terms but LLM couldn't extract answer",
                suggestions=[
                    "Context may be fragmented across chunks",
                    "Try retrieving more chunks",
                    "Consider using reasoning-aware prompts"
                ],
                alternative_queries=[]
            )
        else:
            return DiagnosisResult(
                root_cause="topic_mismatch",
                stage_failed="retrieval",
                confidence=0.8,
                details="Retrieved chunks don't appear to cover the query topic",
                suggestions=[
                    "Query topic may not be in the document corpus",
                    "Try different terminology",
                    "Check if relevant documents are indexed"
                ],
                alternative_queries=_generate_alternative_queries(query)
            )

    # Case 4: Expected content not in answer
    if expected_content:
        expected_words = set(expected_content.lower().split())
        answer_words = set(answer.lower().split())
        coverage = len(expected_words & answer_words) / len(expected_words) if expected_words else 1

        if coverage < 0.3:
            # Check if expected content is in chunks
            combined_chunks = " ".join(c.get("text", "").lower() for c in chunks)
            in_chunks = any(word in combined_chunks for word in expected_words)

            if in_chunks:
                return DiagnosisResult(
                    root_cause="generation_miss",
                    stage_failed="generation",
                    confidence=0.7,
                    details="Expected information is in chunks but not in answer",
                    suggestions=[
                        "LLM may have focused on wrong parts of context",
                        "Try more specific prompting",
                        "Increase context relevance through reranking"
                    ],
                    alternative_queries=[]
                )
            else:
                return DiagnosisResult(
                    root_cause="retrieval_miss",
                    stage_failed="retrieval",
                    confidence=0.8,
                    details="Expected information not found in retrieved chunks",
                    suggestions=[
                        "Relevant chunks may not have been retrieved",
                        "Try different query formulation",
                        "Increase top_k for more coverage"
                    ],
                    alternative_queries=_generate_alternative_queries(query)
                )

    # Case 5: Default - unclear failure
    return DiagnosisResult(
        root_cause="unknown",
        stage_failed="unknown",
        confidence=0.5,
        details="Unable to determine specific failure cause",
        suggestions=[
            "Review the query for clarity",
            "Check chunk quality manually",
            "Try with different retrieval settings"
        ],
        alternative_queries=_generate_alternative_queries(query)
    )


def _generate_alternative_queries(query: str) -> List[str]:
    """Generate alternative query formulations."""
    alternatives = []

    # Remove question words (and their contracted "'s" forms) by matching whole
    # tokens, so words that merely contain them (e.g. "somewhat") are kept
    question_words = {"what", "how", "why", "when", "where", "who", "which"}
    words = []
    for token in query.lower().split():
        base = token[:-2] if token.endswith("'s") else token
        if base not in question_words:
            words.append(token)

    # Extract key terms
    words = [w for w in words if len(w) > 3]

    if words:
        # Just key terms
        alternatives.append(" ".join(words[:5]))
        # With "about"
        if len(words) >= 2:
            alternatives.append(f"about {words[0]} {words[1]}")

    return alternatives[:3]


def run_diagnostics_suite(
    query: str,
    chunks: List[Dict[str, Any]],
    answer: str
) -> Dict[str, Any]:
    """
    Run comprehensive diagnostics on a query result.

    Returns a detailed report for debugging.
    """
    diagnosis = diagnose_failure(query, chunks, answer)

    # Additional checks
    chunk_analysis = {
        "count": len(chunks),
        "avg_length": sum(len(c.get("text", "")) for c in chunks) / len(chunks) if chunks else 0,
        "sources": list(set(c.get("id", "").split("::")[0] for c in chunks if c.get("id"))),
        "score_range": (
            min(c.get("score", 0) for c in chunks) if chunks else 0,
            max(c.get("score", 0) for c in chunks) if chunks else 0
        )
    }

    answer_analysis = {
        "length": len(answer),
        "word_count": len(answer.split()),
        "has_citations": "[ID:" in answer,
        "is_abstention": any(p in answer.lower() for p in ["don't have", "cannot answer"])
    }

    return {
        "diagnosis": {
            "root_cause": diagnosis.root_cause,
            "stage_failed": diagnosis.stage_failed,
            "confidence": diagnosis.confidence,
            "details": diagnosis.details
        },
        "suggestions": diagnosis.suggestions,
        "alternative_queries": diagnosis.alternative_queries,
        "chunk_analysis": chunk_analysis,
        "answer_analysis": answer_analysis
    }
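

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): exercises diagnose_failure and
    # run_diagnostics_suite on hypothetical data. The chunk keys used here
    # ("id", "text", "score") and the "doc::chunk" id format are assumptions
    # based on the fields this module reads; adapt them to whatever the
    # retrieval stage actually produces.
    sample_query = "What is the refund policy?"
    sample_chunks = [
        {"id": "handbook.pdf::0", "text": "Refunds are issued within 30 days of purchase.", "score": 0.82},
        {"id": "handbook.pdf::3", "text": "Contact support to start a refund request.", "score": 0.67},
    ]
    sample_answer = "I don't have enough information to answer that."

    result = diagnose_failure(sample_query, sample_chunks, sample_answer)
    print(f"Root cause: {result.root_cause} "
          f"(stage: {result.stage_failed}, confidence: {result.confidence})")
    for suggestion in result.suggestions:
        print(f"  - {suggestion}")

    report = run_diagnostics_suite(sample_query, sample_chunks, sample_answer)
    print(f"Chunks analyzed: {report['chunk_analysis']['count']}, "
          f"abstention detected: {report['answer_analysis']['is_abstention']}")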