from typing import List, Dict, Any, Optional

import redis
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks, Request, UploadFile, File, Form, Query

from ...api.auth import get_current_user, User
from ...api.models import *  # noqa: F403 -- request/response schemas shared by the routers in this module
from ...config import settings
from ...core.neo4j_store import Neo4jStore
from ...core.storage import get_storage
from ...ingestion.pipeline import IngestionPipeline
from ...retrieval.agent import AgentRetrievalSystem
from ..dependencies import get_graph_store, get_retrieval_agent, get_ingestion_pipeline, get_redis_client

router = APIRouter()
storage = get_storage()
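
# NOTE: The Eval* schemas come from api/models.py via the star import above; those
# definitions are authoritative. For orientation only, their approximate shape as
# inferred from how the handlers below use them:
#
#   class EvalRequest(BaseModel):
#       question: str
#       answer: str
#       contexts: List[str]
#       ground_truth: Optional[str] = None
#       document_id: Optional[str] = None
#
#   class EvalResponse(BaseModel):
#       question: str
#       faithfulness: float
#       answer_relevancy: float
#       context_precision: float
#       overall_score: float
#       hallucination_detected: bool
#       eval_id: str  # whatever id type save_eval_result returns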

@router.post("/api/eval/score", response_model=EvalResponse, tags=["Evaluation"])
async def evaluate_response(
    request: EvalRequest,
    http_request: Request,  # needed for app.state access when persisting the result
    current_user: User = Depends(get_current_user)
):
    """
    Run RAGAS-style quality evaluation on a Q&A pair.
    Measures faithfulness, relevancy, and context precision.
    Results are persisted in Neo4j for the quality dashboard.
    """
    from ...retrieval.tools import RAGEvaluator
    from ...core.llm_factory import LLMFactory
    from ...core.models import EvalResult

    llm = LLMFactory.create(provider=settings.default_llm_provider)
    evaluator = RAGEvaluator(llm)

    metrics = await evaluator.evaluate(
        question=request.question,
        answer=request.answer,
        contexts=request.contexts,
        ground_truth=request.ground_truth
    )

    eval_record = EvalResult(
        question=request.question,
        answer=request.answer,
        faithfulness=metrics["faithfulness"],
        answer_relevancy=metrics["answer_relevancy"],
        context_precision=metrics["context_precision"],
        overall_score=metrics["overall_score"],
        hallucination_detected=metrics["hallucination_detected"],
        document_id=request.document_id
    )
    eval_id = await http_request.app.state.graph_store.save_eval_result(eval_record)

    return EvalResponse(
        question=request.question,
        faithfulness=metrics["faithfulness"],
        answer_relevancy=metrics["answer_relevancy"],
        context_precision=metrics["context_precision"],
        overall_score=metrics["overall_score"],
        hallucination_detected=metrics["hallucination_detected"],
        eval_id=eval_id
    )
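
# Illustrative only (not part of the router): a sketch of how a client might call
# the scoring endpoint, assuming the API is served at http://localhost:8000 and
# `token` holds a bearer token accepted by get_current_user. Field names mirror
# the EvalRequest usage above; the httpx dependency is an assumption.
#
#   import httpx
#
#   async def score_example(token: str) -> dict:
#       async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
#           resp = await client.post(
#               "/api/eval/score",
#               headers={"Authorization": f"Bearer {token}"},
#               json={
#                   "question": "What does the warranty cover?",
#                   "answer": "The warranty covers manufacturing defects for two years.",
#                   "contexts": ["Manufacturing defects are covered for 24 months."],
#                   "ground_truth": None,
#                   "document_id": None,
#               },
#           )
#           resp.raise_for_status()
#           return resp.json()  # faithfulness, answer_relevancy, context_precision, ...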



@router.get("/api/eval/dashboard", response_model=EvalDashboardResponse, tags=["Evaluation"])
async def get_eval_dashboard(
    request: Request,
    limit: int = 100,
    current_user: User = Depends(get_current_user)
):
    """Retrieve evaluation history for the quality dashboard"""
    rows = await request.app.state.graph_store.get_eval_results(limit=limit)

    if not rows:
        return EvalDashboardResponse(
            total_evaluations=0,
            avg_overall_score=0.0,
            avg_faithfulness=0.0,
            avg_relevancy=0.0,
            hallucination_rate=0.0,
            trend_data=[]
        )

    total = len(rows)
    avg_score = sum(r.get("overall_score", 0) for r in rows) / total
    avg_faith = sum(r.get("faithfulness", 0) for r in rows) / total
    avg_rel = sum(r.get("answer_relevancy", 0) for r in rows) / total
    hall_rate = sum(1 for r in rows if r.get("hallucination_detected")) / total

    trend = [
        EvalTrendPoint(
            timestamp=str(r.get("timestamp", ""))[:19],
            overall_score=r.get("overall_score", 0.0),
            faithfulness=r.get("faithfulness", 0.0),
            answer_relevancy=r.get("answer_relevancy", 0.0),
            hallucination_detected=bool(r.get("hallucination_detected")),
            document_id=r.get("document_id")
        )
        for r in rows
    ]

    return EvalDashboardResponse(
        total_evaluations=total,
        avg_overall_score=round(avg_score, 4),
        avg_faithfulness=round(avg_faith, 4),
        avg_relevancy=round(avg_rel, 4),
        hallucination_rate=round(hall_rate, 4),
        trend_data=trend
    )
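
# Illustrative only: fetching the dashboard aggregates from a client, under the
# same assumptions as the scoring example above (httpx, localhost:8000, bearer auth).
#
#   async def dashboard_example(token: str, limit: int = 50) -> dict:
#       async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
#           resp = await client.get(
#               "/api/eval/dashboard",
#               params={"limit": limit},
#               headers={"Authorization": f"Bearer {token}"},
#           )
#           resp.raise_for_status()
#           return resp.json()  # total_evaluations, avg_overall_score, trend_data, ...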


# ── Gap #2: Community Detection Endpoints ─────────────────────────────────────