"""RAGAS evaluation runner for the RAG pipeline.

Loads a CSV of questions and ground truths, generates answers with the
configured retrieval mode, scores them with RAGAS metrics (plus ROUGE),
and writes a detailed JSON file and a one-row CSV summary under
``evaluation/results``.
"""

import csv
import json
import os
import sys
from datetime import datetime
from pathlib import Path

from dotenv import find_dotenv, load_dotenv

# Make the repository root importable before pulling in project-local modules.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

# Load environment variables (API keys, model overrides) before client setup.
load_dotenv(find_dotenv(usecwd=True))

from pydantic import SecretStr
from datasets import Dataset
from langchain_openai import ChatOpenAI
from ragas import evaluate
from ragas.metrics import (
    RougeScore,
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.run_config import RunConfig

from evaluation.eval_utils import load_csv_data, init_rag, generate_answers

# Configuration
CSV_PATH = "data/data.csv"
OUTPUT_DIR = "evaluation/results"
LLM_MODEL = os.getenv("EVAL_LLM_MODEL", "qwen/qwen3-32b")
API_BASE = "https://api.siliconflow.com/v1"

# Columns in the RAGAS result frame that hold inputs, not metric scores.
_NON_METRIC_COLS = frozenset({
    "question", "answer", "contexts", "ground_truth",
    "user_input", "response", "reference", "retrieved_contexts",
})


def _build_metrics() -> list:
    """Return the metric set used for evaluation.

    Each RougeScore is given an explicit unique ``name``: otherwise all
    three instances share the default metric name and their score columns
    collide in ``results.to_pandas()``.
    """
    return [
        faithfulness,       # faithfulness of the answer to the retrieved context
        answer_relevancy,   # relevance of the answer to the question
        context_precision,  # precision of the retrieved context
        context_recall,     # coverage of the retrieved context
        RougeScore(name="rouge1_fmeasure", rouge_type="rouge1", mode="fmeasure"),
        RougeScore(name="rouge2_fmeasure", rouge_type="rouge2", mode="fmeasure"),
        RougeScore(name="rougeL_fmeasure", rouge_type="rougeL", mode="fmeasure"),
    ]


def _average_scores(df) -> dict:
    """Average each metric column of the RAGAS result frame.

    NaNs (failed rows) are skipped; columns with no valid values are dropped.
    """
    avg_scores = {}
    for col in df.columns:
        if col in _NON_METRIC_COLS:
            continue
        values = df[col].dropna().tolist()
        if values:
            avg_scores[col] = sum(values) / len(values)
    return avg_scores


def _save_results(out_dir: Path, retrieval_mode: str, sample_size: int,
                  avg_scores: dict, timestamp: str) -> tuple:
    """Write the detailed JSON and one-row CSV summary; return both paths."""
    out_dir.mkdir(parents=True, exist_ok=True)

    json_path = out_dir / f"ragas_{retrieval_mode}_{timestamp}.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            "retrieval_mode": retrieval_mode,
            "sample_size": sample_size,
            "timestamp": timestamp,
            "scores": avg_scores,
        }, f, ensure_ascii=False, indent=2)

    # Use the csv module so metric names containing commas are quoted correctly.
    csv_path = out_dir / f"ragas_{retrieval_mode}_{timestamp}.csv"
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["retrieval_mode", "sample_size", *avg_scores.keys()])
        writer.writerow([retrieval_mode, sample_size,
                         *(f"{v:.4f}" for v in avg_scores.values())])

    return json_path, csv_path


def _print_scores(retrieval_mode: str, sample_size: int, avg_scores: dict) -> None:
    """Print a banner and a 20-char ASCII bar per metric (scores in [0, 1])."""
    print(f"\n{'='*60}")
    print(f"RESULTS - {retrieval_mode} ({sample_size} samples)")
    print(f"{'='*60}")
    for metric, score in avg_scores.items():
        filled = int(score * 20)
        bar = "#" * filled + "-" * (20 - filled)
        print(f" {metric:25} [{bar}] {score:.4f}")


def run_evaluation(sample_size: int = 10, retrieval_mode: str = "hybrid_rerank") -> dict:
    """Run the full RAGAS evaluation pipeline for one retrieval mode.

    Args:
        sample_size: number of question/ground-truth pairs to evaluate.
        retrieval_mode: retrieval strategy passed through to the RAG system.

    Returns:
        Mapping of metric name -> average score over the evaluated samples.

    Raises:
        RuntimeError: if SILICONFLOW_API_KEY is not set in the environment.
    """
    print(f"\n{'='*60}")
    print(f"RAGAS EVALUATION - Mode: {retrieval_mode}")
    print(f"{'='*60}")

    # Initialize RAG components
    rag, embeddings, llm_client = init_rag()

    # Load test data
    questions, ground_truths = load_csv_data(str(REPO_ROOT / CSV_PATH), sample_size)
    print(f" Loaded {len(questions)} samples")

    # Generate answers
    answers, contexts = generate_answers(
        rag,
        questions,
        llm_client,
        llm_model=LLM_MODEL,
        retrieval_mode=retrieval_mode,
    )

    # Setup RAGAS evaluator — fail fast on a missing key instead of an
    # opaque HTTP 401 halfway through the evaluation run.
    api_key = os.getenv("SILICONFLOW_API_KEY", "")
    if not api_key:
        raise RuntimeError(
            "SILICONFLOW_API_KEY is not set; cannot run RAGAS evaluation"
        )
    evaluator_llm = LangchainLLMWrapper(ChatOpenAI(
        model=LLM_MODEL,
        api_key=SecretStr(api_key),
        base_url=API_BASE,
        temperature=0,
        timeout=120,
        max_retries=3,
    ))
    evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings)

    # Convert data to Dataset format
    dataset = Dataset.from_dict({
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": ground_truths,
    })

    # Run RAGAS evaluation
    print("\n Running RAGAS metrics...")
    results = evaluate(
        dataset=dataset,
        metrics=_build_metrics(),
        llm=evaluator_llm,
        embeddings=evaluator_embeddings,
        raise_exceptions=False,
        run_config=RunConfig(max_workers=8, timeout=600, max_retries=3),
    )

    # Aggregate, persist, and report
    avg_scores = _average_scores(results.to_pandas())
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    json_path, csv_path = _save_results(
        REPO_ROOT / OUTPUT_DIR, retrieval_mode, len(questions),
        avg_scores, timestamp,
    )
    _print_scores(retrieval_mode, len(questions), avg_scores)
    print(f"\nSaved: {json_path}")
    print(f"Saved: {csv_path}")

    return avg_scores