| import os |
| import sys |
| import json |
| from pathlib import Path |
| from datetime import datetime |
| from dotenv import find_dotenv, load_dotenv |
|
|
# --- Import-time bootstrap (order matters) -----------------------------------
# Make the repository root importable so the `evaluation.eval_utils` import
# further down resolves regardless of the directory this script is run from.
REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
# Load .env before any os.getenv() call below; usecwd=True searches upward
# from the current working directory rather than from this file's location.
load_dotenv(find_dotenv(usecwd=True))
|
|
| from pydantic import SecretStr |
| from datasets import Dataset |
| from langchain_openai import ChatOpenAI |
| from ragas import evaluate |
| from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall, RougeScore |
| from ragas.llms import LangchainLLMWrapper |
| from ragas.embeddings import LangchainEmbeddingsWrapper |
| from ragas.run_config import RunConfig |
|
|
| from evaluation.eval_utils import load_csv_data, init_rag, generate_answers |
|
|
| |
# ---- Configuration ----------------------------------------------------------
# Paths are relative to REPO_ROOT; models/endpoints can be overridden via env.
CSV_PATH = "data/data.csv"          # evaluation question / ground-truth pairs
OUTPUT_DIR = "evaluation/results"   # where JSON/CSV result summaries are written
LLM_MODEL = os.getenv("EVAL_LLM_MODEL", "qwen/qwen3-32b")  # RAGAS judge LLM
# Overridable so other OpenAI-compatible endpoints can be used without editing
# the source; the default keeps the original hard-coded value.
# NOTE(review): SiliconFlow's public endpoint is commonly api.siliconflow.cn —
# confirm whether the .com default here is intentional.
API_BASE = os.getenv("EVAL_API_BASE", "https://api.siliconflow.com/v1")
|
|
|
|
# Result columns that RAGAS may emit which are inputs/echoes, not metric scores.
_NON_METRIC_COLS = frozenset({
    "question", "answer", "contexts", "ground_truth",
    "user_input", "response", "reference", "retrieved_contexts",
})


def _print_banner(title: str) -> None:
    """Print *title* framed by 60-character '=' rules."""
    rule = "=" * 60
    print(f"\n{rule}")
    print(title)
    print(rule)


def _make_evaluator_llm() -> LangchainLLMWrapper:
    """Build the RAGAS judge LLM backed by the OpenAI-compatible API endpoint.

    Raises:
        RuntimeError: If ``SILICONFLOW_API_KEY`` is not set — fail fast with a
            clear message instead of an opaque auth error mid-evaluation.
    """
    api_key = os.getenv("SILICONFLOW_API_KEY", "")
    if not api_key:
        raise RuntimeError(
            "SILICONFLOW_API_KEY is not set; the RAGAS judge LLM cannot authenticate."
        )
    return LangchainLLMWrapper(ChatOpenAI(
        model=LLM_MODEL,
        api_key=SecretStr(api_key),
        base_url=API_BASE,
        temperature=0,   # deterministic judging
        timeout=120,
        max_retries=3,
    ))


def _average_scores(df) -> dict:
    """Return the NaN-skipping mean of each metric column in *df*.

    Columns listed in ``_NON_METRIC_COLS`` are ignored; columns with no valid
    (non-NaN) values are dropped rather than reported as NaN.
    """
    averages = {}
    for col in df.columns:
        if col in _NON_METRIC_COLS:
            continue
        values = df[col].dropna()
        if len(values):
            averages[col] = float(values.mean())
    return averages


def _save_results(avg_scores: dict, retrieval_mode: str, sample_size: int):
    """Persist *avg_scores* as JSON and a one-row CSV under ``OUTPUT_DIR``.

    Returns:
        Tuple ``(json_path, csv_path)`` of the files written.
    """
    out_dir = REPO_ROOT / OUTPUT_DIR
    out_dir.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    stem = f"ragas_{retrieval_mode}_{timestamp}"

    json_path = out_dir / f"{stem}.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump({
            "retrieval_mode": retrieval_mode,
            "sample_size": sample_size,
            "timestamp": timestamp,
            "scores": avg_scores,
        }, f, ensure_ascii=False, indent=2)

    # Flat one-row CSV; metric names contain no commas, so plain writes suffice.
    csv_path = out_dir / f"{stem}.csv"
    with open(csv_path, 'w', encoding='utf-8') as f:
        f.write("retrieval_mode,sample_size," + ",".join(avg_scores) + "\n")
        f.write(
            f"{retrieval_mode},{sample_size},"
            + ",".join(f"{v:.4f}" for v in avg_scores.values()) + "\n"
        )

    return json_path, csv_path


def _print_summary(avg_scores: dict, retrieval_mode: str, sample_size: int) -> None:
    """Print each averaged metric with a 20-char ASCII bar (scores in [0, 1])."""
    _print_banner(f"RESULTS - {retrieval_mode} ({sample_size} samples)")
    for metric, score in avg_scores.items():
        filled = int(score * 20)
        bar = "#" * filled + "-" * (20 - filled)
        print(f" {metric:25} [{bar}] {score:.4f}")


def run_evaluation(sample_size: int = 10, retrieval_mode: str = "hybrid_rerank") -> dict:
    """Run a RAGAS evaluation of the RAG pipeline and persist the results.

    Loads question/ground-truth pairs from the project CSV, generates answers
    with the requested retrieval mode, scores them with RAGAS (faithfulness,
    answer relevancy, context precision/recall, ROUGE-1/2/L f-measure), and
    writes a JSON and a CSV summary under ``OUTPUT_DIR``.

    Args:
        sample_size: Number of rows to draw from the evaluation CSV.
        retrieval_mode: Retrieval strategy forwarded to the RAG pipeline
            (e.g. ``"hybrid_rerank"``).

    Returns:
        Mapping of metric name to its mean score over the evaluated samples.

    Raises:
        RuntimeError: If ``SILICONFLOW_API_KEY`` is missing from the environment.
    """
    _print_banner(f"RAGAS EVALUATION - Mode: {retrieval_mode}")

    rag, embeddings, llm_client = init_rag()

    questions, ground_truths = load_csv_data(str(REPO_ROOT / CSV_PATH), sample_size)
    print(f" Loaded {len(questions)} samples")

    answers, contexts = generate_answers(
        rag, questions, llm_client,
        llm_model=LLM_MODEL,
        retrieval_mode=retrieval_mode,
    )

    evaluator_llm = _make_evaluator_llm()
    evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings)

    dataset = Dataset.from_dict({
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": ground_truths,
    })

    print("\n Running RAGAS metrics...")
    results = evaluate(
        dataset=dataset,
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            # Explicit names keep the three ROUGE variants distinct in the
            # name-keyed results; without them all three share the default
            # metric name and later scores overwrite earlier ones.
            # NOTE(review): confirm the `name` kwarg against the installed
            # ragas version.
            RougeScore(name="rouge1_fmeasure", rouge_type='rouge1', mode='fmeasure'),
            RougeScore(name="rouge2_fmeasure", rouge_type='rouge2', mode='fmeasure'),
            RougeScore(name="rougeL_fmeasure", rouge_type='rougeL', mode='fmeasure'),
        ],
        llm=evaluator_llm,
        embeddings=evaluator_embeddings,
        raise_exceptions=False,  # keep scoring even if individual samples fail
        run_config=RunConfig(max_workers=8, timeout=600, max_retries=3),
    )

    avg_scores = _average_scores(results.to_pandas())

    json_path, csv_path = _save_results(avg_scores, retrieval_mode, len(questions))

    _print_summary(avg_scores, retrieval_mode, len(questions))
    print(f"\nSaved: {json_path}")
    print(f"Saved: {csv_path}")

    return avg_scores