"""LLM-as-judge evaluation of RAG answers through the Hugging Face Inference API.

Each metric sends a French grading prompt to the configured model, parses a
score in [0.0, 1.0] plus a short justification out of the reply, and
``evaluate_rag_response`` folds the four metrics into a weighted overall
score, a letter grade and a one-line summary.
"""

import json
import os
import re

from huggingface_hub import InferenceClient

HF_TOKEN = os.getenv("HF_TOKEN", "")
MODEL_NAME = os.getenv("HF_MODEL", "Qwen/Qwen2.5-72B-Instruct")

# Lazily created singleton; see _get_client().
_client: InferenceClient | None = None

# Markdown code fences the model may wrap around its JSON verdict.
_FENCE_RE = re.compile(r'```(?:json)?\s*|```')
# Outermost {...} span, used to dig a JSON verdict out of surrounding prose.
_JSON_OBJECT_RE = re.compile(r'\{.*\}', re.DOTALL)
# Fallback used when the reply is not valid JSON at all.
_REASON_RE = re.compile(r'"reason"\s*:\s*"([^"]+)"')
# A standalone number between 0 and 10 (optionally with decimals). The
# lookarounds reject fragments of longer tokens such as "12.5", "2024" or
# "1.2.3", while still accepting a number followed by sentence punctuation.
_NUMBER_RE = re.compile(
    r'(?<![\w.])(10(?:\.0+)?|[0-9](?:\.[0-9]+)?)(?!\w|\.\d)'
)
# JSON keys that may carry the numeric verdict, in priority order.
_SCORE_KEYS = ("score", "value", "result", "rating")
# Last-resort mapping from qualitative wording to a score, checked in order.
# Keywords are anchored at a word start so that e.g. "faithfully" no longer
# counts as "fully"; the short negations must match as whole words so that
# "know" or "note" no longer count as "no" / "not".
_KEYWORD_TIERS = (
    (0.9, re.compile(r'\b(?:excellent|perfect|fully|completely)')),
    (0.7, re.compile(r'\b(?:good|mostly|largely)')),
    (0.5, re.compile(r'\b(?:partial|somewhat|moderate)')),
    (0.3, re.compile(r'\b(?:poor|barely|little)')),
    (0.1, re.compile(r'\b(?:no|none|not)\b|\bfail')),
)

# Output contract appended to every grading prompt. The placeholders give the
# model a well-formed template to fill in.
_ANSWER_FORMAT = (
    'Réponds UNIQUEMENT avec : '
    '{"score": <nombre entre 0.0 et 1.0>, "reason": "<justification courte>"}'
)

# Relative importance of each metric in the overall score.
_METRIC_WEIGHTS = {
    "faithfulness": 0.35,
    "answer_relevancy": 0.30,
    "context_recall": 0.20,
    "hallucination": 0.15,
}

# Display labels used in the generated summary.
_METRIC_LABELS = {
    "faithfulness": "Fidélité",
    "answer_relevancy": "Pertinence",
    "context_recall": "Rappel contexte",
    "hallucination": "Hallucination",
}


def _get_client() -> InferenceClient:
    """Return the shared ``InferenceClient``, creating it on first use.

    The client targets ``MODEL_NAME``; an empty ``HF_TOKEN`` is passed as
    ``None``.
    """
    global _client
    if _client is None:
        _client = InferenceClient(model=MODEL_NAME, token=HF_TOKEN or None)
    return _client


def _call_hf(prompt: str, max_tokens: int = 256, temperature: float = 0.1) -> str:
    """Send ``prompt`` to the model and return the stripped completion text.

    Args:
        prompt: Full prompt text.
        max_tokens: Upper bound on newly generated tokens.
        temperature: Forwarded to the API unchanged.
            NOTE(review): presumably has no effect while ``do_sample`` is
            False -- confirm against the backend in use.

    Returns:
        The generated text without the prompt, whitespace-stripped.
    """
    client = _get_client()
    response = client.text_generation(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        do_sample=False,  # deterministic for evaluation
        return_full_text=False,
    )
    return response.strip()


def _load_json_object(raw: str) -> dict | None:
    """Return the JSON object contained in ``raw``, or ``None`` if there is none.

    Code fences are removed first. The whole reply is tried as JSON, then the
    outermost ``{...}`` span, so a verdict wrapped in explanatory prose is
    still recognised.
    """
    cleaned = _FENCE_RE.sub('', raw).strip()
    candidates = [cleaned]
    embedded = _JSON_OBJECT_RE.search(cleaned)
    if embedded is not None and embedded.group(0) != cleaned:
        candidates.append(embedded.group(0))
    for candidate in candidates:
        try:
            data = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError
            continue
        if isinstance(data, dict):
            return data
    return None


def _normalize_score(val: float) -> float:
    """Clamp ``val`` to [0.0, 1.0], reading values above 1 as a 0-10 scale."""
    return max(0.0, min(1.0, val if val <= 1.0 else val / 10.0))


def _extract_score(raw: str) -> float:
    """Extract a score in [0.0, 1.0] from a model reply.

    Strategies, in order:
      1. a numeric value under one of ``_SCORE_KEYS`` in a JSON object;
      2. the first standalone number between 0 and 10 in the text
         (numbers above 1 are divided by 10);
      3. qualitative English keywords (see ``_KEYWORD_TIERS``);
      4. a neutral 0.5 when nothing matches.
    """
    data = _load_json_object(raw)
    if data is not None:
        for key in _SCORE_KEYS:
            if key in data:
                try:
                    return _normalize_score(float(data[key]))
                except (TypeError, ValueError):
                    # Unusable value (null, text, ...): fall back to scanning
                    # the raw text instead of trying lower-priority keys.
                    break

    number = _NUMBER_RE.search(raw)
    if number is not None:
        return _normalize_score(float(number.group(1)))

    raw_lower = raw.lower()
    for score, pattern in _KEYWORD_TIERS:
        if pattern.search(raw_lower):
            return score
    return 0.5


def _parse_result(raw: str) -> tuple[float, str]:
    """Parse a model reply into ``(score rounded to 2 decimals, reason)``.

    The reason comes from the ``"reason"`` key of the JSON verdict when one
    can be parsed, otherwise from a ``"reason": "..."`` fragment in the raw
    text; it defaults to ``"No reason provided."`` and is always a string.
    """
    score = _extract_score(raw)
    reason = "No reason provided."
    data = _load_json_object(raw)
    if data is not None:
        value = data.get("reason")
        if value is not None:
            reason = str(value)
    else:
        fragment = _REASON_RE.search(raw)
        if fragment is not None:
            reason = fragment.group(1)
    return round(score, 2), reason


def _judge(prompt: str) -> dict:
    """Run one grading prompt and package the verdict.

    Returns:
        ``{"score": float, "reason": str, "raw": str}`` where ``raw`` is the
        first 200 characters of the model reply.
    """
    raw = _call_hf(prompt)
    score, reason = _parse_result(raw)
    return {"score": score, "reason": reason, "raw": raw[:200]}


# ── Evaluation functions ──────────────────────────────────────────────────────

def evaluate_faithfulness(question: str, context: str, answer: str) -> dict:
    """Grade whether ``answer`` is grounded in ``context``.

    The context is truncated to 2000 characters and the answer to 1000
    before being sent. Returns the dict produced by ``_judge``
    (``score``, ``reason``, ``raw``); 1.0 means fully grounded.
    """
    prompt = (
        "[INST] Tu es un évaluateur RAG expert. Évalue la FIDÉLITÉ de la réponse.\n"
        "La fidélité mesure si la réponse est entièrement fondée sur le contexte fourni.\n"
        "\n"
        f"Question : {question}\n"
        f"Contexte : {context[:2000]}\n"
        f"Réponse : {answer[:1000]}\n"
        "\n"
        "Note de 0.0 à 1.0 (1.0 = entièrement fondée sur le contexte, "
        "0.0 = totalement hallucinée).\n"
        f"{_ANSWER_FORMAT} [/INST]\n"
    )
    return _judge(prompt)


def evaluate_answer_relevancy(question: str, answer: str) -> dict:
    """Grade whether ``answer`` directly addresses ``question``.

    The answer is truncated to 1000 characters before being sent. Returns the
    dict produced by ``_judge`` (``score``, ``reason``, ``raw``); 1.0 means
    the answer fits the question perfectly.
    """
    prompt = (
        "[INST] Tu es un évaluateur RAG expert. Évalue la PERTINENCE DE LA RÉPONSE.\n"
        "La pertinence mesure si la réponse répond directement à la question posée.\n"
        "\n"
        f"Question : {question}\n"
        f"Réponse : {answer[:1000]}\n"
        "\n"
        "Note de 0.0 à 1.0 (1.0 = répond parfaitement, 0.0 = hors sujet).\n"
        f"{_ANSWER_FORMAT} [/INST]\n"
    )
    return _judge(prompt)


def evaluate_context_recall(question: str, context: str) -> dict:
    """Grade whether ``context`` contains what is needed to answer ``question``.

    The context is truncated to 2000 characters before being sent. Returns
    the dict produced by ``_judge`` (``score``, ``reason``, ``raw``); 1.0
    means an ideal context.
    """
    prompt = (
        "[INST] Tu es un évaluateur RAG expert. Évalue le RAPPEL DU CONTEXTE.\n"
        "Mesure si le contexte récupéré contient les informations nécessaires "
        "pour répondre à la question.\n"
        "\n"
        f"Question : {question}\n"
        f"Contexte récupéré : {context[:2000]}\n"
        "\n"
        "Note de 0.0 à 1.0 (1.0 = contexte idéal, 0.0 = contexte inutile).\n"
        f"{_ANSWER_FORMAT} [/INST]\n"
    )
    return _judge(prompt)


def evaluate_hallucination(question: str, context: str, answer: str) -> dict:
    """Grade how free ``answer`` is of content absent from ``context``.

    The context is truncated to 2000 characters and the answer to 1000
    before being sent. Returns the dict produced by ``_judge``
    (``score``, ``reason``, ``raw``); note the scale is inverted relative to
    the metric name: 1.0 means *no* hallucination.
    """
    prompt = (
        "[INST] Tu es un évaluateur RAG expert. Détecte les HALLUCINATIONS dans la réponse.\n"
        "Une hallucination = information présente dans la réponse mais ABSENTE "
        "du contexte et non-connaissance générale.\n"
        "\n"
        f"Question : {question}\n"
        f"Contexte : {context[:2000]}\n"
        f"Réponse : {answer[:1000]}\n"
        "\n"
        "Note de 0.0 à 1.0 (1.0 = aucune hallucination, 0.0 = totalement hallucinée).\n"
        f"{_ANSWER_FORMAT} [/INST]\n"
    )
    return _judge(prompt)


def _grade(overall: float) -> str:
    """Map an overall score to a letter grade (A >= 0.85 ... F < 0.40)."""
    for threshold, letter in ((0.85, "A"), (0.70, "B"), (0.55, "C"), (0.40, "D")):
        if overall >= threshold:
            return letter
    return "F"


def evaluate_rag_response(question: str, context: str, answer: str) -> dict:
    """Run all four metrics and combine them into an overall verdict.

    Each metric is evaluated independently; if one raises, it is recorded as
    ``{"score": 0.0, "reason": <error text>, "error": True}`` and the others
    still run. The overall score is the weighted mean of the metrics that
    succeeded: weights are renormalised over those metrics so that a failed
    call does not count as a genuine zero. It is 0.0 when every metric failed.

    Args:
        question: The user question.
        context: The retrieved context handed to the RAG model.
        answer: The RAG model's answer.

    Returns:
        A dict with ``question``, ``overall_score`` (rounded to 2 decimals),
        ``grade`` (A-F), ``metrics`` (per-metric result dicts) and
        ``summary`` (French one-line verdict).
    """
    print(f"[RAG EVAL] Démarrage pour : {question[:80]}")

    metrics = (
        ("faithfulness", evaluate_faithfulness, (question, context, answer)),
        ("answer_relevancy", evaluate_answer_relevancy, (question, answer)),
        ("context_recall", evaluate_context_recall, (question, context)),
        ("hallucination", evaluate_hallucination, (question, context, answer)),
    )

    results: dict[str, dict] = {}
    for key, fn, args in metrics:
        # Deliberately broad: any failure of one metric (network, API, parsing)
        # must not abort the remaining metrics; the error is kept in the result.
        try:
            results[key] = fn(*args)
            print(f"[RAG EVAL] {key}: {results[key]['score']}")
        except Exception as e:
            results[key] = {"score": 0.0, "reason": str(e), "error": True}

    usable = {
        key: weight
        for key, weight in _METRIC_WEIGHTS.items()
        if not results[key].get("error")
    }
    total_weight = sum(usable.values())
    if total_weight:
        weighted = sum(results[key]["score"] * weight for key, weight in usable.items())
        overall = round(weighted / total_weight, 2)
    else:
        overall = 0.0

    grade = _grade(overall)
    print(f"[RAG EVAL] Overall: {overall} ({grade})")

    return {
        "question": question,
        "overall_score": overall,
        "grade": grade,
        "metrics": results,
        "summary": _generate_summary(overall, results),
    }


def _generate_summary(overall: float, results: dict) -> str:
    """Build the French one-line summary for an evaluation.

    Args:
        overall: Overall score in [0.0, 1.0]; selects the verdict sentence.
        results: Per-metric result dicts keyed by metric name. Metrics flagged
            with ``"error"`` are ignored; scores >= 0.8 are listed as
            strengths and scores < 0.5 as points to improve.

    Returns:
        The verdict sentence, followed by the strengths / weaknesses
        sentences when there are any.
    """
    scored = [
        # Unknown metric names fall back to their raw key instead of raising.
        (_METRIC_LABELS.get(key, key), value["score"])
        for key, value in results.items()
        if not value.get("error")
    ]
    weak = [label for label, score in scored if score < 0.5]
    strong = [label for label, score in scored if score >= 0.8]

    if overall >= 0.85:
        verdict = "Excellente réponse RAG."
    elif overall >= 0.70:
        verdict = "Bonne réponse avec quelques défauts mineurs."
    elif overall >= 0.50:
        verdict = "Réponse acceptable — qualité du contexte à améliorer."
    else:
        verdict = "Réponse insuffisante — uploadez des documents plus pertinents."

    parts = []
    if strong:
        parts.append(f"Points forts : {', '.join(strong)}.")
    if weak:
        parts.append(f"À améliorer : {', '.join(weak)}.")
    return verdict + (" " + " ".join(parts) if parts else "")