# Hallucination judge: grades an agent's answer for factual grounding
# against the retrieved context, using a structured-output LLM evaluator.
"""
Strict grading script built on LangChain's **with_structured_output**.

Routing the Judge LLM through a structured-output schema forces it to return
a strict JSON object: an integer score (1 for Pass, 0 for Fail) and a
reasoning string.
"""
from pydantic import BaseModel, Field
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv

# Load API credentials (e.g. the Groq key) from a local .env file.
load_dotenv()
# ==========================================
# 1. The Strict Grading Schema
# ==========================================
class HallucinationScore(BaseModel):
    """Structured verdict emitted by the judge LLM for one graded answer."""

    # Binary grounding verdict: 1 = fully grounded, 0 = hallucinated.
    score: int = Field(description="Return 1 if perfectly grounded. Return 0 if hallucinated.")
    # One-sentence justification accompanying the score.
    reasoning: str = Field(description="A 1-sentence explanation of why you gave this score.")
# ==========================================
# 2. Initialize the Impartial Judge
# ==========================================
# temperature=0 because grading must be strict and deterministic, not creative.
model_name_1 = "llama-3.1-70b-versatile"
model_name_2 = "llama-3.1-8b-instant"

judge_llm = ChatGroq(model=model_name_2, temperature=0)
# Coerce every judge reply into the HallucinationScore schema.
structured_judge = judge_llm.with_structured_output(HallucinationScore)
# ==========================================
# 3. The Grading Rubric (System Prompt)
# ==========================================
system_prompt = """You are an impartial AI Compliance Judge evaluating an Agent's response.
You will be given the 'Retrieved Context' from the database, and the 'Agent Answer'.
Your ONLY job is to check for HALLUCINATIONS.
RULE:
- If the Agent's answer contains ANY factual information, names, or numbers that are NOT present in the Retrieved Context, score it a 0.
- If the Agent's answer is strictly based ONLY on the context, score it a 1.
- Do not grade grammar or tone. Only grade factual grounding.
"""

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "Retrieved Context: \n\n {context} \n\n Agent Answer: \n\n {answer}"),
])

# LCEL pipeline: fill the prompt template, then run the structured judge on it.
evaluator = prompt | structured_judge
def check_hallucination(context: str, answer: str):
    """Grade one (context, answer) pair with the judge chain.

    Returns the judge's HallucinationScore on success, or None if the
    judge call raises for any reason (the error is printed, not re-raised).
    """
    print("\n⚖️ [JUDGE] Evaluating answer for hallucinations...")
    try:
        verdict = evaluator.invoke({"context": context, "answer": answer})
    except Exception as e:
        print(f"Judge Error: {e}")
        return None
    return verdict
if __name__ == "__main__":
    # The reality: What our Vector DB actually found.
    simulated_context = (
        "OmniRouter is an AI architecture that routes LLM requests. "
        "It supports OpenAI and Anthropic APIs."
    )

    def _report(result) -> None:
        """Print a judge verdict; tolerate a failed judge call.

        check_hallucination returns None on error — the original code
        dereferenced .score unconditionally and crashed with AttributeError.
        """
        if result is None:
            print("Judge call failed; no score available.")
            return
        print(f"Score: {result.score}/1")
        print(f"Reasoning: {result.reasoning}")

    print("\n========== TEST 1: The Good Agent ==========")
    # Fully grounded answer: every fact appears in the simulated context.
    good_answer = "OmniRouter routes requests and works with Anthropic and OpenAI."
    _report(check_hallucination(simulated_context, good_answer))

    print("\n========== TEST 2: The Hallucinating Agent ==========")
    # Injects "Google Gemini", which is absent from the context → expect 0.
    bad_answer = "OmniRouter routes requests and works with OpenAI, Anthropic, and Google Gemini."
    _report(check_hallucination(simulated_context, bad_answer))