"""
groundlens — Geometric LLM Hallucination Detection Demo

Plain-language interface: paste a question and the AI's answer, optionally
upload context (PDF, Excel, or plain text). Compares groundlens (embedding
geometry) vs Vectara HHEM-2.1-Open.

Models load once at module level to avoid cold-start on Space wake.
"""

import logging
import os
import time

import gradio as gr

from groundlens import compute_sgi, compute_dgi

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# ─────────────────────────────────────────────────────────────────────────────
# FILE EXTRACTION — PDF and Excel support
# ─────────────────────────────────────────────────────────────────────────────

def extract_pdf_text(file_path: str, max_chars: int = 8000) -> str:
    """Extract text from a PDF file.

    Reads at most the first 20 pages (bounds latency/memory on big uploads)
    and truncates the result to ``max_chars`` characters. On failure returns
    a bracketed error string ("[Could not read PDF: ...]") instead of
    raising, so the UI can surface the problem inline.
    """
    try:
        import pdfplumber  # imported lazily: only needed when a PDF arrives

        text_parts = []
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages[:20]:
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)
        full_text = "\n\n".join(text_parts)
        return full_text[:max_chars] if len(full_text) > max_chars else full_text
    except Exception as e:
        return f"[Could not read PDF: {e}]"


def extract_excel_text(file_path: str, max_chars: int = 8000) -> str:
    """Extract text from an Excel file.

    Reads at most 5 sheets × 200 rows each, rendering every non-empty row as
    " | "-joined cells under a "--- <sheet> ---" header. On failure returns
    a bracketed error string instead of raising.
    """
    try:
        import openpyxl  # imported lazily: only needed when a workbook arrives

        wb = openpyxl.load_workbook(file_path, data_only=True)
        text_parts = []
        for sheet_name in wb.sheetnames[:5]:
            ws = wb[sheet_name]
            text_parts.append(f"--- {sheet_name} ---")
            for row in ws.iter_rows(max_row=200, values_only=True):
                cells = [str(c) if c is not None else "" for c in row]
                # FIX: the previous check compared the *stripped* joined line
                # against an *unstripped* " | ".join of empties, which can
                # never match — all-empty rows leaked through as "|" junk.
                # Skip the row unless at least one cell holds content.
                if any(cells):
                    text_parts.append(" | ".join(cells).strip())
        full_text = "\n".join(text_parts)
        return full_text[:max_chars] if len(full_text) > max_chars else full_text
    except Exception as e:
        return f"[Could not read Excel file: {e}]"


def extract_file_to_text(file) -> str:
    """Extract text from an uploaded file and return it for the textbox.

    Dispatches on file extension (PDF / Excel / plain text). Successful
    extractions are prefixed with an "[Extracted from <name>]" header so the
    UI can show provenance; error results keep their bracketed "[...]" form
    so callers can detect them with ``startswith``.
    """
    if file is None:
        return ""
    # Gradio may hand us a tempfile wrapper (with .name) or a plain path str.
    file_path = file.name if hasattr(file, "name") else str(file)
    ext = os.path.splitext(file_path)[1].lower()
    basename = os.path.basename(file_path)
    if ext == ".pdf":
        text = extract_pdf_text(file_path)
    elif ext in (".xlsx", ".xls"):
        text = extract_excel_text(file_path)
    elif ext in (".txt", ".md", ".csv"):
        try:
            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                text = f.read(8000)  # same size cap as the other extractors
        except Exception as e:
            text = f"[Could not read file: {e}]"
    else:
        text = f"[Unsupported file type: {ext}. Use PDF, Excel, TXT, or CSV.]"
    if text and not text.startswith("["):
        return f"[Extracted from {basename}]\n\n{text}"
    return text


# ─────────────────────────────────────────────────────────────────────────────
# HHEM-2.1-Open — baseline comparison
# ─────────────────────────────────────────────────────────────────────────────

logger.info("Loading HHEM-2.1-Open...")
from transformers import AutoModelForSequenceClassification

# NOTE(security): trust_remote_code executes Python shipped in the model repo
# at load time. The repo is pinned to Vectara's official model; review before
# ever changing this identifier.
_hhem = AutoModelForSequenceClassification.from_pretrained(
    "vectara/hallucination_evaluation_model",
    trust_remote_code=True,
)
logger.info("HHEM loaded.")

# Warm up groundlens embedding model so the first real request is fast.
logger.info("Warming up groundlens...")
compute_dgi(question="warmup", response="warmup")
logger.info("groundlens ready.")


# ─────────────────────────────────────────────────────────────────────────────
# SCORING
# ─────────────────────────────────────────────────────────────────────────────

def score_groundlens(question: str, response: str, context: str) -> dict:
    """Score a (question, response) pair with groundlens.

    Uses SGI when source context is provided (answer should be grounded in
    that context) and DGI otherwise (answer judged on its own geometric
    profile). Returns a plain dict consumed by the rendering layer.
    """
    start = time.perf_counter()
    has_context = bool(context.strip())
    if has_context:
        result = compute_sgi(
            question=question,
            context=context,
            response=response,
        )
        method = "SGI (with context)"
        raw_score = result.value
        grounded = not result.flagged
        threshold = 0.95
        mode_note = (
            "Measured how much the AI's answer used your source document "
            "vs. just rephrasing the question."
        )
    else:
        result = compute_dgi(
            question=question,
            response=response,
        )
        method = "DGI (without context)"
        raw_score = result.value
        grounded = not result.flagged
        threshold = 0.30
        mode_note = (
            "Measured whether the AI's answer follows patterns typical "
            "of grounded, factual responses."
        )
    elapsed_ms = (time.perf_counter() - start) * 1000
    return {
        "method": method,
        "raw_score": round(raw_score, 4),
        "grounded": grounded,
        "threshold": threshold,
        "elapsed_ms": round(elapsed_ms, 1),
        "mode_note": mode_note,
    }


def score_hhem(question: str, response: str, context: str) -> dict:
    """Score a (question, response) pair with Vectara HHEM-2.1-Open.

    The premise is "context + question" when context exists, else just the
    question, truncated to 1800 chars to stay within the model's input
    budget. Scores >= 0.5 count as consistent (per HHEM's model card).
    """
    has_context = bool(context.strip())
    premise = (
        f"{context.strip()}\n\n{question}".strip() if has_context else question
    )
    if len(premise) > 1800:
        premise = premise[:1800]
    start = time.perf_counter()
    scores = _hhem.predict([(premise, response)])
    raw_score = float(scores[0])
    elapsed_ms = (time.perf_counter() - start) * 1000
    return {
        "method": "HHEM-2.1-Open",
        "raw_score": round(raw_score, 4),
        "grounded": raw_score >= 0.5,
        "elapsed_ms": round(elapsed_ms, 1),
        "label": "consistent" if raw_score >= 0.5 else "hallucinated",
    }


# ─────────────────────────────────────────────────────────────────────────────
# MAIN COMPARISON — now takes only text inputs (no file object)
# ─────────────────────────────────────────────────────────────────────────────

def run_comparison(
    question: str, context_text: str, response: str
) -> tuple[str, str, str]:
    """Run both detectors and render three Markdown panels.

    Returns (groundlens panel, HHEM panel, agreement panel). Input
    validation failures return a warning in the first slot and empty
    strings for the other two.
    """
    if not question.strip():
        return "⚠️ Enter the question you asked the AI.", "", ""
    if not response.strip():
        return "⚠️ Enter the AI's response.", "", ""

    # Strip the "[Extracted from ...]" header if present — it is UI
    # provenance added by extract_file_to_text, not source material.
    context = context_text.strip()
    if context.startswith("[Extracted from "):
        newline_pos = context.find("\n")
        if newline_pos > 0:
            context = context[newline_pos:].strip()

    gl = score_groundlens(question, response, context)
    hhem = score_hhem(question, response, context)

    # groundlens result
    if gl["grounded"]:
        gl_verdict = "🟢 Looks grounded"
        gl_explain = "The AI's answer appears to be based on real information."
    else:
        gl_verdict = "🔴 Possible hallucination"
        gl_explain = (
            "The AI's answer shows signs of being fabricated or not "
            "grounded in the source."
        )

    gl_md = f"""### groundlens

**{gl_verdict}**

{gl_explain}

| | |
|---|---|
| **Method** | {gl["method"]} |
| **Score** | {gl["raw_score"]} (threshold: {gl["threshold"]}) |
| **Time** | {gl["elapsed_ms"]} ms |

*{gl["mode_note"]}*"""

    # HHEM result
    if hhem["grounded"]:
        hhem_verdict = "🟢 Looks consistent"
        hhem_explain = (
            "The classifier considers this answer consistent with the input."
        )
    else:
        hhem_verdict = "🔴 Possible hallucination"
        hhem_explain = (
            "The classifier flagged this answer as potentially hallucinated."
        )

    hhem_md = f"""### Vectara HHEM-2.1-Open

**{hhem_verdict}**

{hhem_explain}

| | |
|---|---|
| **Method** | {hhem["method"]} |
| **Score** | {hhem["raw_score"]} ({hhem["label"]}) |
| **Time** | {hhem["elapsed_ms"]} ms |

*Fine-tuned flan-T5 classifier.*"""

    # Agreement
    agree = gl["grounded"] == hhem["grounded"]
    if agree and gl["grounded"]:
        agreement_md = "### 🔵 Both methods agree: the answer looks reliable."
    elif agree and not gl["grounded"]:
        agreement_md = (
            "### 🔴 Both methods agree: this answer is likely hallucinated."
        )
    else:
        agreement_md = """### 🟠 The two methods disagree.

This often happens with **subtle factual errors** — the answer sounds right \
and uses the correct vocabulary, but gets specific facts wrong. Embedding \
geometry (groundlens) measures the shape of the answer; the classifier \
(HHEM) evaluates its content differently. When they disagree, it's worth \
checking the facts manually.

[Learn more about hallucination types →](https://docs.groundlens.dev/theory/hallucination-taxonomy/)"""

    return gl_md, hhem_md, agreement_md


# ─────────────────────────────────────────────────────────────────────────────
# EXAMPLES
# ─────────────────────────────────────────────────────────────────────────────

EXAMPLES = [
    [
        "What does the water damage policy cover?",
        "Coverage includes burst pipes and sudden appliance failure up to "
        "$50,000. Flood damage requires a separate NFIP policy. "
        "Deductible is $1,500 per occurrence.",
        "The policy covers burst pipes and sudden appliance failure up to "
        "$50,000 per occurrence, with a $1,500 deductible.",
    ],
    [
        "What does the water damage policy cover?",
        "Coverage includes burst pipes and sudden appliance failure up to "
        "$50,000. Flood damage requires a separate NFIP policy. "
        "Deductible is $1,500 per occurrence.",
        "The policy covers all water damage including floods "
        "with no deductible required.",
    ],
    [
        "What causes seasons on Earth?",
        "",
        "Seasons are caused by Earth's 23.5-degree axial tilt, which "
        "changes how directly sunlight hits each hemisphere.",
    ],
    [
        "What causes seasons on Earth?",
        "",
        "Seasons are regulated by the Atmospheric Regulation Committee, "
        "a UN body established in 1952 that adjusts global temperature "
        "through orbital satellites.",
    ],
]


# ─────────────────────────────────────────────────────────────────────────────
# THEME — dark, matching groundlens.dev
# ─────────────────────────────────────────────────────────────────────────────

# Brand-orange palette. NOTE(review): currently unused — the theme override
# below sets raw hex values directly. Kept for a future themed refactor.
_orange = gr.themes.Color(
    c50="#fff7ed",
    c100="#ffedd5",
    c200="#fed7aa",
    c300="#fdba74",
    c400="#fb923c",
    c500="#fc7604",
    c600="#ea580c",
    c700="#c2410c",
    c800="#9a3412",
    c900="#7c2d12",
    c950="#431407",
)

theme = gr.Theme.from_hub("Bruhn/CrimsonNight").set(
    # Override crimson red → groundlens orange
    button_primary_background_fill="#fc7604",
    button_primary_background_fill_dark="#fc7604",
    button_primary_background_fill_hover="#fb923c",
    button_primary_background_fill_hover_dark="#fb923c",
    button_primary_text_color="#0a0a0a",
    button_primary_text_color_dark="#0a0a0a",
    border_color_primary="#fc7604",
    border_color_primary_dark="#fc7604",
)


# ─────────────────────────────────────────────────────────────────────────────
# INTERFACE
# ─────────────────────────────────────────────────────────────────────────────

css = """
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
    padding: 1.5rem !important;
}
h1 {
    color: #fc7604 !important;
    font-size: 2.2rem !important;
    font-weight: 700 !important;
    margin-bottom: 0.2rem !important;
}
h3 { font-size: 1.15rem !important; }
.subtitle {
    color: #94a3b8 !important;
    font-size: 1.1rem !important;
    margin-top: 0 !important;
}
a { color: #fd9a42 !important; }
a:hover { color: #fec08a !important; }
.step-label {
    color: #fc7604;
    font-weight: 600;
    font-size: 1.05rem;
}
.links-bar {
    font-size: 0.9rem;
    color: #64748b;
    margin-top: 0.5rem;
}
.links-bar a { color: #64748b !important; }
.links-bar a:hover { color: #fd9a42 !important; }
footer { display: none !important; }
/* Upload button — small, dashed secondary style */
.upload-btn { margin-top: 0.25rem !important; }
.upload-btn button {
    background: transparent !important;
    border: 1px dashed #475569 !important;
    color: #94a3b8 !important;
    font-size: 0.85rem !important;
    padding: 0.4rem 1rem !important;
    border-radius: 6px !important;
}
.upload-btn button:hover {
    border-color: #fc7604 !important;
    color: #fc7604 !important;
}
.upload-status p {
    color: #94a3b8 !important;
    font-size: 0.85rem !important;
    margin: 0.25rem 0 0 0 !important;
    font-style: italic;
}
@media (max-width: 768px) {
    .gradio-container { padding: 0.75rem !important; }
    h1 { font-size: 1.6rem !important; }
}
"""

with gr.Blocks(
    title="groundlens — Check if your AI is hallucinating",
    theme=theme,
    css=css,
) as demo:
    # NOTE(review): the HTML wrappers below were reconstructed from the CSS
    # classes this file defines (.subtitle, .step-label) — the original tags
    # were lost in a text-extraction pass. Confirm against the deployed Space.
    gr.Markdown("""
# groundlens

<p class="subtitle">Check if an AI gave you a real answer or made something up.</p>
""")
    gr.Markdown("""
You asked an AI a question and got an answer. Was it real or hallucinated?
Paste both below and we'll check using two independent methods:
**groundlens** (geometric analysis) and **Vectara HHEM** (neural classifier).
""")
    gr.Markdown("")  # vertical spacer

    # ── Step 1: Question ──
    gr.Markdown('<p class="step-label">1. What did you ask the AI?</p>')
    q_in = gr.Textbox(
        show_label=False,
        placeholder="e.g. What does our insurance policy cover for water damage?",
        lines=2,
    )

    # ── Step 2: Context ──
    gr.Markdown(
        '<p class="step-label">2. Did you give the AI any source material? (optional)</p>'
    )
    gr.Markdown(
        "If you gave the AI a document, a webpage, an Excel file, or any reference "
        "material to base its answer on, paste the text below. "
        "If you just asked a question with no source, skip this step.",
    )
    ctx_in = gr.Textbox(
        show_label=False,
        placeholder="Paste the source text here, or use the upload button below to extract text from a file...",
        lines=5,
    )

    upload_status = gr.Markdown("", elem_classes=["upload-status"])
    upload_btn = gr.UploadButton(
        "📄 Upload a file (PDF, Excel, CSV, TXT)",
        file_types=[".pdf", ".xlsx", ".xls", ".csv", ".txt"],
        file_count="single",
        elem_classes=["upload-btn"],
    )

    def handle_upload(file, existing_text):
        """Extract file text and append to context textbox."""
        extracted = extract_file_to_text(file)
        if not extracted:
            return existing_text, ""
        # Error results from the extractors keep their bracketed prefixes.
        if extracted.startswith("[Could not") or extracted.startswith("[Unsupported"):
            return existing_text, f"⚠️ {extracted}"
        basename = os.path.basename(
            file.name if hasattr(file, "name") else str(file)
        )
        # Append to existing content rather than clobbering it.
        if existing_text and existing_text.strip():
            new_text = existing_text.strip() + "\n\n" + extracted
        else:
            new_text = extracted
        return new_text, f"✓ Extracted text from **{basename}**"

    upload_btn.upload(
        fn=handle_upload,
        inputs=[upload_btn, ctx_in],
        outputs=[ctx_in, upload_status],
    )

    # ── Step 3: Response ──
    gr.Markdown('<p class="step-label">3. What did the AI answer?</p>')
    r_in = gr.Textbox(
        show_label=False,
        placeholder="Paste the AI's response here...",
        lines=4,
    )

    # ── Evaluate button ──
    run_btn = gr.Button(
        "Check for hallucination",
        variant="primary",
        size="lg",
    )

    # ── Results ──
    with gr.Row(equal_height=True):
        gl_out = gr.Markdown()
        hhem_out = gr.Markdown()
    agreement_out = gr.Markdown()

    # ── Examples ──
    gr.Markdown("---")
    gr.Markdown("### Try an example")
    gr.Examples(
        examples=EXAMPLES,
        inputs=[q_in, ctx_in, r_in],
        label="",
    )

    # ── Footer ──
    # NOTE(review): any links/markup in the original footer were lost in
    # extraction; plain text preserved verbatim.
    gr.Markdown("""
---

groundlens is open source (MIT). Built by Javier Marin. This demo runs the
same library available via pip install groundlens.

groundlens is verification triage, not a truth oracle. It tells you which
answers deserve trust and which need a closer look.
""")

    # ── Event binding ──
    run_btn.click(
        fn=run_comparison,
        inputs=[q_in, ctx_in, r_in],
        outputs=[gl_out, hhem_out, agreement_out],
    )

if __name__ == "__main__":
    demo.launch()