Spaces:

groundlens
/

demo

Sleeping

App Files Files Community

demo / app.py

AI-that-works

Upload app.py

9f6ea26 verified 5 days ago

raw

history blame contribute delete

19.4 kB

	"""
	groundlens — Geometric LLM Hallucination Detection Demo

	Plain-language interface: paste a question and the AI's answer,
	optionally upload context (PDF, Excel, or plain text).
	Compares groundlens (embedding geometry) vs Vectara HHEM-2.1-Open.

	Models load once at module level to avoid cold-start on Space wake.
	"""

	import logging
	import time
	import os

	import gradio as gr
	from groundlens import compute_sgi, compute_dgi

	# Configure logging at INFO level and create this module's logger; the
	# model-loading steps further down report their progress through it.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)


	# ─────────────────────────────────────────────────────────────────────────────
	# FILE EXTRACTION — PDF and Excel support
	# ─────────────────────────────────────────────────────────────────────────────

	def extract_pdf_text(
	    file_path: str, max_chars: int = 8000, max_pages: int = 20
	) -> str:
	    """Extract plain text from the leading pages of a PDF.

	    Pages are read in order and joined with a blank line. Reading stops as
	    soon as ``max_chars`` characters have been collected, so pages whose
	    text would be cut off anyway are never parsed.

	    Args:
	        file_path: Path of the PDF on disk.
	        max_chars: Upper bound on the length of the returned text
	            (expected to be non-negative).
	        max_pages: Number of leading pages to consider.

	    Returns:
	        The extracted text, at most ``max_chars`` characters long, or a
	        bracketed ``"[Could not read PDF: ...]"`` message if anything
	        fails. The function does not raise.
	    """
	    separator = "\n\n"
	    try:
	        # Imported inside the try so a missing pdfplumber is reported
	        # through the same error string as an unreadable file.
	        import pdfplumber

	        text_parts = []
	        collected = 0  # length the joined text would have so far
	        with pdfplumber.open(file_path) as pdf:
	            for page in pdf.pages[:max_pages]:
	                page_text = page.extract_text()
	                if not page_text:
	                    continue
	                if text_parts:
	                    collected += len(separator)
	                text_parts.append(page_text)
	                collected += len(page_text)
	                if collected >= max_chars:
	                    # Everything past this point would be sliced away.
	                    break
	        return separator.join(text_parts)[:max_chars]
	    except Exception as e:
	        return f"[Could not read PDF: {e}]"


	def extract_excel_text(file_path: str, max_chars: int = 8000) -> str:
	    """Extract cell text from an Excel workbook.

	    Reads at most the first five sheets and the first 200 rows of each.
	    Every sheet contributes a ``--- <sheet name> ---`` header line followed
	    by one line per non-empty row, with cells joined by ``" | "``.

	    Args:
	        file_path: Path of the workbook on disk.
	        max_chars: Upper bound on the length of the returned text.

	    Returns:
	        The extracted text, at most ``max_chars`` characters long, or a
	        bracketed ``"[Could not read Excel file: ...]"`` message if
	        anything fails. The function does not raise.
	    """
	    try:
	        # Imported inside the try so a missing openpyxl is reported through
	        # the same error string as an unreadable file.
	        import openpyxl

	        # data_only=True requests stored cell values instead of formula
	        # text (see the openpyxl load_workbook documentation).
	        wb = openpyxl.load_workbook(file_path, data_only=True)
	        text_parts = []
	        for sheet_name in wb.sheetnames[:5]:
	            ws = wb[sheet_name]
	            text_parts.append(f"--- {sheet_name} ---")
	            for row in ws.iter_rows(max_row=200, values_only=True):
	                cells = ["" if c is None else str(c) for c in row]
	                # Skip rows that carry no text at all. The previous check
	                # compared a stripped line against an unstripped one and
	                # therefore never matched for rows with two or more cells.
	                if not any(cell.strip() for cell in cells):
	                    continue
	                text_parts.append(" | ".join(cells).strip())
	        return "\n".join(text_parts)[:max_chars]
	    except Exception as e:
	        return f"[Could not read Excel file: {e}]"


	def extract_file_to_text(file, max_chars: int = 8000) -> str:
	    """Extract text from an uploaded file and return it for the textbox.

	    The extractor is chosen by file extension: PDF, Excel (``.xlsx`` /
	    ``.xls``) or plain text (``.txt`` / ``.md`` / ``.csv``).

	    Args:
	        file: ``None``, an object with a ``name`` attribute holding the
	            path, or anything whose ``str()`` is the path.
	        max_chars: Upper bound on the amount of text taken from the file.

	    Returns:
	        ``""`` when ``file`` is ``None`` or nothing was extracted; a
	        bracketed ``"[Could not ...]"`` / ``"[Unsupported ...]"`` message on
	        failure; otherwise the text prefixed with an
	        ``"[Extracted from <basename>]"`` marker line and a blank line.
	    """
	    if file is None:
	        return ""

	    file_path = file.name if hasattr(file, "name") else str(file)
	    ext = os.path.splitext(file_path)[1].lower()
	    basename = os.path.basename(file_path)

	    if ext == ".pdf":
	        text = extract_pdf_text(file_path, max_chars)
	    elif ext in (".xlsx", ".xls"):
	        text = extract_excel_text(file_path, max_chars)
	    elif ext in (".txt", ".md", ".csv"):
	        try:
	            # errors="replace" keeps undecodable bytes from raising.
	            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
	                text = f.read(max_chars)
	        except Exception as e:
	            text = f"[Could not read file: {e}]"
	    else:
	        text = f"[Unsupported file type: {ext}. Use PDF, Excel, TXT, or CSV.]"

	    # Failures are reported in-band, so recognise them by the exact
	    # prefixes the extractors emit. Testing only for a leading "[" would
	    # also swallow real content that starts with a bracket (JSON arrays,
	    # markdown links). A document that literally begins with one of these
	    # prefixes is still indistinguishable from an error.
	    if not text or text.startswith(("[Could not", "[Unsupported")):
	        return text
	    return f"[Extracted from {basename}]\n\n{text}"


	# ─────────────────────────────────────────────────────────────────────────────
	# MODEL LOADING — HHEM-2.1-Open baseline and groundlens warm-up
	# ─────────────────────────────────────────────────────────────────────────────

	# Load the HHEM classifier once, at import time, and keep it in the
	# module-level `_hhem` that score_hhem() calls for every request.
	logger.info("Loading HHEM-2.1-Open...")
	from transformers import AutoModelForSequenceClassification

	_hhem = AutoModelForSequenceClassification.from_pretrained(
	"vectara/hallucination_evaluation_model",
	# NOTE(review): trust_remote_code=True lets transformers run Python code
	# shipped in the model repository; consider pinning `revision=` to a
	# reviewed commit — confirm against the transformers documentation.
	trust_remote_code=True,
	)
	logger.info("HHEM loaded.")

	# Warm up groundlens with one throwaway DGI call at import time
	# (presumably this triggers the embedding model load so the first real
	# request does not pay for it — confirm against the groundlens docs).
	logger.info("Warming up groundlens...")
	compute_dgi(question="warmup", response="warmup")
	logger.info("groundlens ready.")


	# ─────────────────────────────────────────────────────────────────────────────
	# SCORING
	# ─────────────────────────────────────────────────────────────────────────────

	def score_groundlens(question: str, response: str, context: str) -> dict:
	    """Score a response with groundlens and time the call.

	    Uses ``compute_sgi`` when ``context`` contains non-whitespace text and
	    ``compute_dgi`` otherwise.

	    Returns:
	        A dict with ``method``, ``raw_score`` (rounded to 4 places),
	        ``grounded`` (the negation of the result's ``flagged``),
	        ``threshold`` (a per-mode constant set in this function, not read
	        from the result), ``elapsed_ms`` and ``mode_note``.
	    """
	    t0 = time.perf_counter()

	    if context.strip():
	        outcome = compute_sgi(
	            question=question,
	            context=context,
	            response=response,
	        )
	        method, threshold = "SGI (with context)", 0.95
	        mode_note = (
	            "Measured how much the AI's answer used your source document "
	            "vs. just rephrasing the question."
	        )
	    else:
	        outcome = compute_dgi(question=question, response=response)
	        method, threshold = "DGI (without context)", 0.30
	        mode_note = (
	            "Measured whether the AI's answer follows patterns typical "
	            "of grounded, factual responses."
	        )

	    # Both result objects are read the same way.
	    score = outcome.value
	    is_grounded = not outcome.flagged

	    duration_ms = (time.perf_counter() - t0) * 1000

	    return {
	        "method": method,
	        "raw_score": round(score, 4),
	        "grounded": is_grounded,
	        "threshold": threshold,
	        "elapsed_ms": round(duration_ms, 1),
	        "mode_note": mode_note,
	    }


	_HHEM_MAX_PREMISE_CHARS = 1800


	def _build_hhem_premise(
	    question: str, context: str, max_chars: int = _HHEM_MAX_PREMISE_CHARS
	) -> str:
	    """Build the HHEM premise, trimming the context so the question survives."""
	    context = context.strip()
	    if not context:
	        return question[:max_chars]

	    premise = f"{context}\n\n{question}".strip()
	    if len(premise) <= max_chars:
	        return premise

	    # The question sits at the end of the premise, so cutting the combined
	    # string at max_chars would drop it whenever the context is long. Trim
	    # the context instead and keep the question intact.
	    suffix = f"\n\n{question}".rstrip()
	    room = max_chars - len(suffix)
	    if room <= 0:
	        # The question alone exceeds the budget; fall back to a plain cut.
	        return premise[:max_chars]
	    return context[:room].rstrip() + suffix


	def score_hhem(question: str, response: str, context: str) -> dict:
	    """Score a response with the HHEM classifier and time the call.

	    The premise is the context followed by the question (or the question
	    alone when there is no context), limited to 1800 characters. It is
	    passed to ``_hhem.predict`` as a single ``(premise, response)`` pair.

	    Returns:
	        A dict with ``method``, ``raw_score`` (rounded to 4 places),
	        ``grounded`` (``raw_score >= 0.5``), ``elapsed_ms`` and ``label``
	        (``"consistent"`` or ``"hallucinated"``).
	    """
	    premise = _build_hhem_premise(question, context)

	    start = time.perf_counter()
	    scores = _hhem.predict([(premise, response)])
	    raw_score = float(scores[0])
	    elapsed_ms = (time.perf_counter() - start) * 1000

	    consistent = raw_score >= 0.5
	    return {
	        "method": "HHEM-2.1-Open",
	        "raw_score": round(raw_score, 4),
	        "grounded": consistent,
	        "elapsed_ms": round(elapsed_ms, 1),
	        "label": "consistent" if consistent else "hallucinated",
	    }


	# ─────────────────────────────────────────────────────────────────────────────
	# MAIN COMPARISON — takes only text inputs (uploaded files are converted to text first)
	# ─────────────────────────────────────────────────────────────────────────────

	def run_comparison(
	    question: str, context_text: str, response: str
	) -> tuple[str, str, str]:
	    """Score one question/answer pair with both methods and render markdown.

	    Args:
	        question: The question that was asked of the AI.
	        context_text: Optional source text; may contain
	            ``[Extracted from <file>]`` marker lines added on upload.
	        response: The AI's answer.

	    Returns:
	        ``(groundlens_markdown, hhem_markdown, agreement_markdown)``. When
	        the question or the response is blank, the first element is a
	        warning and the other two are empty strings.
	    """
	    if not question.strip():
	        return "⚠️ Enter the question you asked the AI.", "", ""
	    if not response.strip():
	        return "⚠️ Enter the AI's response.", "", ""

	    # Drop every "[Extracted from ...]" marker line, not only a leading one:
	    # uploads appended after pasted text (or after an earlier upload) place
	    # the marker mid-text, where it would otherwise be scored as context.
	    kept_lines = []
	    for line in context_text.splitlines():
	        stripped = line.strip()
	        if stripped.startswith("[Extracted from ") and stripped.endswith("]"):
	            continue
	        kept_lines.append(line)
	    context = "\n".join(kept_lines).strip()

	    gl = score_groundlens(question, response, context)
	    hhem = score_hhem(question, response, context)

	    # groundlens result
	    if gl["grounded"]:
	        gl_verdict = "🟢 Looks grounded"
	        gl_explain = "The AI's answer appears to be based on real information."
	    else:
	        gl_verdict = "🔴 Possible hallucination"
	        gl_explain = (
	            "The AI's answer shows signs of being fabricated or not "
	            "grounded in the source."
	        )

	    # The markdown is assembled from explicit lines so its content does not
	    # depend on how this function happens to be indented in the source.
	    gl_md = "\n".join([
	        "### groundlens",
	        "",
	        gl_verdict,
	        "",
	        gl_explain,
	        "",
	        "| | |",
	        "|---|---|",
	        f"| Method | {gl['method']} |",
	        f"| Score | {gl['raw_score']} (threshold: {gl['threshold']}) |",
	        f"| Time | {gl['elapsed_ms']} ms |",
	        "",
	        gl["mode_note"],
	    ])

	    # HHEM result
	    if hhem["grounded"]:
	        hhem_verdict = "🟢 Looks consistent"
	        hhem_explain = "The classifier considers this answer consistent with the input."
	    else:
	        hhem_verdict = "🔴 Possible hallucination"
	        hhem_explain = "The classifier flagged this answer as potentially hallucinated."

	    hhem_md = "\n".join([
	        "### Vectara HHEM-2.1-Open",
	        "",
	        hhem_verdict,
	        "",
	        hhem_explain,
	        "",
	        "| | |",
	        "|---|---|",
	        f"| Method | {hhem['method']} |",
	        f"| Score | {hhem['raw_score']} ({hhem['label']}) |",
	        f"| Time | {hhem['elapsed_ms']} ms |",
	        "",
	        "Fine-tuned flan-T5 classifier.",
	    ])

	    # Agreement
	    if gl["grounded"] != hhem["grounded"]:
	        agreement_md = "\n".join([
	            "### 🟠 The two methods disagree.",
	            "",
	            "This often happens with subtle factual errors — the answer sounds right and",
	            "uses the correct vocabulary, but gets specific facts wrong. Embedding geometry",
	            "(groundlens) measures the shape of the answer; the classifier (HHEM) evaluates",
	            "its content differently. When they disagree, it's worth checking the facts manually.",
	            "",
	            "[Learn more about hallucination types →](https://docs.groundlens.dev/theory/hallucination-taxonomy/)",
	        ])
	    elif gl["grounded"]:
	        agreement_md = "### 🔵 Both methods agree: the answer looks reliable."
	    else:
	        agreement_md = "### 🔴 Both methods agree: this answer is likely hallucinated."

	    return gl_md, hhem_md, agreement_md


	# ─────────────────────────────────────────────────────────────────────────────
	# EXAMPLES
	# ─────────────────────────────────────────────────────────────────────────────

	# Each example row is [question, source context, AI response]; the first
	# two rows share a question and context, the last two have no context.
	_POLICY_QUESTION = "What does the water damage policy cover?"
	_POLICY_CONTEXT = (
	    "Coverage includes burst pipes and sudden appliance failure up to "
	    "$50,000. Flood damage requires a separate NFIP policy. "
	    "Deductible is $1,500 per occurrence."
	)
	_SEASONS_QUESTION = "What causes seasons on Earth?"
	_NO_CONTEXT = ""

	EXAMPLES = [
	    # With context: answer restates the source.
	    [
	        _POLICY_QUESTION,
	        _POLICY_CONTEXT,
	        "The policy covers burst pipes and sudden appliance failure up to "
	        "$50,000 per occurrence, with a $1,500 deductible.",
	    ],
	    # With context: answer contradicts the source on floods and deductible.
	    [
	        _POLICY_QUESTION,
	        _POLICY_CONTEXT,
	        "The policy covers all water damage including floods "
	        "with no deductible required.",
	    ],
	    # Without context: axial-tilt explanation.
	    [
	        _SEASONS_QUESTION,
	        _NO_CONTEXT,
	        "Seasons are caused by Earth's 23.5-degree axial tilt, which "
	        "changes how directly sunlight hits each hemisphere.",
	    ],
	    # Without context: invented committee.
	    [
	        _SEASONS_QUESTION,
	        _NO_CONTEXT,
	        "Seasons are regulated by the Atmospheric Regulation Committee, "
	        "a UN body established in 1952 that adjusts global temperature "
	        "through orbital satellites.",
	    ],
	]


	# ─────────────────────────────────────────────────────────────────────────────
	# THEME — dark, matching groundlens.dev
	# ─────────────────────────────────────────────────────────────────────────────

	# groundlens orange palette; c500 is the brand accent, c400 its hover tint.
	_orange = gr.themes.Color(
	    c50="#fff7ed",
	    c100="#ffedd5",
	    c200="#fed7aa",
	    c300="#fdba74",
	    c400="#fb923c",
	    c500="#fc7604",
	    c600="#ea580c",
	    c700="#c2410c",
	    c800="#9a3412",
	    c900="#7c2d12",
	    c950="#431407",
	)

	_button_text = "#0a0a0a"

	# Override crimson red → groundlens orange. Each setting is applied to both
	# the light and the dark (`*_dark`) variant with the same value.
	_accent_overrides = {
	    "button_primary_background_fill": _orange.c500,
	    "button_primary_background_fill_dark": _orange.c500,
	    "button_primary_background_fill_hover": _orange.c400,
	    "button_primary_background_fill_hover_dark": _orange.c400,
	    "button_primary_text_color": _button_text,
	    "button_primary_text_color_dark": _button_text,
	    "border_color_primary": _orange.c500,
	    "border_color_primary_dark": _orange.c500,
	}

	theme = gr.Theme.from_hub("Bruhn/CrimsonNight").set(**_accent_overrides)


	# ─────────────────────────────────────────────────────────────────────────────
	# INTERFACE
	# ─────────────────────────────────────────────────────────────────────────────

	# Custom stylesheet passed to gr.Blocks(css=...). It sets the page width,
	# heading and link colours, the step labels, the dashed upload button, the
	# upload status line, hides the footer element, and tightens padding on
	# narrow screens.
	css = """
	.gradio-container {
	max-width: 1200px !important;
	margin: 0 auto !important;
	padding: 1.5rem !important;
	}
	h1 { color: #fc7604 !important; font-size: 2.2rem !important; font-weight: 700 !important; margin-bottom: 0.2rem !important; }
	h3 { font-size: 1.15rem !important; }
	.subtitle { color: #94a3b8 !important; font-size: 1.1rem !important; margin-top: 0 !important; }
	a { color: #fd9a42 !important; }
	a:hover { color: #fec08a !important; }
	.step-label { color: #fc7604; font-weight: 600; font-size: 1.05rem; }
	.links-bar { font-size: 0.9rem; color: #64748b; margin-top: 0.5rem; }
	.links-bar a { color: #64748b !important; }
	.links-bar a:hover { color: #fd9a42 !important; }
	footer { display: none !important; }

	/* Upload button — small, dashed secondary style */
	.upload-btn { margin-top: 0.25rem !important; }
	.upload-btn button {
	background: transparent !important;
	border: 1px dashed #475569 !important;
	color: #94a3b8 !important;
	font-size: 0.85rem !important;
	padding: 0.4rem 1rem !important;
	border-radius: 6px !important;
	}
	.upload-btn button:hover {
	border-color: #fc7604 !important;
	color: #fc7604 !important;
	}
	.upload-status p {
	color: #94a3b8 !important;
	font-size: 0.85rem !important;
	margin: 0.25rem 0 0 0 !important;
	font-style: italic;
	}
	@media (max-width: 768px) {
	.gradio-container { padding: 0.75rem !important; }
	h1 { font-size: 1.6rem !important; }
	}
	"""

	# Build the page. Components are created in the order they should appear,
	# so the sequence of statements below is the layout.
	with gr.Blocks(
	title="groundlens — Check if your AI is hallucinating",
	theme=theme,
	css=css,
	) as demo:

	# Title and subtitle (styled through the `h1` / `.subtitle` rules in `css`).
	gr.Markdown("""
	# groundlens

	<p class="subtitle">Check if an AI gave you a real answer or made something up.</p>
	""")

	# Short explanation of what the demo does.
	gr.Markdown("""
	You asked an AI a question and got an answer. Was it real or hallucinated?
	Paste both below and we'll check using two independent methods: groundlens
	(geometric analysis) and Vectara HHEM (neural classifier).
	""")

	# Project and paper links.
	gr.Markdown("""<p class="links-bar">
	<a href="https://github.com/groundlens-dev/groundlens">GitHub</a> ·
	<a href="https://docs.groundlens.dev">Docs</a> ·
	<a href="https://pypi.org/project/groundlens/">PyPI</a> ·
	<a href="https://arxiv.org/abs/2512.13771">SGI paper</a> ·
	<a href="https://arxiv.org/pdf/2602.13224v3">Taxonomy</a> ·
	<a href="https://arxiv.org/abs/2603.13259">Mechanistic paper</a>
	</p>""")

	# ── Step 1: Question ──
	gr.Markdown('<p class="step-label">1. What did you ask the AI?</p>')
	q_in = gr.Textbox(
	show_label=False,
	placeholder="e.g. What does our insurance policy cover for water damage?",
	lines=2,
	)

	# ── Step 2: Context ──
	gr.Markdown(
	'<p class="step-label">2. Did you give the AI any source material? (optional)</p>'
	)
	gr.Markdown(
	"If you gave the AI a document, a webpage, an Excel file, or any reference "
	"material to base its answer on, paste the text below. "
	"If you just asked a question with no source, skip this step.",
	)

	# Context textbox: filled by pasting or by the upload handler.
	ctx_in = gr.Textbox(
	show_label=False,
	placeholder="Paste the source text here, or use the upload button below to extract text from a file...",
	lines=5,
	)

	# Hidden file input + visible upload button
	# NOTE(review): `file_in` is not used as an input or output of any event
	# bound in this file; uploads go through `upload_btn`. Confirm whether the
	# hidden component is still needed.
	file_in = gr.File(
	file_types=[".pdf", ".xlsx", ".xls", ".csv", ".txt"],
	file_count="single",
	visible=False,
	)
	# Status line updated by the upload handler (success or warning message).
	upload_status = gr.Markdown("", elem_classes=["upload-status"])

	upload_btn = gr.UploadButton(
	"📄 Upload a file (PDF, Excel, CSV, TXT)",
	file_types=[".pdf", ".xlsx", ".xls", ".csv", ".txt"],
	file_count="single",
	elem_classes=["upload-btn"],
	)

	def handle_upload(file, existing_text):
	    """Extract text from an uploaded file and merge it into the context box.

	    Returns a ``(context_text, status_message)`` pair. When nothing was
	    extracted, or extraction reported an error, the existing context is
	    returned unchanged; otherwise the extracted text is appended to the
	    existing context (or becomes the context when the box is blank).
	    """
	    extracted = extract_file_to_text(file)

	    # Nothing to add: keep the textbox as it is and clear the status line.
	    if not extracted:
	        return existing_text, ""

	    # In-band error message from the extractor: surface it as a warning.
	    if extracted.startswith(("[Could not", "[Unsupported")):
	        return existing_text, f"⚠️ {extracted}"

	    source_path = file.name if hasattr(file, 'name') else str(file)
	    status = f"✓ Extracted text from {os.path.basename(source_path)}"

	    # Append after existing content, separated by a blank line.
	    has_existing = bool(existing_text and existing_text.strip())
	    merged = f"{existing_text.strip()}\n\n{extracted}" if has_existing else extracted
	    return merged, status

	# On upload, run handle_upload with the uploaded file and the current
	# context text; it returns the new context text and a status message.
	upload_btn.upload(
	fn=handle_upload,
	inputs=[upload_btn, ctx_in],
	outputs=[ctx_in, upload_status],
	)

	# ── Step 3: Response ──
	gr.Markdown('<p class="step-label">3. What did the AI answer?</p>')
	r_in = gr.Textbox(
	show_label=False,
	placeholder="Paste the AI's response here...",
	lines=4,
	)

	# ── Evaluate button ──
	run_btn = gr.Button(
	"Check for hallucination",
	variant="primary",
	size="lg",
	)

	# ── Results ──
	# groundlens and HHEM results side by side, agreement summary underneath.
	with gr.Row(equal_height=True):
	gl_out = gr.Markdown()
	hhem_out = gr.Markdown()

	agreement_out = gr.Markdown()

	# ── Examples ──
	gr.Markdown("---")
	gr.Markdown("### Try an example")

	# Each EXAMPLES row fills the question, context and response boxes in
	# that order.
	gr.Examples(
	examples=EXAMPLES,
	inputs=[q_in, ctx_in, r_in],
	label="",
	)

	# ── Footer ──
	gr.Markdown("""
	---

	<p style="color:#475569; font-size:0.85rem; text-align:center;">
	<strong>groundlens</strong> is open source (MIT). Built by
	<a href="https://jmarin.info" style="color:#64748b !important;">Javier Marin</a>.
	This demo runs the same library available via <code>pip install groundlens</code>.<br>
	groundlens is verification triage, not a truth oracle. It tells you which answers
	deserve trust and which need a closer look.
	</p>
	""")

	# ── Event binding ──
	# The three markdown strings returned by run_comparison go to the
	# groundlens panel, the HHEM panel and the agreement summary.
	run_btn.click(
	fn=run_comparison,
	inputs=[q_in, ctx_in, r_in],
	outputs=[gl_out, hhem_out, agreement_out],
	)


	# Start the Gradio server only when the file is run as a script.
	if __name__ == "__main__":
	demo.launch()