Spaces:

groundlens
/

demo

Sleeping

App Files Files Community

AI-that-works committed 7 days ago

Commit

c873959

verified ·

1 Parent(s): 09c0851

Upload app.py

Browse files

Files changed (1) hide show

app.py +543 -0

app.py ADDED Viewed

	@@ -0,0 +1,543 @@

+"""
+groundlens — Geometric LLM Hallucination Detection Demo
+Plain-language interface: paste a question and the AI's answer,
+optionally upload context (PDF, Excel, or plain text).
+Compares groundlens (embedding geometry) vs Vectara HHEM-2.1-Open.
+Models load once at module level to avoid cold-start on Space wake.
+"""
+import logging
+import time
+import tempfile
+import os
+import gradio as gr
+from groundlens import compute_sgi, compute_dgi
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# ─────────────────────────────────────────────────────────────────────────────
+# FILE EXTRACTION — PDF and Excel support
+# ─────────────────────────────────────────────────────────────────────────────
def extract_pdf_text(
    file_path: str, max_chars: int = 8000, max_pages: int = 20
) -> str:
    """Extract plain text from the leading pages of a PDF file.

    Args:
        file_path: Path of the PDF to read.
        max_chars: Maximum number of characters to return.
        max_pages: Maximum number of pages to read, counted from the first.

    Returns:
        The text of the pages that yielded any, joined by blank lines and
        cut to ``max_chars`` characters. On any failure (``pdfplumber`` not
        importable, unreadable file, ...) a bracketed
        ``"[Could not read PDF: ...]"`` message is returned instead of
        raising.
    """
    try:
        # Imported inside the function so that a missing pdfplumber is
        # reported through the error message below rather than at import.
        import pdfplumber

        separator = "\n\n"
        text_parts = []
        collected = 0
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages[:max_pages]:
                page_text = page.extract_text()
                if not page_text:
                    continue
                if text_parts:
                    collected += len(separator)
                text_parts.append(page_text)
                collected += len(page_text)
                # Anything extracted past the cap would be sliced off below,
                # so stop paying for further page extraction.
                if 0 <= max_chars <= collected:
                    break
        return separator.join(text_parts)[:max_chars]
    except Exception as e:
        return f"[Could not read PDF: {e}]"
def extract_excel_text(
    file_path: str,
    max_chars: int = 8000,
    max_sheets: int = 5,
    max_rows: int = 200,
) -> str:
    """Extract cell values from an Excel workbook as pipe-separated text.

    Each sheet contributes a ``--- <sheet name> ---`` header line followed by
    one line per non-empty row, with cell values joined by ``" | "``.

    Args:
        file_path: Path of the workbook to read.
        max_chars: Maximum number of characters to return.
        max_sheets: Maximum number of sheets to read, in workbook order.
        max_rows: Maximum number of rows to read from each sheet.

    Returns:
        The extracted text cut to ``max_chars`` characters. On any failure
        (``openpyxl`` not importable, unreadable file, ...) a bracketed
        ``"[Could not read Excel file: ...]"`` message is returned instead
        of raising.
    """
    try:
        # Imported inside the function so that a missing openpyxl is reported
        # through the error message below rather than at import.
        import openpyxl

        wb = openpyxl.load_workbook(file_path, data_only=True)
        text_parts = []
        for sheet_name in wb.sheetnames[:max_sheets]:
            ws = wb[sheet_name]
            text_parts.append(f"--- {sheet_name} ---")
            for row in ws.iter_rows(max_row=max_rows, values_only=True):
                cells = [str(c) if c is not None else "" for c in row]
                # Drop rows without any content. The previous check compared
                # the *stripped* joined line with an unstripped pattern, so it
                # never matched for rows of two or more columns and empty rows
                # leaked into the output as lines such as "|  |".
                if not any(cell.strip() for cell in cells):
                    continue
                text_parts.append(" | ".join(cells).strip())
        return "\n".join(text_parts)[:max_chars]
    except Exception as e:
        return f"[Could not read Excel file: {e}]"
def process_uploaded_file(file, max_chars: int = 8000) -> str:
    """Extract text from an uploaded file, dispatching on its extension.

    Args:
        file: ``None``, an object exposing the file path as ``.name``, or
            anything whose ``str()`` is the file path.
        max_chars: Maximum number of characters to return.

    Returns:
        ``""`` when ``file`` is ``None``; otherwise the extracted text, or a
        bracketed message (``"[Could not read ...]"`` /
        ``"[Unsupported file type: ...]"``) when the file cannot be handled.
    """
    if file is None:
        return ""
    file_path = file.name if hasattr(file, "name") else str(file)
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        return extract_pdf_text(file_path, max_chars)
    if ext in (".xlsx", ".xls"):
        return extract_excel_text(file_path, max_chars)
    if ext in (".txt", ".md", ".csv"):
        try:
            # "utf-8-sig" reads plain UTF-8 as well, but also drops a leading
            # byte-order mark so it does not end up inside the extracted text.
            with open(
                file_path, "r", encoding="utf-8-sig", errors="replace"
            ) as f:
                return f.read(max_chars)
        except (OSError, ValueError) as e:
            return f"[Could not read file: {e}]"
    return f"[Unsupported file type: {ext}. Use PDF, Excel, TXT, or CSV.]"
+# ─────────────────────────────────────────────────────────────────────────────
+# HHEM-2.1-Open — baseline comparison
+# ─────────────────────────────────────────────────────────────────────────────
+# Load the HHEM classifier once, at import time; the resulting `_hhem`
+# object is the one score_hhem() calls `.predict()` on.
+logger.info("Loading HHEM-2.1-Open...")
+from transformers import AutoModelForSequenceClassification
+# NOTE(review): trust_remote_code=True allows Python code shipped in the model
+# repository to run locally — presumably required by this model; confirm, and
+# consider pinning a `revision`.
+_hhem = AutoModelForSequenceClassification.from_pretrained(
+    "vectara/hallucination_evaluation_model",
+    trust_remote_code=True,
+)
+logger.info("HHEM loaded.")
+# Warm up groundlens embedding model: run one throwaway DGI call at import
+# time (the result is discarded).
+logger.info("Warming up groundlens...")
+compute_dgi(question="warmup", response="warmup")
+logger.info("groundlens ready.")
+# ─────────────────────────────────────────────────────────────────────────────
+# SCORING
+# ─────────────────────────────────────────────────────────────────────────────
def score_groundlens(question: str, response: str, context: str) -> dict:
    """Score a response with groundlens and time the call.

    Uses ``compute_sgi`` when ``context`` contains non-whitespace text and
    ``compute_dgi`` otherwise.

    Args:
        question: The question that was put to the AI.
        response: The AI's answer to evaluate.
        context: Source material given to the AI; may be empty.

    Returns:
        A dict with the keys ``method``, ``raw_score`` (rounded to 4
        decimals), ``grounded``, ``threshold``, ``elapsed_ms`` (rounded to
        1 decimal) and ``mode_note``. ``grounded`` is the negation of the
        result's ``flagged`` attribute; ``threshold`` is a fixed number
        reported alongside the score and is not used for that decision here.
    """
    started = time.perf_counter()
    if context.strip():
        outcome = compute_sgi(
            question=question, context=context, response=response
        )
        method, threshold = "SGI (with context)", 0.95
        mode_note = (
            "Measured how much the AI's answer used your source document "
            "vs. just rephrasing the question."
        )
    else:
        outcome = compute_dgi(question=question, response=response)
        method, threshold = "DGI (without context)", 0.30
        mode_note = (
            "Measured whether the AI's answer follows patterns typical "
            "of grounded, factual responses."
        )
    score = outcome.value
    is_grounded = not outcome.flagged
    elapsed_ms = (time.perf_counter() - started) * 1000

    report = {}
    report["method"] = method
    report["raw_score"] = round(score, 4)
    report["grounded"] = is_grounded
    report["threshold"] = threshold
    report["elapsed_ms"] = round(elapsed_ms, 1)
    report["mode_note"] = mode_note
    return report
def score_hhem(question: str, response: str, context: str) -> dict:
    """Score a response with the module-level HHEM model and time the call.

    The premise passed to the model is the stripped context followed by a
    blank line and the question, or the question alone when the context is
    empty. The premise is capped at 1800 characters.

    Args:
        question: The question that was put to the AI.
        response: The AI's answer to evaluate.
        context: Source material given to the AI; may be empty.

    Returns:
        A dict with the keys ``method``, ``raw_score`` (rounded to 4
        decimals), ``grounded`` (score >= 0.5), ``elapsed_ms`` (rounded to
        1 decimal) and ``label`` (``"consistent"`` or ``"hallucinated"``).
    """
    max_premise_chars = 1800
    consistent_cutoff = 0.5

    source = context.strip()
    if source:
        # The joined premise used to be cut from the end, so any context
        # longer than the cap pushed the question out of the premise
        # entirely. Trim the context instead and keep the question.
        question_part = f"\n\n{question}".rstrip()
        room = max(max_premise_chars - len(question_part), 0)
        premise = f"{source[:room]}{question_part}".strip()
    else:
        premise = question
    premise = premise[:max_premise_chars]

    # Only the model call and score conversion are timed.
    start = time.perf_counter()
    scores = _hhem.predict([(premise, response)])
    raw_score = float(scores[0])
    elapsed_ms = (time.perf_counter() - start) * 1000

    is_consistent = raw_score >= consistent_cutoff
    return {
        "method": "HHEM-2.1-Open",
        "raw_score": round(raw_score, 4),
        "grounded": is_consistent,
        "elapsed_ms": round(elapsed_ms, 1),
        "label": "consistent" if is_consistent else "hallucinated",
    }
+# ─────────────────────────────────────────────────────────────────────────────
+# MAIN COMPARISON
+# ─────────────────────────────────────────────────────────────────────────────
# Prefixes of the bracketed failure messages produced by the file extractors.
# Matching these, rather than any leading "[", keeps legitimate file content
# that happens to start with a bracket from being reported as an error.
_EXTRACTION_ERROR_PREFIXES = ("[Could not read", "[Unsupported file type:")


def _format_groundlens(gl: dict) -> str:
    """Render the groundlens score dict as a Markdown result card."""
    if gl["grounded"]:
        verdict = "🟢 Looks grounded"
        explanation = "The AI's answer appears to be based on real information."
    else:
        verdict = "🔴 Possible hallucination"
        explanation = "The AI's answer shows signs of being fabricated or not grounded in the source."
    # Blank lines around the table keep the preceding paragraph and the
    # trailing note from being parsed as part of it.
    return f"""### groundlens
**{verdict}**
{explanation}

| | |
|---|---|
| **Method** | {gl["method"]} |
| **Score** | {gl["raw_score"]} (threshold: {gl["threshold"]}) |
| **Time** | {gl["elapsed_ms"]} ms |

*{gl["mode_note"]}*"""


def _format_hhem(hhem: dict) -> str:
    """Render the HHEM score dict as a Markdown result card."""
    if hhem["grounded"]:
        verdict = "🟢 Looks consistent"
        explanation = "The classifier considers this answer consistent with the input."
    else:
        verdict = "🔴 Possible hallucination"
        explanation = "The classifier flagged this answer as potentially hallucinated."
    return f"""### Vectara HHEM-2.1-Open
**{verdict}**
{explanation}

| | |
|---|---|
| **Method** | {hhem["method"]} |
| **Score** | {hhem["raw_score"]} ({hhem["label"]}) |
| **Time** | {hhem["elapsed_ms"]} ms |

*Fine-tuned flan-T5 classifier.*"""


def _format_agreement(gl_grounded: bool, hhem_grounded: bool) -> str:
    """Render the Markdown summary of whether the two verdicts match."""
    if gl_grounded != hhem_grounded:
        return """### 🟠 The two methods disagree.
This often happens with **subtle factual errors** — the answer sounds right and
uses the correct vocabulary, but gets specific facts wrong. Embedding geometry
(groundlens) measures the shape of the answer; the classifier (HHEM) evaluates
its content differently. When they disagree, it's worth checking the facts manually.
[Learn more about hallucination types →](https://docs.groundlens.dev/theory/hallucination-taxonomy/)"""
    if gl_grounded:
        return "### 🔵 Both methods agree: the answer looks reliable."
    return "### 🔴 Both methods agree: this answer is likely hallucinated."


def run_comparison(
    question: str, context_text: str, file_upload, response: str
) -> tuple[str, str, str]:
    """Score one question/answer pair with both methods and render the results.

    Args:
        question: The question that was put to the AI.
        context_text: Source text typed or pasted by the user; may be empty.
        file_upload: Optional uploaded file handed to
            ``process_uploaded_file``; ``None`` when nothing was uploaded.
        response: The AI's answer to evaluate.

    Returns:
        Three Markdown strings: the groundlens card, the HHEM card and the
        agreement summary. When input is missing or the uploaded file cannot
        be read, the first string is a warning and the other two are empty.
    """
    # Guard against None as well as blank strings, matching the check already
    # applied to context_text below.
    if not question or not question.strip():
        return "⚠️ Enter the question you asked the AI.", "", ""
    if not response or not response.strip():
        return "⚠️ Enter the AI's response.", "", ""

    # Merge context: typed text + uploaded file
    context_parts = []
    if context_text and context_text.strip():
        context_parts.append(context_text.strip())
    if file_upload is not None:
        extracted = process_uploaded_file(file_upload)
        if extracted.startswith(_EXTRACTION_ERROR_PREFIXES):
            return f"⚠️ {extracted}", "", ""
        if extracted:
            context_parts.append(extracted)
    context = "\n\n".join(context_parts)

    gl = score_groundlens(question, response, context)
    hhem = score_hhem(question, response, context)
    return (
        _format_groundlens(gl),
        _format_hhem(hhem),
        _format_agreement(gl["grounded"], hhem["grounded"]),
    )
+# ─────────────────────────────────────────────────────────────────────────────
+# EXAMPLES
+# ─────────────────────────────────────────────────────────────────────────────
+# Rows for the "Try an example" widget. Each row is
+# [question, context, response], the same order as the
+# `inputs=[q_in, ctx_in, r_in]` list passed to gr.Examples.
+# Rows 1-2 share a question and context and differ only in the response;
+# rows 3-4 have an empty context.
+EXAMPLES = [
+    [
+        "What does the water damage policy cover?",
+        "Coverage includes burst pipes and sudden appliance failure up to "
+        "$50,000. Flood damage requires a separate NFIP policy. "
+        "Deductible is $1,500 per occurrence.",
+        "The policy covers burst pipes and sudden appliance failure up to "
+        "$50,000 per occurrence, with a $1,500 deductible.",
+    ],
+    [
+        "What does the water damage policy cover?",
+        "Coverage includes burst pipes and sudden appliance failure up to "
+        "$50,000. Flood damage requires a separate NFIP policy. "
+        "Deductible is $1,500 per occurrence.",
+        "The policy covers all water damage including floods "
+        "with no deductible required.",
+    ],
+    [
+        "What causes seasons on Earth?",
+        "",
+        "Seasons are caused by Earth's 23.5-degree axial tilt, which "
+        "changes how directly sunlight hits each hemisphere.",
+    ],
+    [
+        "What causes seasons on Earth?",
+        "",
+        "Seasons are regulated by the Atmospheric Regulation Committee, "
+        "a UN body established in 1952 that adjusts global temperature "
+        "through orbital satellites.",
+    ],
+]
+# ─────────────────────────────────────────────────────────────────────────────
+# THEME — dark, matching groundlens.dev
+# ─────────────────────────────────────────────────────────────────────────────
+# Gradio theme: gr.themes.Base with a custom orange primary palette and slate
+# secondary/neutral hues. The .set() overrides give every colour the same
+# value for the plain and the `_dark` variable.
+theme = gr.themes.Base(
+    primary_hue=gr.themes.Color(
+        c50="#fff7ed",
+        c100="#ffedd5",
+        c200="#fed7aa",
+        c300="#fdba74",
+        c400="#fb923c",
+        c500="#fc7604",
+        c600="#ea580c",
+        c700="#c2410c",
+        c800="#9a3412",
+        c900="#7c2d12",
+        c950="#431407",
+    ),
+    secondary_hue="slate",
+    neutral_hue="slate",
+    font=gr.themes.GoogleFont("Inter"),
+    font_mono=gr.themes.GoogleFont("JetBrains Mono"),
+    text_size=gr.themes.sizes.text_lg,
+    radius_size=gr.themes.sizes.radius_md,
+).set(
+    # Page background and body text
+    body_background_fill="#0a0a0a",
+    body_background_fill_dark="#0a0a0a",
+    body_text_color="#e2e8f0",
+    body_text_color_dark="#e2e8f0",
+    body_text_size="1rem",
+    # Blocks: background, border, label and title
+    block_background_fill="#141414",
+    block_background_fill_dark="#141414",
+    block_border_color="#1e293b",
+    block_border_color_dark="#1e293b",
+    block_label_text_color="#94a3b8",
+    block_label_text_color_dark="#94a3b8",
+    block_label_text_size="0.95rem",
+    block_title_text_color="#e2e8f0",
+    block_title_text_color_dark="#e2e8f0",
+    # Inputs
+    input_background_fill="#1e1e1e",
+    input_background_fill_dark="#1e1e1e",
+    input_border_color="#334155",
+    input_border_color_dark="#334155",
+    input_text_size="1rem",
+    input_placeholder_color="#64748b",
+    input_placeholder_color_dark="#64748b",
+    # Primary button (uses the palette's c500 orange, c400 on hover)
+    button_primary_background_fill="#fc7604",
+    button_primary_background_fill_dark="#fc7604",
+    button_primary_background_fill_hover="#fb923c",
+    button_primary_background_fill_hover_dark="#fb923c",
+    button_primary_text_color="#0a0a0a",
+    button_primary_text_color_dark="#0a0a0a",
+    button_large_text_size="1.1rem",
+    border_color_primary="#fc7604",
+    border_color_primary_dark="#fc7604",
+)
+# ─────────────────────────────────────────────────────────────────────────────
+# INTERFACE
+# ─────────────────────────────────────────────────────────────────────────────
+# Extra stylesheet passed to gr.Blocks(css=...). It targets the classes used
+# in the layout below: `subtitle`, `links-bar` and `step-label` (raw HTML in
+# Markdown) and `context-box` / `context-divider` (set via elem_classes).
+# It also hides the <footer> element and tightens padding below 768px.
+css = """
+.gradio-container {
+    max-width: 1200px !important;
+    margin: 0 auto !important;
+    padding: 1.5rem !important;
+}
+h1 { color: #fc7604 !important; font-size: 2.2rem !important; font-weight: 700 !important; margin-bottom: 0.2rem !important; }
+h3 { font-size: 1.15rem !important; }
+.subtitle { color: #94a3b8 !important; font-size: 1.1rem !important; margin-top: 0 !important; }
+a { color: #fd9a42 !important; }
+a:hover { color: #fec08a !important; }
+.step-label { color: #fc7604; font-weight: 600; font-size: 1.05rem; }
+.links-bar { font-size: 0.9rem; color: #64748b; margin-top: 0.5rem; }
+.links-bar a { color: #64748b !important; }
+.links-bar a:hover { color: #fd9a42 !important; }
+footer { display: none !important; }
+/* Unified context box */
+.context-box {
+    border: 1px solid #334155 !important;
+    border-radius: 8px !important;
+    padding: 1rem !important;
+    background: #141414 !important;
+}
+.context-box .block {
+    border: none !important;
+    background: transparent !important;
+    padding: 0 !important;
+    box-shadow: none !important;
+}
+.context-box .wrap {
+    gap: 0.75rem !important;
+}
+.context-box textarea {
+    background: #1e1e1e !important;
+    border: 1px solid #334155 !important;
+    border-radius: 6px !important;
+}
+.context-divider {
+    text-align: center;
+    color: #64748b !important;
+    font-size: 0.85rem !important;
+    margin: 0.25rem 0 !important;
+    padding: 0 !important;
+}
+.context-divider p { margin: 0 !important; }
+.context-box .file-upload,
+.context-box .upload-button {
+    border: 1px dashed #475569 !important;
+    border-radius: 6px !important;
+    background: #1a1a1a !important;
+}
+.context-box .file-preview {
+    border: none !important;
+}
+@media (max-width: 768px) {
+    .gradio-container { padding: 0.75rem !important; }
+    h1 { font-size: 1.6rem !important; }
+}
+"""
+# Page layout, top to bottom: header and links, the three numbered input
+# steps, the evaluate button, the two result cards with the agreement summary
+# below them, the examples, and a footer. The app object is bound to `demo`.
+with gr.Blocks(
+    title="groundlens — Check if your AI is hallucinating",
+    theme=theme,
+    css=css,
+) as demo:
+    # ── Header: title, tagline, intro and link bar ──
+    gr.Markdown("""
+# groundlens
+<p class="subtitle">Check if an AI gave you a real answer or made something up.</p>
+""")
+    gr.Markdown("""
+You asked an AI a question and got an answer. Was it real or hallucinated?
+Paste both below and we'll check using two independent methods: **groundlens**
+(geometric analysis) and **Vectara HHEM** (neural classifier).
+""")
+    gr.Markdown("""<p class="links-bar">
+<a href="https://github.com/groundlens-dev/groundlens">GitHub</a> ·
+<a href="https://docs.groundlens.dev">Docs</a> ·
+<a href="https://pypi.org/project/groundlens/">PyPI</a> ·
+<a href="https://arxiv.org/abs/2512.13771">SGI paper</a> ·
+<a href="https://arxiv.org/pdf/2602.13224v3">Taxonomy</a> ·
+<a href="https://arxiv.org/abs/2603.13259">Mechanistic paper</a>
+</p>""")
+    # ── Step 1: Question ──
+    gr.Markdown('<p class="step-label">1. What did you ask the AI?</p>')
+    q_in = gr.Textbox(
+        show_label=False,
+        placeholder="e.g. What does our insurance policy cover for water damage?",
+        lines=2,
+    )
+    # ── Step 2: Context ──
+    gr.Markdown(
+        '<p class="step-label">2. Did you give the AI any source material? (optional)</p>'
+    )
+    gr.Markdown(
+        "If you gave the AI a document, a webpage, an Excel file, or any reference "
+        "material to base its answer on, paste the text here or upload the file. "
+        "If you just asked a question with no source, skip this step.",
+    )
+    # Typed text and the file upload sit in one group styled by `.context-box`;
+    # run_comparison() merges both when both are supplied.
+    with gr.Group(elem_classes=["context-box"]):
+        ctx_in = gr.Textbox(
+            show_label=False,
+            placeholder="Paste source text here...",
+            lines=4,
+            container=False,
+        )
+        gr.Markdown("— or —", elem_classes=["context-divider"])
+        file_in = gr.File(
+            label="Upload a file (PDF, Excel, CSV, TXT — max 20 pages / 200 rows)",
+            file_types=[".pdf", ".xlsx", ".xls", ".csv", ".txt"],
+            file_count="single",
+            height=60,
+        )
+    # ── Step 3: Response ──
+    gr.Markdown('<p class="step-label">3. What did the AI answer?</p>')
+    r_in = gr.Textbox(
+        show_label=False,
+        placeholder="Paste the AI's response here...",
+        lines=4,
+    )
+    # ── Evaluate button ──
+    run_btn = gr.Button(
+        "Check for hallucination",
+        variant="primary",
+        size="lg",
+    )
+    # ── Results ──
+    # The two method cards share a row; the agreement summary goes below it.
+    with gr.Row(equal_height=True):
+        gl_out = gr.Markdown()
+        hhem_out = gr.Markdown()
+    agreement_out = gr.Markdown()
+    # ── Examples ──
+    # Examples are wired to the three text inputs only; the file upload is
+    # not part of them.
+    gr.Markdown("---")
+    gr.Markdown("### Try an example")
+    gr.Examples(
+        examples=EXAMPLES,
+        inputs=[q_in, ctx_in, r_in],
+        label="",
+    )
+    # ── Footer ──
+    gr.Markdown("""
+---
+<p style="color:#475569; font-size:0.85rem; text-align:center;">
+<strong>groundlens</strong> is open source (MIT). Built by
+<a href="https://jmarin.info" style="color:#64748b !important;">Javier Marin</a>.
+This demo runs the same library available via <code>pip install groundlens</code>.<br>
+groundlens is verification triage, not a truth oracle. It tells you which answers
+deserve trust and which need a closer look.
+</p>
+""")
+    # ── Event binding ──
+    # Inputs follow run_comparison's parameter order (question, context_text,
+    # file_upload, response); its three return values fill the three outputs.
+    run_btn.click(
+        fn=run_comparison,
+        inputs=[q_in, ctx_in, file_in, r_in],
+        outputs=[gl_out, hhem_out, agreement_out],
+    )
+# Launch the app only when this file is run as a script, not when imported.
+if __name__ == "__main__":
+    demo.launch()