# Spaces: groundlens / demo (status: Sleeping)
# File size: 19,408 Bytes
# 9f6ea26

"""
groundlens — Geometric LLM Hallucination Detection Demo

Plain-language interface: paste a question and the AI's answer,
optionally upload context (PDF, Excel, or plain text).
Compares groundlens (embedding geometry) vs Vectara HHEM-2.1-Open.

Models load once at module level to avoid cold-start on Space wake.
"""

import logging
import time
import os

import gradio as gr
from groundlens import compute_sgi, compute_dgi

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# ─────────────────────────────────────────────────────────────────────────────
# FILE EXTRACTION — PDF and Excel support
# ─────────────────────────────────────────────────────────────────────────────

def extract_pdf_text(file_path: str, max_chars: int = 8000) -> str:
    """Pull plain text out of a PDF on a best-effort basis.

    Only the first 20 pages are read. Pages that yield no text are skipped,
    the remaining page texts are joined with a blank line between them, and
    the result is cut to at most ``max_chars`` characters.

    Never raises: any failure (including ``pdfplumber`` not being installed)
    comes back as a bracketed ``"[Could not read PDF: ...]"`` string.
    """
    try:
        import pdfplumber

        with pdfplumber.open(file_path) as document:
            page_texts = (page.extract_text() for page in document.pages[:20])
            # Consume the generator while the PDF is still open.
            non_empty = [chunk for chunk in page_texts if chunk]

        combined = "\n\n".join(non_empty)
        if len(combined) > max_chars:
            combined = combined[:max_chars]
        return combined
    except Exception as exc:
        return f"[Could not read PDF: {exc}]"


def extract_excel_text(file_path: str, max_chars: int = 8000) -> str:
    """Flatten an Excel workbook into pipe-separated text, best-effort.

    Reads at most the first 5 sheets and the first 200 rows of each. Every
    sheet contributes a ``--- <sheet name> ---`` header line followed by one
    line per non-blank row, with cell values joined by ``" | "``. The
    workbook is opened with ``data_only=True``.

    Args:
        file_path: Path of the workbook to read.
        max_chars: Upper bound on the length of the returned text.

    Returns:
        The extracted text, or a bracketed
        ``"[Could not read Excel file: ...]"`` string on any failure
        (including ``openpyxl`` not being installed). Never raises.
    """
    try:
        import openpyxl

        wb = openpyxl.load_workbook(file_path, data_only=True)
        text_parts = []
        for sheet_name in wb.sheetnames[:5]:
            ws = wb[sheet_name]
            text_parts.append(f"--- {sheet_name} ---")
            for row in ws.iter_rows(max_row=200, values_only=True):
                cells = ["" if c is None else str(c) for c in row]
                # Skip rows with no visible content. Checking the cells
                # themselves is reliable; comparing the stripped joined line
                # against an unstripped all-empty join never matched for rows
                # of two or more cells, so blank rows leaked through as "|".
                if not any(cell.strip() for cell in cells):
                    continue
                text_parts.append(" | ".join(cells).strip())
        full_text = "\n".join(text_parts)
        return full_text[:max_chars]
    except Exception as e:
        return f"[Could not read Excel file: {e}]"


def extract_file_to_text(file) -> str:
    """Extract text from an uploaded file and return it for the textbox.

    Dispatches on the file extension (case-insensitive):

    * ``.pdf`` -> ``extract_pdf_text``
    * ``.xlsx`` / ``.xls`` -> ``extract_excel_text``
    * ``.txt`` / ``.md`` / ``.csv`` -> first 8000 characters, decoded as UTF-8
      with undecodable bytes replaced

    Args:
        file: ``None``, a path-like value, or an object exposing the path as
            ``.name``.

    Returns:
        ``""`` when ``file`` is ``None`` or nothing was extracted; a bracketed
        ``"[Could not ...]"`` / ``"[Unsupported ...]"`` message on failure;
        otherwise the text prefixed with an ``[Extracted from <basename>]``
        header line and a blank line.
    """
    if file is None:
        return ""

    file_path = file.name if hasattr(file, "name") else str(file)
    ext = os.path.splitext(file_path)[1].lower()
    basename = os.path.basename(file_path)

    if ext == ".pdf":
        text = extract_pdf_text(file_path)
    elif ext in (".xlsx", ".xls"):
        text = extract_excel_text(file_path)
    elif ext in (".txt", ".md", ".csv"):
        try:
            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                text = f.read(8000)
        except Exception as e:
            text = f"[Could not read file: {e}]"
    else:
        text = f"[Unsupported file type: {ext}. Use PDF, Excel, TXT, or CSV.]"

    # Only the known error prefixes mark a failure. Testing for a bare "["
    # misclassified real content that happens to start with a bracket
    # (citations like "[1] ...", markdown links, JSON-like CSV cells) and
    # silently dropped the header for it.
    if not text or text.startswith(("[Could not", "[Unsupported")):
        return text
    return f"[Extracted from {basename}]\n\n{text}"


# ─────────────────────────────────────────────────────────────────────────────
# HHEM-2.1-Open — baseline comparison
# ─────────────────────────────────────────────────────────────────────────────

# Everything in this section runs at import time, so the classifier is
# created and groundlens is exercised once before any request is handled.
logger.info("Loading HHEM-2.1-Open...")
# Imported here rather than at the top of the file, right before its only use.
from transformers import AutoModelForSequenceClassification

# NOTE(review): trust_remote_code=True lets transformers execute Python code
# shipped in the model repository. Only keep this for a repository you trust;
# consider pinning a revision.
_hhem = AutoModelForSequenceClassification.from_pretrained(
    "vectara/hallucination_evaluation_model",
    trust_remote_code=True,
)
logger.info("HHEM loaded.")

# Warm up groundlens embedding model: a throwaway call whose result is
# discarded (presumably it forces the library's lazy model load — confirm).
logger.info("Warming up groundlens...")
compute_dgi(question="warmup", response="warmup")
logger.info("groundlens ready.")


# ─────────────────────────────────────────────────────────────────────────────
# SCORING
# ─────────────────────────────────────────────────────────────────────────────

def score_groundlens(question: str, response: str, context: str) -> dict:
    """Score a response with groundlens and time the call.

    Uses ``compute_sgi`` when ``context`` contains non-whitespace text and
    ``compute_dgi`` otherwise. The verdict comes from the library result
    (``grounded`` is ``not result.flagged``); the ``threshold`` value in the
    returned dict is only reported alongside the score and is not compared
    against anything here.

    Returns:
        A dict with keys ``method``, ``raw_score`` (rounded to 4 places),
        ``grounded``, ``threshold``, ``elapsed_ms`` (rounded to 1 place) and
        ``mode_note`` (a plain-language description of the mode used).
    """
    started = time.perf_counter()

    if context.strip():
        outcome = compute_sgi(
            question=question,
            context=context,
            response=response,
        )
        method, threshold = "SGI (with context)", 0.95
        mode_note = (
            "Measured how much the AI's answer used your source document "
            "vs. just rephrasing the question."
        )
    else:
        outcome = compute_dgi(
            question=question,
            response=response,
        )
        method, threshold = "DGI (without context)", 0.30
        mode_note = (
            "Measured whether the AI's answer follows patterns typical "
            "of grounded, factual responses."
        )

    score = outcome.value
    is_grounded = not outcome.flagged
    duration_ms = (time.perf_counter() - started) * 1000

    return {
        "method": method,
        "raw_score": round(score, 4),
        "grounded": is_grounded,
        "threshold": threshold,
        "elapsed_ms": round(duration_ms, 1),
        "mode_note": mode_note,
    }


def score_hhem(question: str, response: str, context: str) -> dict:
    """Score a response with the HHEM classifier and time the prediction.

    The premise given to the model is the stripped context followed by a
    blank line and the question, or just the question when there is no
    context. It is capped at 1800 characters; the cut is taken from the end,
    so with a long context the question is what gets dropped.

    Returns:
        A dict with keys ``method``, ``raw_score`` (rounded to 4 places),
        ``grounded`` (score >= 0.5), ``elapsed_ms`` (rounded to 1 place) and
        ``label`` (``"consistent"`` or ``"hallucinated"``).
    """
    source_text = context.strip()
    if source_text:
        premise = f"{source_text}\n\n{question}".strip()
    else:
        premise = question
    premise = premise[:1800]

    started = time.perf_counter()
    predictions = _hhem.predict([(premise, response)])
    score = float(predictions[0])
    duration_ms = (time.perf_counter() - started) * 1000

    is_consistent = score >= 0.5
    return {
        "method": "HHEM-2.1-Open",
        "raw_score": round(score, 4),
        "grounded": is_consistent,
        "elapsed_ms": round(duration_ms, 1),
        "label": "consistent" if is_consistent else "hallucinated",
    }


# ─────────────────────────────────────────────────────────────────────────────
# MAIN COMPARISON — now takes only text inputs (no file object)
# ─────────────────────────────────────────────────────────────────────────────

def _strip_extraction_headers(text: str) -> str:
    """Remove ``[Extracted from <file>]`` marker lines from pasted context."""
    import re

    text = text.strip()
    # A first line that merely starts with the marker is dropped whole, even
    # if the user edited the rest of that line.
    if text.startswith("[Extracted from "):
        _, newline, remainder = text.partition("\n")
        if newline:
            text = remainder.strip()
    # Uploads are appended after any pasted text, and several files can be
    # uploaded in a row, so markers also occur in the middle of the context.
    # Drop every line that consists solely of a marker, plus the blank lines
    # that follow it.
    return re.sub(
        r"(?m)^\[Extracted from [^\n]*\][ \t\r]*$\n*", "", text
    ).strip()


def run_comparison(
    question: str, context_text: str, response: str
) -> tuple[str, str, str]:
    """Score one question/response pair with both methods and render Markdown.

    Args:
        question: The question the user asked the AI.
        context_text: Optional source material; may contain the
            ``[Extracted from <file>]`` markers added on upload, which are
            removed before scoring.
        response: The AI's answer to evaluate.

    Returns:
        Three Markdown strings: the groundlens panel, the HHEM panel and the
        agreement summary. When the question or response is blank, the first
        string is a warning and the other two are empty.
    """
    # The text boxes normally deliver "", but treat None the same way instead
    # of failing on .strip().
    if not (question or "").strip():
        return "⚠️ Enter the question you asked the AI.", "", ""
    if not (response or "").strip():
        return "⚠️ Enter the AI's response.", "", ""

    context = _strip_extraction_headers(context_text or "")

    gl = score_groundlens(question, response, context)
    hhem = score_hhem(question, response, context)

    # groundlens result
    if gl["grounded"]:
        gl_verdict = "🟢 Looks grounded"
        gl_explain = "The AI's answer appears to be based on real information."
    else:
        gl_verdict = "🔴 Possible hallucination"
        gl_explain = "The AI's answer shows signs of being fabricated or not grounded in the source."

    gl_md = f"""### groundlens

**{gl_verdict}**

{gl_explain}

| | |
|---|---|
| **Method** | {gl["method"]} |
| **Score** | {gl["raw_score"]} (threshold: {gl["threshold"]}) |
| **Time** | {gl["elapsed_ms"]} ms |

*{gl["mode_note"]}*"""

    # HHEM result
    if hhem["grounded"]:
        hhem_verdict = "🟢 Looks consistent"
        hhem_explain = "The classifier considers this answer consistent with the input."
    else:
        hhem_verdict = "🔴 Possible hallucination"
        hhem_explain = "The classifier flagged this answer as potentially hallucinated."

    hhem_md = f"""### Vectara HHEM-2.1-Open

**{hhem_verdict}**

{hhem_explain}

| | |
|---|---|
| **Method** | {hhem["method"]} |
| **Score** | {hhem["raw_score"]} ({hhem["label"]}) |
| **Time** | {hhem["elapsed_ms"]} ms |

*Fine-tuned flan-T5 classifier.*"""

    # Agreement between the two verdicts
    if gl["grounded"] != hhem["grounded"]:
        agreement_md = """### 🟠 The two methods disagree.

This often happens with **subtle factual errors** — the answer sounds right and
uses the correct vocabulary, but gets specific facts wrong. Embedding geometry
(groundlens) measures the shape of the answer; the classifier (HHEM) evaluates
its content differently. When they disagree, it's worth checking the facts manually.

[Learn more about hallucination types →](https://docs.groundlens.dev/theory/hallucination-taxonomy/)"""
    elif gl["grounded"]:
        agreement_md = "### 🔵 Both methods agree: the answer looks reliable."
    else:
        agreement_md = "### 🔴 Both methods agree: this answer is likely hallucinated."

    return gl_md, hhem_md, agreement_md


# ─────────────────────────────────────────────────────────────────────────────
# EXAMPLES
# ─────────────────────────────────────────────────────────────────────────────

# Rows for the "Try an example" table. Each row is
# [question, context, response], matching the order of
# inputs=[q_in, ctx_in, r_in] passed to gr.Examples.
EXAMPLES = [
    # With context: the response restates figures found in the context.
    [
        "What does the water damage policy cover?",
        "Coverage includes burst pipes and sudden appliance failure up to "
        "$50,000. Flood damage requires a separate NFIP policy. "
        "Deductible is $1,500 per occurrence.",
        "The policy covers burst pipes and sudden appliance failure up to "
        "$50,000 per occurrence, with a $1,500 deductible.",
    ],
    # Same question and context: the response contradicts the context
    # (claims flood cover and no deductible).
    [
        "What does the water damage policy cover?",
        "Coverage includes burst pipes and sudden appliance failure up to "
        "$50,000. Flood damage requires a separate NFIP policy. "
        "Deductible is $1,500 per occurrence.",
        "The policy covers all water damage including floods "
        "with no deductible required.",
    ],
    # No context (empty string), so scoring takes the no-context path.
    [
        "What causes seasons on Earth?",
        "",
        "Seasons are caused by Earth's 23.5-degree axial tilt, which "
        "changes how directly sunlight hits each hemisphere.",
    ],
    # No context; presumably intended as the fabricated counterpart of the
    # previous row.
    [
        "What causes seasons on Earth?",
        "",
        "Seasons are regulated by the Atmospheric Regulation Committee, "
        "a UN body established in 1952 that adjusts global temperature "
        "through orbital satellites.",
    ],
]


# ─────────────────────────────────────────────────────────────────────────────
# THEME — dark, matching groundlens.dev
# ─────────────────────────────────────────────────────────────────────────────

# Orange palette centred on #fc7604 (c500).
# NOTE(review): nothing in the visible file references _orange; the theme
# below hard-codes hex values instead. Confirm whether it is still needed.
_orange = gr.themes.Color(
    c50="#fff7ed",
    c100="#ffedd5",
    c200="#fed7aa",
    c300="#fdba74",
    c400="#fb923c",
    c500="#fc7604",
    c600="#ea580c",
    c700="#c2410c",
    c800="#9a3412",
    c900="#7c2d12",
    c950="#431407",
)

# Start from a hub-hosted theme and override the primary button and border
# colours. This runs at import time (presumably fetching the theme over the
# network — confirm).
theme = gr.Theme.from_hub("Bruhn/CrimsonNight").set(
    # Override crimson red → groundlens orange
    button_primary_background_fill="#fc7604",
    button_primary_background_fill_dark="#fc7604",
    button_primary_background_fill_hover="#fb923c",
    button_primary_background_fill_hover_dark="#fb923c",
    button_primary_text_color="#0a0a0a",
    button_primary_text_color_dark="#0a0a0a",
    border_color_primary="#fc7604",
    border_color_primary_dark="#fc7604",
)


# ─────────────────────────────────────────────────────────────────────────────
# INTERFACE
# ─────────────────────────────────────────────────────────────────────────────

# Custom stylesheet handed to gr.Blocks(css=...). The class selectors
# (.subtitle, .step-label, .links-bar, .upload-btn, .upload-status) match the
# class names used in the Markdown snippets and elem_classes of the interface.
css = """
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
    padding: 1.5rem !important;
}
h1 { color: #fc7604 !important; font-size: 2.2rem !important; font-weight: 700 !important; margin-bottom: 0.2rem !important; }
h3 { font-size: 1.15rem !important; }
.subtitle { color: #94a3b8 !important; font-size: 1.1rem !important; margin-top: 0 !important; }
a { color: #fd9a42 !important; }
a:hover { color: #fec08a !important; }
.step-label { color: #fc7604; font-weight: 600; font-size: 1.05rem; }
.links-bar { font-size: 0.9rem; color: #64748b; margin-top: 0.5rem; }
.links-bar a { color: #64748b !important; }
.links-bar a:hover { color: #fd9a42 !important; }
footer { display: none !important; }

/* Upload button — small, dashed secondary style */
.upload-btn { margin-top: 0.25rem !important; }
.upload-btn button {
    background: transparent !important;
    border: 1px dashed #475569 !important;
    color: #94a3b8 !important;
    font-size: 0.85rem !important;
    padding: 0.4rem 1rem !important;
    border-radius: 6px !important;
}
.upload-btn button:hover {
    border-color: #fc7604 !important;
    color: #fc7604 !important;
}
.upload-status p {
    color: #94a3b8 !important;
    font-size: 0.85rem !important;
    margin: 0.25rem 0 0 0 !important;
    font-style: italic;
}
@media (max-width: 768px) {
    .gradio-container { padding: 0.75rem !important; }
    h1 { font-size: 1.6rem !important; }
}
"""

# Page layout. Components are created in the order they appear below, so the
# statement order is the on-page order: intro, the three numbered input steps,
# the evaluate button, the result panels, examples and the footer.
with gr.Blocks(
    title="groundlens — Check if your AI is hallucinating",
    theme=theme,
    css=css,
) as demo:

    # ── Intro: title, tagline, explanation and project links ──
    gr.Markdown("""
# groundlens

<p class="subtitle">Check if an AI gave you a real answer or made something up.</p>
""")

    gr.Markdown("""
You asked an AI a question and got an answer. Was it real or hallucinated?
Paste both below and we'll check using two independent methods: **groundlens**
(geometric analysis) and **Vectara HHEM** (neural classifier).
""")

    gr.Markdown("""<p class="links-bar">
<a href="https://github.com/groundlens-dev/groundlens">GitHub</a> ·
<a href="https://docs.groundlens.dev">Docs</a> ·
<a href="https://pypi.org/project/groundlens/">PyPI</a> ·
<a href="https://arxiv.org/abs/2512.13771">SGI paper</a> ·
<a href="https://arxiv.org/pdf/2602.13224v3">Taxonomy</a> ·
<a href="https://arxiv.org/abs/2603.13259">Mechanistic paper</a>
</p>""")

    # ── Step 1: Question ──
    gr.Markdown('<p class="step-label">1. What did you ask the AI?</p>')
    q_in = gr.Textbox(
        show_label=False,
        placeholder="e.g. What does our insurance policy cover for water damage?",
        lines=2,
    )

    # ── Step 2: Context ──
    gr.Markdown(
        '<p class="step-label">2. Did you give the AI any source material? (optional)</p>'
    )
    gr.Markdown(
        "If you gave the AI a document, a webpage, an Excel file, or any reference "
        "material to base its answer on, paste the text below. "
        "If you just asked a question with no source, skip this step.",
    )

    # Context is always scored from this textbox; uploads only fill it in.
    ctx_in = gr.Textbox(
        show_label=False,
        placeholder="Paste the source text here, or use the upload button below to extract text from a file...",
        lines=5,
    )

    # Hidden file input + visible upload button
    # NOTE(review): file_in is invisible and is not used as an input or output
    # of any event in this block; uploads go through upload_btn below.
    file_in = gr.File(
        file_types=[".pdf", ".xlsx", ".xls", ".csv", ".txt"],
        file_count="single",
        visible=False,
    )
    # One-line status message shown after an upload attempt.
    upload_status = gr.Markdown("", elem_classes=["upload-status"])

    upload_btn = gr.UploadButton(
        "📄 Upload a file (PDF, Excel, CSV, TXT)",
        file_types=[".pdf", ".xlsx", ".xls", ".csv", ".txt"],
        file_count="single",
        elem_classes=["upload-btn"],
    )

    def handle_upload(file, existing_text):
        """Extract file text and append to context textbox.

        Returns a pair ``(new textbox value, status markdown)``. On failure
        the textbox value is returned unchanged and the status carries the
        bracketed error message.
        """
        extracted = extract_file_to_text(file)
        # Nothing extracted (no file, or a file with no text): leave the
        # textbox alone and clear the status.
        # NOTE(review): an empty extraction gives the user no feedback.
        if not extracted:
            return existing_text, ""
        # Error markers produced by the extraction helpers.
        if extracted.startswith("[Could not") or extracted.startswith("[Unsupported"):
            return existing_text, f"⚠️ {extracted}"

        basename = os.path.basename(file.name if hasattr(file, 'name') else str(file))
        # Append after any existing text (never replaces it); otherwise the
        # extracted text becomes the whole context.
        if existing_text and existing_text.strip():
            new_text = existing_text.strip() + "\n\n" + extracted
        else:
            new_text = extracted
        return new_text, f"✓ Extracted text from **{basename}**"

    # The uploaded file and the current textbox content go in; the updated
    # textbox content and the status line come out.
    upload_btn.upload(
        fn=handle_upload,
        inputs=[upload_btn, ctx_in],
        outputs=[ctx_in, upload_status],
    )

    # ── Step 3: Response ──
    gr.Markdown('<p class="step-label">3. What did the AI answer?</p>')
    r_in = gr.Textbox(
        show_label=False,
        placeholder="Paste the AI's response here...",
        lines=4,
    )

    # ── Evaluate button ──
    run_btn = gr.Button(
        "Check for hallucination",
        variant="primary",
        size="lg",
    )

    # ── Results ──
    # Two panels side by side, with the agreement summary underneath.
    with gr.Row(equal_height=True):
        gl_out = gr.Markdown()
        hhem_out = gr.Markdown()

    agreement_out = gr.Markdown()

    # ── Examples ──
    gr.Markdown("---")
    gr.Markdown("### Try an example")

    # No fn is given, so the examples are only wired to the three input boxes.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[q_in, ctx_in, r_in],
        label="",
    )

    # ── Footer ──
    gr.Markdown("""
---

<p style="color:#475569; font-size:0.85rem; text-align:center;">
<strong>groundlens</strong> is open source (MIT). Built by
<a href="https://jmarin.info" style="color:#64748b !important;">Javier Marin</a>.
This demo runs the same library available via <code>pip install groundlens</code>.<br>
groundlens is verification triage, not a truth oracle. It tells you which answers
deserve trust and which need a closer look.
</p>
""")

    # ── Event binding ──
    # run_comparison returns three Markdown strings, one per output component.
    run_btn.click(
        fn=run_comparison,
        inputs=[q_in, ctx_in, r_in],
        outputs=[gl_out, hhem_out, agreement_out],
    )


# Start the Gradio server only when this file is executed as a script;
# importing the module builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()