Spaces:

groundlens
/

demo

Sleeping

App Files Files Community

AI-that-works committed on 12 days ago

Commit

acd77b0

verified ·

1 Parent(s): bbdf3b3

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -362

app.py DELETED Viewed

@@ -1,362 +0,0 @@
-"""
-groundlens — Geometric LLM Hallucination Detection
-Live demo comparing groundlens (embedding geometry) against
-Vectara HHEM-2.1-Open (fine-tuned flan-T5 classifier).
-Uses the groundlens library directly — same code as `pip install groundlens`.
-Architecture: flat, sequential, no classes. Models load once at module level
-to eliminate cold-start timeout when the Space wakes from sleep.
-"""
-import logging
-import time
-import gradio as gr
-from groundlens import compute_sgi, compute_dgi
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# ─────────────────────────────────────────────────────────────────────────────
-# HHEM-2.1-Open — baseline comparison
-# Uses AutoModelForSequenceClassification with custom .predict().
-# Input: List[Tuple[str, str]] — model handles flan-T5 template internally.
-# Output: float per pair, 0.0 = hallucinated, 1.0 = consistent.
-# ─────────────────────────────────────────────────────────────────────────────
-# The model is loaded at import time, as a module-level side effect, so that
-# it is already in memory before the first request (see the module docstring).
-logger.info("Loading HHEM-2.1-Open (vectara/hallucination_evaluation_model)...")
-from transformers import AutoModelForSequenceClassification
-# NOTE(review): trust_remote_code=True presumably lets the model repository's
-# own Python code run locally (the custom .predict() mentioned in the header
-# comment would come from it) — confirm against the transformers docs and
-# consider pinning a specific revision.
-_hhem = AutoModelForSequenceClassification.from_pretrained(
-    "vectara/hallucination_evaluation_model",
-    trust_remote_code=True,
-)
-logger.info("HHEM loaded.")
-# ─────────────────────────────────────────────────────────────────────────────
-# SCORING — groundlens (SGI / DGI)
-# ─────────────────────────────────────────────────────────────────────────────
-def score_groundlens(question: str, response: str, context: str) -> dict:
-    """Score a response with groundlens and time the call.
-
-    A non-blank ``context`` selects ``compute_sgi`` (reported as "SGI");
-    a blank or whitespace-only ``context`` selects ``compute_dgi`` ("DGI").
-
-    Args:
-        question: The user question.
-        response: The LLM response being evaluated.
-        context: Source text; blank or whitespace-only means "no context".
-
-    Returns:
-        A dict with the keys ``method``, ``raw_score`` (rounded to 4
-        places), ``grounded`` (the negation of the result's ``flagged``
-        attribute), ``threshold`` (0.95 for SGI, 0.30 for DGI; reported
-        only, it is not compared against the score in this function),
-        ``elapsed_ms`` (rounded to 1 place), ``explanation``, ``detail``
-        (the two SGI distances, or "" for DGI) and ``mode_note``.
-    """
-    t0 = time.perf_counter()
-    use_sgi = context.strip() != ""
-
-    if use_sgi:
-        outcome = compute_sgi(
-            question=question,
-            context=context,
-            response=response,
-        )
-    else:
-        outcome = compute_dgi(
-            question=question,
-            response=response,
-        )
-
-    # Both result objects are read through the same two attributes.
-    score = outcome.value
-    is_grounded = not outcome.flagged
-
-    if use_sgi:
-        label, cutoff = "SGI", 0.95
-        distances = "\n".join(
-            (
-                f"dist(response, question) = {outcome.q_dist:.4f}",
-                f"dist(response, context) = {outcome.ctx_dist:.4f}",
-            )
-        )
-        note = (
-            "*One embedding model, one geometric ratio. "
-            "No model inference for evaluation.*"
-        )
-    else:
-        label, cutoff = "DGI", 0.30
-        distances = ""
-        note = (
-            "*Measuring displacement alignment against "
-            "grounded reference direction.*"
-        )
-
-    duration_ms = (time.perf_counter() - t0) * 1000
-    return {
-        "method": label,
-        "raw_score": round(score, 4),
-        "grounded": is_grounded,
-        "threshold": cutoff,
-        "elapsed_ms": round(duration_ms, 1),
-        "explanation": outcome.explanation,
-        "detail": distances,
-        "mode_note": note,
-    }
-# ─────────────────────────────────────────────────────────────────────────────
-# SCORING — HHEM-2.1-Open (baseline)
-# ─────────────────────────────────────────────────────────────────────────────
-def score_hhem(question: str, response: str, context: str) -> dict:
-    """Score a response with the HHEM-2.1-Open baseline and time the call.
-
-    The premise passed to the model is the stripped context followed by a
-    blank line and the question, or the question alone when the context is
-    blank. The premise is capped at 1800 characters. When the cap is
-    exceeded, the context is shortened first so that the question, which
-    comes last, is not cut off; inputs within the cap are passed unchanged.
-
-    Args:
-        question: The user question.
-        response: The LLM response, used as the hypothesis of the pair.
-        context: Source text; blank or whitespace-only means "no context".
-
-    Returns:
-        A dict with the keys ``method`` ("HHEM-2.1-Open"), ``raw_score``
-        (rounded to 4 places), ``grounded`` (raw score >= 0.5),
-        ``elapsed_ms`` (model call only, rounded to 1 place) and ``label``
-        ("consistent" or "hallucinated").
-    """
-    # T5 max is ~512 tokens — keep the premise within a safe char limit.
-    max_premise_chars = 1800
-
-    source = context.strip()
-    premise = f"{source}\n\n{question}".strip() if source else question
-
-    if len(premise) > max_premise_chars:
-        if source:
-            # Trim the context, not the tail: a plain premise[:limit] would
-            # drop the question whenever the context alone fills the budget.
-            tail = f"\n\n{question}".rstrip()
-            keep = max(0, max_premise_chars - len(tail))
-            premise = f"{source[:keep]}{tail}".strip()
-        # Final guard for a question that is itself longer than the limit.
-        premise = premise[:max_premise_chars]
-
-    start = time.perf_counter()
-    scores = _hhem.predict([(premise, response)])
-    raw_score = float(scores[0])
-    elapsed_ms = (time.perf_counter() - start) * 1000
-
-    consistent = raw_score >= 0.5
-    return {
-        "method": "HHEM-2.1-Open",
-        "raw_score": round(raw_score, 4),
-        "grounded": consistent,
-        "elapsed_ms": round(elapsed_ms, 1),
-        "label": "consistent" if consistent else "hallucinated",
-    }
-# ─────────────────────────────────────────────────────────────────────────────
-# COMPARISON — called by Gradio on every submission
-# ─────────────────────────────────────────────────────────────────────────────
-def run_comparison(
-    question: str, context: str, response: str
-) -> tuple[str, str, str]:
-    """Evaluate one submission with both scorers and render the results.
-
-    Args:
-        question: The user question; must not be blank.
-        context: Optional source text, passed through to both scorers.
-        response: The LLM response to evaluate; must not be blank.
-
-    Returns:
-        Three Markdown strings: the groundlens panel, the HHEM panel and
-        the agreement note. When the question or the response is blank,
-        the first string is a prompt for the missing input and the other
-        two are empty; neither scorer is called.
-    """
-    if not question.strip():
-        return "Provide a question.", "", ""
-    if not response.strip():
-        return "Provide a response to evaluate.", "", ""
-
-    geo = score_groundlens(question, response, context)
-    baseline = score_hhem(question, response, context)
-
-    def verdict(is_grounded) -> str:
-        # Shared headline for both result panels.
-        return "🟢 Not hallucinated" if is_grounded else "🔴 Hallucinated"
-
-    # groundlens panel
-    geo_panel = "\n".join(
-        (
-            f"**{verdict(geo['grounded'])}**",
-            "| | |",
-            "|---|---|",
-            f"| Method | `{geo['method']}` |",
-            f"| Score | `{geo['raw_score']}` |",
-            f"| Threshold | `{geo['threshold']}` |",
-            f"| Latency | `{geo['elapsed_ms']} ms` |",
-            f"{geo['mode_note']}",
-        )
-    )
-
-    # HHEM panel
-    baseline_panel = "\n".join(
-        (
-            f"**{verdict(baseline['grounded'])}**",
-            "| | |",
-            "|---|---|",
-            f"| Method | `{baseline['method']}` |",
-            f"| Score | `{baseline['raw_score']}` |",
-            f"| Label | `{baseline['label']}` |",
-            f"| Latency | `{baseline['elapsed_ms']} ms` |",
-            "*flan-T5 classifier. Full model inference per call.*",
-        )
-    )
-
-    # Agreement note
-    if geo["grounded"] != baseline["grounded"]:
-        agreement_note = """🟠 **Methods disagree.**
-groundlens uses geometric displacement in embedding space.
-HHEM uses a learned classifier (fine-tuned flan-T5).
-Disagreement often surfaces **Type III hallucinations** — factual errors
-within the correct semantic frame. Embedding geometry cannot detect
-these: the response occupies the right region of the space but gets
-the facts wrong. See the
-[hallucination taxonomy](https://docs.groundlens.dev/theory/hallucination-taxonomy/)
-for details."""
-    else:
-        agreement_note = "🔵 **Both methods agree.**"
-
-    return geo_panel, baseline_panel, agreement_note
-# ─────────────────────────────────────────────────────────────────────────────
-# EXAMPLES
-# ─────────────────────────────────────────────────────────────────────────────
-# Rows for gr.Examples. Each row is [question, context, response], in the
-# same order as the inputs list [q_in, ctx_in, r_in] it is wired to.
-# An empty context string means the row is scored without context (DGI).
-EXAMPLES = [
-    # With context: the response restates figures found in the context.
-    [
-        "What does the water damage policy cover?",
-        "Coverage includes burst pipes and sudden appliance failure up to "
-        "$50,000. Flood damage requires a separate NFIP policy. "
-        "Deductible is $1,500 per occurrence.",
-        "The policy covers burst pipes and sudden appliance failure up to "
-        "$50,000 per occurrence, with a $1,500 deductible.",
-    ],
-    # With context: the response contradicts the context on floods and
-    # on the deductible.
-    [
-        "What does the water damage policy cover?",
-        "Coverage includes burst pipes and sudden appliance failure up to "
-        "$50,000. Flood damage requires a separate NFIP policy. "
-        "Deductible is $1,500 per occurrence.",
-        "The policy covers all water damage including floods "
-        "with no deductible required.",
-    ],
-    # No context: axial-tilt explanation of the seasons.
-    [
-        "What causes seasons on Earth?",
-        "",
-        "Seasons are caused by Earth's 23.5-degree axial tilt, which "
-        "changes how directly sunlight hits each hemisphere.",
-    ],
-    # No context: a response describing a fabricated committee.
-    [
-        "What causes seasons on Earth?",
-        "",
-        "Seasons are regulated by the Atmospheric Regulation Committee, "
-        "a UN body established in 1952 that adjusts global temperature "
-        "through orbital satellites.",
-    ],
-]
-# ─────────────────────────────────────────────────────────────────────────────
-# CUSTOM THEME — dark, matching groundlens.dev
-# ─────────────────────────────────────────────────────────────────────────────
-# Gradio theme: gr.themes.Base with a custom orange primary palette, slate
-# secondary/neutral hues and Google fonts, followed by explicit overrides.
-theme = gr.themes.Base(
-    # Full c50..c950 ramp for the primary hue.
-    primary_hue=gr.themes.Color(
-        c50="#fff7ed",
-        c100="#ffedd5",
-        c200="#fed7aa",
-        c300="#fdba74",
-        c400="#fb923c",
-        c500="#fc7604",  # groundlens orange
-        c600="#ea580c",
-        c700="#c2410c",
-        c800="#9a3412",
-        c900="#7c2d12",
-        c950="#431407",
-    ),
-    secondary_hue="slate",
-    neutral_hue="slate",
-    font=gr.themes.GoogleFont("Inter"),
-    font_mono=gr.themes.GoogleFont("JetBrains Mono"),
-).set(
-    # Every setting is given twice, plain and *_dark, with the same value,
-    # so both variants use the same dark palette.
-    body_background_fill="#0a0a0a",
-    body_background_fill_dark="#0a0a0a",
-    body_text_color="#e2e8f0",
-    body_text_color_dark="#e2e8f0",
-    block_background_fill="#141414",
-    block_background_fill_dark="#141414",
-    block_border_color="#1e293b",
-    block_border_color_dark="#1e293b",
-    block_label_text_color="#94a3b8",
-    block_label_text_color_dark="#94a3b8",
-    block_title_text_color="#e2e8f0",
-    block_title_text_color_dark="#e2e8f0",
-    input_background_fill="#1e1e1e",
-    input_background_fill_dark="#1e1e1e",
-    input_border_color="#334155",
-    input_border_color_dark="#334155",
-    input_placeholder_color="#64748b",
-    input_placeholder_color_dark="#64748b",
-    # Primary button and border reuse the c500 orange from the palette.
-    button_primary_background_fill="#fc7604",
-    button_primary_background_fill_dark="#fc7604",
-    button_primary_background_fill_hover="#fb923c",
-    button_primary_background_fill_hover_dark="#fb923c",
-    button_primary_text_color="#0a0a0a",
-    button_primary_text_color_dark="#0a0a0a",
-    border_color_primary="#fc7604",
-    border_color_primary_dark="#fc7604",
-)
-# ─────────────────────────────────────────────────────────────────────────────
-# INTERFACE
-# ─────────────────────────────────────────────────────────────────────────────
-# Extra CSS passed to gr.Blocks: container width, heading and link colours.
-css = """
-.gradio-container { max-width: 960px !important; }
-h1 { color: #fc7604 !important; font-weight: 700 !important; }
-h3 { color: #94a3b8 !important; font-weight: 400 !important; }
-a { color: #fd9a42 !important; }
-a:hover { color: #fec08a !important; }
-"""
-# Page layout, top to bottom: intro text, input column, two result panels,
-# agreement note, examples, footer. Statement order defines the layout.
-with gr.Blocks(
-    title="groundlens — Hallucination Detection Demo",
-    theme=theme,
-    css=css,
-) as demo:
-    # Intro: what SGI and DGI measure, plus project links.
-    gr.Markdown("""
-# groundlens
-### Geometric LLM hallucination detection — benchmarked against Vectara HHEM-2.1-Open
-**With context (RAG)** — SGI measures whether the response engaged with
-the source document. Computed as `dist(response, question) / dist(response, context)`.
-No model inference for evaluation — one embedding, one ratio.
-**Without context** — DGI measures whether the response displacement
-aligns with the mean displacement of verified grounded pairs.
-[GitHub](https://github.com/groundlens-dev/groundlens) ·
-[Documentation](https://docs.groundlens.dev) ·
-[PyPI](https://pypi.org/project/groundlens/) ·
-[SGI paper](https://arxiv.org/abs/2512.13771) ·
-[Taxonomy paper](https://arxiv.org/pdf/2602.13224v3) ·
-[Mechanistic paper](https://arxiv.org/abs/2603.13259)
-""")
-    # Inputs: question, optional context, response, and the submit button.
-    with gr.Row():
-        with gr.Column():
-            q_in = gr.Textbox(
-                label="Question",
-                placeholder="What does the policy cover for water damage?",
-                lines=2,
-            )
-            ctx_in = gr.Textbox(
-                label="Context  (optional — leave blank for DGI mode)",
-                placeholder="Paste source document or retrieved chunks here.",
-                lines=5,
-            )
-            r_in = gr.Textbox(
-                label="LLM Response",
-                placeholder="The model response to evaluate.",
-                lines=4,
-            )
-            run_btn = gr.Button("Evaluate", variant="primary")
-    # Outputs: one Markdown panel per method, side by side.
-    with gr.Row():
-        gl_out = gr.Markdown(label="groundlens")
-        hhem_out = gr.Markdown(label="HHEM-2.1-Open")
-    agreement_out = gr.Markdown(label="Agreement")
-    # Example rows populate the three inputs; no fn is attached here.
-    gr.Examples(
-        examples=EXAMPLES,
-        inputs=[q_in, ctx_in, r_in],
-        label="Examples",
-    )
-    gr.Markdown("""
----
-*groundlens is MIT-licensed. Built by [Javier Marin](https://jmarin.info).
-This demo uses the same `groundlens` library available via `pip install groundlens`.*
-""")
-    # Input order matches run_comparison(question, context, response);
-    # its three return values map to the three output components in order.
-    run_btn.click(
-        fn=run_comparison,
-        inputs=[q_in, ctx_in, r_in],
-        outputs=[gl_out, hhem_out, agreement_out],
-    )
-# Start the Gradio server only when this file is run as a script.
-if __name__ == "__main__":
-    demo.launch()