AI-that-works committed on
Commit
acd77b0
·
verified ·
1 Parent(s): bbdf3b3

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -362
app.py DELETED
@@ -1,362 +0,0 @@
1
- """
2
- groundlens — Geometric LLM Hallucination Detection
3
-
4
- Live demo comparing groundlens (embedding geometry) against
5
- Vectara HHEM-2.1-Open (fine-tuned flan-T5 classifier).
6
-
7
- Uses the groundlens library directly — same code as `pip install groundlens`.
8
-
9
- Architecture: flat, sequential, no classes. Models load once at module level
10
- to eliminate cold-start timeout when the Space wakes from sleep.
11
- """
12
-
13
- import logging
14
- import time
15
-
16
- import gradio as gr
17
- from groundlens import compute_sgi, compute_dgi
18
-
19
- logging.basicConfig(level=logging.INFO)
20
- logger = logging.getLogger(__name__)
21
-
22
-
23
- # ─────────────────────────────────────────────────────────────────────────────
24
- # HHEM-2.1-Open — baseline comparison
25
- # Uses AutoModelForSequenceClassification with custom .predict().
26
- # Input: List[Tuple[str, str]] — model handles flan-T5 template internally.
27
- # Output: float per pair, 0.0 = hallucinated, 1.0 = consistent.
28
- # ─────────────────────────────────────────────────────────────────────────────
29
-
30
# The baseline classifier is loaded once, at import time, so that individual
# requests never pay the model-loading cost.
_HHEM_MODEL_ID = "vectara/hallucination_evaluation_model"

logger.info("Loading HHEM-2.1-Open (%s)...", _HHEM_MODEL_ID)
from transformers import AutoModelForSequenceClassification  # noqa: E402

# NOTE(review): trust_remote_code=True runs Python code shipped in the model
# repository — consider pinning `revision=` to an audited commit.
_hhem = AutoModelForSequenceClassification.from_pretrained(
    _HHEM_MODEL_ID,
    trust_remote_code=True,
)
logger.info("HHEM loaded.")
38
-
39
-
40
- # ─────────────────────────────────────────────────────────────────────────────
41
- # SCORING — groundlens (SGI / DGI)
42
- # ─────────────────────────────────────────────────────────────────────────────
43
-
44
def score_groundlens(question: str, response: str, context: str) -> dict:
    """Score a response with groundlens and package the result for display.

    A non-blank ``context`` selects SGI (``compute_sgi``); a blank or
    whitespace-only ``context`` selects DGI (``compute_dgi``).

    Args:
        question: The user question.
        response: The model response being evaluated.
        context: Source text; may be empty.

    Returns:
        A dict with the keys ``method``, ``raw_score`` (rounded to 4
        decimals), ``grounded`` (the negation of the result's ``flagged``
        attribute), ``threshold``, ``elapsed_ms`` (rounded to 0.1 ms),
        ``explanation``, ``detail`` and ``mode_note``.
    """
    started = time.perf_counter()

    if context.strip():
        result = compute_sgi(
            question=question,
            context=context,
            response=response,
        )
        method, threshold = "SGI", 0.95
        detail = "\n".join(
            (
                f"dist(response, question) = {result.q_dist:.4f}",
                f"dist(response, context) = {result.ctx_dist:.4f}",
            )
        )
        mode_note = (
            "*One embedding model, one geometric ratio. "
            "No model inference for evaluation.*"
        )
    else:
        result = compute_dgi(question=question, response=response)
        method, threshold = "DGI", 0.30
        detail = ""
        mode_note = (
            "*Measuring displacement alignment against "
            "grounded reference direction.*"
        )

    elapsed_ms = (time.perf_counter() - started) * 1000

    # `threshold` is only reported alongside the score; the verdict itself is
    # taken from the library's own `flagged` attribute.
    return {
        "method": method,
        "raw_score": round(result.value, 4),
        "grounded": not result.flagged,
        "threshold": threshold,
        "elapsed_ms": round(elapsed_ms, 1),
        "explanation": result.explanation,
        "detail": detail,
        "mode_note": mode_note,
    }
93
-
94
-
95
- # ─────────────────────────────────────────────────────────────────────────────
96
- # SCORING — HHEM-2.1-Open (baseline)
97
- # ─────────────────────────────────────────────────────────────────────────────
98
-
99
# Character budget for the HHEM premise; a rough stand-in for a token limit.
# NOTE(review): the original comment cites a ~512-token T5 limit — confirm
# against the current HHEM model card whether this cap is still required.
_HHEM_MAX_PREMISE_CHARS = 1800

# Scores at or above this value are reported as "consistent".
_HHEM_CONSISTENT_THRESHOLD = 0.5


def _build_hhem_premise(question: str, context: str, max_chars: int) -> str:
    """Join context and question into a premise of at most ``max_chars`` chars."""
    context = context.strip()
    if not context:
        return question[:max_chars]

    question_part = f"\n\n{question}"
    premise = f"{context}{question_part}".strip()
    if len(premise) <= max_chars:
        return premise

    context_budget = max_chars - len(question_part)
    if context_budget <= 0:
        # The question alone fills the budget; fall back to a plain head cut.
        return premise[:max_chars]

    # Trim the context rather than the tail: the question sits at the end of
    # the premise, so a head cut would silently drop it for long contexts.
    return f"{context[:context_budget]}{question_part}".strip()


def score_hhem(question: str, response: str, context: str) -> dict:
    """Score a response with the HHEM baseline classifier.

    The premise passed to the model is the stripped context followed by a
    blank line and the question, or the question alone when the context is
    blank. Over-long premises are shortened by trimming the context so the
    question is preserved.

    Args:
        question: The user question.
        response: The model response being evaluated (the hypothesis).
        context: Source text; may be empty.

    Returns:
        A dict with the keys ``method``, ``raw_score`` (rounded to 4
        decimals), ``grounded`` (``raw_score >= 0.5``), ``elapsed_ms``
        (prediction time only, rounded to 0.1 ms) and ``label``
        (``"consistent"`` or ``"hallucinated"``).
    """
    premise = _build_hhem_premise(question, context, _HHEM_MAX_PREMISE_CHARS)

    # Only the model call is timed; premise construction is excluded.
    start = time.perf_counter()
    scores = _hhem.predict([(premise, response)])
    raw_score = float(scores[0])
    elapsed_ms = (time.perf_counter() - start) * 1000

    consistent = raw_score >= _HHEM_CONSISTENT_THRESHOLD
    return {
        "method": "HHEM-2.1-Open",
        "raw_score": round(raw_score, 4),
        "grounded": consistent,
        "elapsed_ms": round(elapsed_ms, 1),
        "label": "consistent" if consistent else "hallucinated",
    }
123
-
124
-
125
- # ─────────────────────────────────────────────────────────────────────────────
126
- # COMPARISON — called by Gradio on every submission
127
- # ─────────────────────────────────────────────────────────────────────────────
128
-
129
# Verdict labels shared by both result cards so the two columns always show
# exactly the same wording.
_GROUNDED_VERDICT = "🟢 Not hallucinated"
_HALLUCINATED_VERDICT = "🔴 Hallucinated"

_AGREE_NOTE = "🔵 **Both methods agree.**"
_DISAGREE_NOTE = """🟠 **Methods disagree.**

groundlens uses geometric displacement in embedding space.
HHEM uses a learned classifier (fine-tuned flan-T5).
Disagreement often surfaces **Type III hallucinations** — factual errors
within the correct semantic frame. Embedding geometry cannot detect
these: the response occupies the right region of the space but gets
the facts wrong. See the
[hallucination taxonomy](https://docs.groundlens.dev/theory/hallucination-taxonomy/)
for details."""


def _render_card(grounded: bool, rows: list[tuple[str, str]], footnote: str) -> str:
    """Render one result card: bold verdict, two-column table, footnote."""
    verdict = _GROUNDED_VERDICT if grounded else _HALLUCINATED_VERDICT
    table = "\n".join(f"| {label} | `{value}` |" for label, value in rows)
    return f"**{verdict}**\n\n| | |\n|---|---|\n{table}\n\n{footnote}"


def run_comparison(
    question: str, context: str, response: str
) -> tuple[str, str, str]:
    """Score one question/response pair with both detectors.

    Args:
        question: The user question; required.
        context: Optional source text; blank selects groundlens' DGI mode.
        response: The model response to evaluate; required.

    Returns:
        Three Markdown strings: the groundlens card, the HHEM card and the
        agreement note. When the question or response is blank, the first
        element carries a prompt for the missing field and the other two
        are empty strings.
    """
    # A cleared UI field may arrive as None; treat it like an empty string so
    # the `.strip()` calls here and in the scorers cannot fail.
    question = question or ""
    context = context or ""
    response = response or ""

    if not question.strip():
        return "Provide a question.", "", ""
    if not response.strip():
        return "Provide a response to evaluate.", "", ""

    gl = score_groundlens(question, response, context)
    hhem = score_hhem(question, response, context)

    gl_md = _render_card(
        gl["grounded"],
        [
            ("Method", gl["method"]),
            ("Score", gl["raw_score"]),
            ("Threshold", gl["threshold"]),
            ("Latency", f'{gl["elapsed_ms"]} ms'),
        ],
        gl["mode_note"],
    )
    hhem_md = _render_card(
        hhem["grounded"],
        [
            ("Method", hhem["method"]),
            ("Score", hhem["raw_score"]),
            ("Label", hhem["label"]),
            ("Latency", f'{hhem["elapsed_ms"]} ms'),
        ],
        "*flan-T5 classifier. Full model inference per call.*",
    )

    verdicts_match = gl["grounded"] == hhem["grounded"]
    agreement_md = _AGREE_NOTE if verdicts_match else _DISAGREE_NOTE

    return gl_md, hhem_md, agreement_md
190
-
191
-
192
- # ─────────────────────────────────────────────────────────────────────────────
193
- # EXAMPLES
194
- # ─────────────────────────────────────────────────────────────────────────────
195
-
196
# Each example row is [question, context, response], matching the order of
# the input components the examples are bound to. An empty context selects
# the no-context (DGI) path.
_POLICY_QUESTION = "What does the water damage policy cover?"
_POLICY_CONTEXT = (
    "Coverage includes burst pipes and sudden appliance failure up to $50,000. "
    "Flood damage requires a separate NFIP policy. "
    "Deductible is $1,500 per occurrence."
)
_SEASONS_QUESTION = "What causes seasons on Earth?"

EXAMPLES = [
    # With context, response consistent with the source.
    [
        _POLICY_QUESTION,
        _POLICY_CONTEXT,
        "The policy covers burst pipes and sudden appliance failure up to $50,000 "
        "per occurrence, with a $1,500 deductible.",
    ],
    # With context, response contradicting the source.
    [
        _POLICY_QUESTION,
        _POLICY_CONTEXT,
        "The policy covers all water damage including floods with no deductible "
        "required.",
    ],
    # No context, plausible answer.
    [
        _SEASONS_QUESTION,
        "",
        "Seasons are caused by Earth's 23.5-degree axial tilt, which changes how "
        "directly sunlight hits each hemisphere.",
    ],
    # No context, fabricated answer.
    [
        _SEASONS_QUESTION,
        "",
        "Seasons are regulated by the Atmospheric Regulation Committee, a UN body "
        "established in 1952 that adjusts global temperature through orbital "
        "satellites.",
    ],
]
227
-
228
-
229
- # ─────────────────────────────────────────────────────────────────────────────
230
- # CUSTOM THEME — dark, matching groundlens.dev
231
- # ─────────────────────────────────────────────────────────────────────────────
232
-
233
# Orange ramp used as the theme's primary hue.
_brand_orange = gr.themes.Color(
    c50="#fff7ed",
    c100="#ffedd5",
    c200="#fed7aa",
    c300="#fdba74",
    c400="#fb923c",
    c500="#fc7604",  # groundlens orange
    c600="#ea580c",
    c700="#c2410c",
    c800="#9a3412",
    c900="#7c2d12",
    c950="#431407",
)

# The UI looks the same in light and dark mode, so each colour is listed once
# and applied to both the plain setting and its `_dark` counterpart below.
_surface_colors = {
    "body_background_fill": "#0a0a0a",
    "body_text_color": "#e2e8f0",
    "block_background_fill": "#141414",
    "block_border_color": "#1e293b",
    "block_label_text_color": "#94a3b8",
    "block_title_text_color": "#e2e8f0",
    "input_background_fill": "#1e1e1e",
    "input_border_color": "#334155",
    "input_placeholder_color": "#64748b",
    "button_primary_background_fill": "#fc7604",
    "button_primary_background_fill_hover": "#fb923c",
    "button_primary_text_color": "#0a0a0a",
    "border_color_primary": "#fc7604",
}

theme = gr.themes.Base(
    primary_hue=_brand_orange,
    secondary_hue="slate",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    font_mono=gr.themes.GoogleFont("JetBrains Mono"),
).set(
    **{
        setting: color
        for name, color in _surface_colors.items()
        for setting in (name, f"{name}_dark")
    }
)
279
-
280
-
281
- # ─────────────────────────────────────────────────────────────────────────────
282
- # INTERFACE
283
- # ─────────────────────────────────────────────────────────────────────────────
284
-
285
# Extra CSS applied on top of the theme: page width, heading colours, links.
_CSS_RULES = (
    ".gradio-container { max-width: 960px !important; }",
    "h1 { color: #fc7604 !important; font-weight: 700 !important; }",
    "h3 { color: #94a3b8 !important; font-weight: 400 !important; }",
    "a { color: #fd9a42 !important; }",
    "a:hover { color: #fec08a !important; }",
)
css = "\n" + "\n".join(_CSS_RULES) + "\n"
292
-
293
# Page copy is kept at module level so the layout code below stays short.
_HEADER_MD = """
# groundlens
### Geometric LLM hallucination detection — benchmarked against Vectara HHEM-2.1-Open

**With context (RAG)** — SGI measures whether the response engaged with
the source document. Computed as `dist(response, question) / dist(response, context)`.
No model inference for evaluation — one embedding, one ratio.

**Without context** — DGI measures whether the response displacement
aligns with the mean displacement of verified grounded pairs.

[GitHub](https://github.com/groundlens-dev/groundlens) ·
[Documentation](https://docs.groundlens.dev) ·
[PyPI](https://pypi.org/project/groundlens/) ·
[SGI paper](https://arxiv.org/abs/2512.13771) ·
[Taxonomy paper](https://arxiv.org/pdf/2602.13224v3) ·
[Mechanistic paper](https://arxiv.org/abs/2603.13259)
"""

_FOOTER_MD = """
---
*groundlens is MIT-licensed. Built by [Javier Marin](https://jmarin.info).
This demo uses the same `groundlens` library available via `pip install groundlens`.*
"""

# NOTE(review): the component nesting below was reconstructed from a diff view
# that had lost its indentation — confirm the Row/Column layout against the
# original file before relying on it.
with gr.Blocks(
    title="groundlens — Hallucination Detection Demo",
    theme=theme,
    css=css,
) as demo:

    gr.Markdown(_HEADER_MD)

    # Inputs: question, optional context, response, and the submit button.
    with gr.Row():
        with gr.Column():
            q_in = gr.Textbox(
                label="Question",
                placeholder="What does the policy cover for water damage?",
                lines=2,
            )
            ctx_in = gr.Textbox(
                label="Context (optional — leave blank for DGI mode)",
                placeholder="Paste source document or retrieved chunks here.",
                lines=5,
            )
            r_in = gr.Textbox(
                label="LLM Response",
                placeholder="The model response to evaluate.",
                lines=4,
            )
            run_btn = gr.Button("Evaluate", variant="primary")

    # Outputs: one card per method, then the agreement note.
    with gr.Row():
        gl_out = gr.Markdown(label="groundlens")
        hhem_out = gr.Markdown(label="HHEM-2.1-Open")

    agreement_out = gr.Markdown(label="Agreement")

    gr.Examples(
        examples=EXAMPLES,
        inputs=[q_in, ctx_in, r_in],
        label="Examples",
    )

    gr.Markdown(_FOOTER_MD)

    # Input order matches run_comparison(question, context, response).
    run_btn.click(
        fn=run_comparison,
        inputs=[q_in, ctx_in, r_in],
        outputs=[gl_out, hhem_out, agreement_out],
    )


if __name__ == "__main__":
    demo.launch()