AI-that-works committed
Commit 5e8dbda · verified · 1 Parent(s): c873959

Delete app.py

Files changed (1):
app.py +0 -543
app.py DELETED
"""
groundlens — Geometric LLM Hallucination Detection Demo

Plain-language interface: paste a question and the AI's answer,
optionally upload context (PDF, Excel, or plain text).
Compares groundlens (embedding geometry) vs Vectara HHEM-2.1-Open.

Models load once at module level to avoid cold-start on Space wake.
"""

import logging
import os
import time

import gradio as gr

from groundlens import compute_sgi, compute_dgi

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# ─────────────────────────────────────────────────────────────────────────────
# FILE EXTRACTION — PDF and Excel support
# ─────────────────────────────────────────────────────────────────────────────

def extract_pdf_text(file_path: str, max_chars: int = 8000) -> str:
    """Extract text from a PDF file."""
    try:
        import pdfplumber

        text_parts = []
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages[:20]:  # limit to 20 pages
                page_text = page.extract_text()
                if page_text:
                    text_parts.append(page_text)
        full_text = "\n\n".join(text_parts)
        return full_text[:max_chars]
    except Exception as e:
        return f"[Could not read PDF: {e}]"


def extract_excel_text(file_path: str, max_chars: int = 8000) -> str:
    """Extract text from an Excel file."""
    try:
        import openpyxl

        wb = openpyxl.load_workbook(file_path, data_only=True)
        text_parts = []
        for sheet_name in wb.sheetnames[:5]:  # limit to 5 sheets
            ws = wb[sheet_name]
            text_parts.append(f"--- {sheet_name} ---")
            for row in ws.iter_rows(max_row=200, values_only=True):
                cells = [str(c) if c is not None else "" for c in row]
                # keep only rows with at least one non-empty cell
                if any(cell.strip() for cell in cells):
                    text_parts.append(" | ".join(cells).strip())
        full_text = "\n".join(text_parts)
        return full_text[:max_chars]
    except Exception as e:
        return f"[Could not read Excel file: {e}]"


def process_uploaded_file(file) -> str:
    """Extract text from an uploaded file (PDF, Excel, CSV, TXT, or Markdown)."""
    if file is None:
        return ""

    file_path = file.name if hasattr(file, "name") else str(file)
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        return extract_pdf_text(file_path)
    elif ext in (".xlsx", ".xls"):
        return extract_excel_text(file_path)
    elif ext in (".txt", ".md", ".csv"):
        try:
            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
                return f.read(8000)
        except Exception as e:
            return f"[Could not read file: {e}]"
    else:
        return f"[Unsupported file type: {ext}. Use PDF, Excel, TXT, or CSV.]"


# ─────────────────────────────────────────────────────────────────────────────
# HHEM-2.1-Open — baseline comparison
# ─────────────────────────────────────────────────────────────────────────────

logger.info("Loading HHEM-2.1-Open...")
from transformers import AutoModelForSequenceClassification

_hhem = AutoModelForSequenceClassification.from_pretrained(
    "vectara/hallucination_evaluation_model",
    trust_remote_code=True,
)
logger.info("HHEM loaded.")

# Warm up groundlens embedding model
logger.info("Warming up groundlens...")
compute_dgi(question="warmup", response="warmup")
logger.info("groundlens ready.")


# ─────────────────────────────────────────────────────────────────────────────
# SCORING
# ─────────────────────────────────────────────────────────────────────────────

def score_groundlens(question: str, response: str, context: str) -> dict:
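    """Score with groundlens: SGI when context is provided, DGI otherwise.

    Returns a dict with the method name, raw score, grounded verdict,
    decision threshold, elapsed time in ms, and a plain-language note.
    """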
    start = time.perf_counter()
    has_context = bool(context.strip())

    if has_context:
        result = compute_sgi(
            question=question,
            context=context,
            response=response,
        )
        method = "SGI (with context)"
        raw_score = result.value
        grounded = not result.flagged
        threshold = 0.95
        mode_note = (
            "Measured how much the AI's answer used your source document "
            "vs. just rephrasing the question."
        )
    else:
        result = compute_dgi(
            question=question,
            response=response,
        )
        method = "DGI (without context)"
        raw_score = result.value
        grounded = not result.flagged
        threshold = 0.30
        mode_note = (
            "Measured whether the AI's answer follows patterns typical "
            "of grounded, factual responses."
        )

    elapsed_ms = (time.perf_counter() - start) * 1000

    return {
        "method": method,
        "raw_score": round(raw_score, 4),
        "grounded": grounded,
        "threshold": threshold,
        "elapsed_ms": round(elapsed_ms, 1),
        "mode_note": mode_note,
    }


def score_hhem(question: str, response: str, context: str) -> dict:
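    """Score with Vectara HHEM-2.1-Open.

    The premise is the context (if any) plus the question, truncated to
    1800 characters; scores >= 0.5 are treated as consistent.
    """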
    has_context = bool(context.strip())
    premise = (
        f"{context.strip()}\n\n{question}".strip()
        if has_context
        else question
    )
    if len(premise) > 1800:
        premise = premise[:1800]

    start = time.perf_counter()
    scores = _hhem.predict([(premise, response)])
    raw_score = float(scores[0])
    elapsed_ms = (time.perf_counter() - start) * 1000

    return {
        "method": "HHEM-2.1-Open",
        "raw_score": round(raw_score, 4),
        "grounded": raw_score >= 0.5,
        "elapsed_ms": round(elapsed_ms, 1),
        "label": "consistent" if raw_score >= 0.5 else "hallucinated",
    }
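
# Both scorers return plain dicts that run_comparison() renders into Markdown
# tables. Illustrative shape only (made-up values, not real model output):
#   {"method": "HHEM-2.1-Open", "raw_score": 0.9123, "grounded": True,
#    "elapsed_ms": 118.4, "label": "consistent"}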


# ─────────────────────────────────────────────────────────────────────────────
# MAIN COMPARISON
# ─────────────────────────────────────────────────────────────────────────────

def run_comparison(
    question: str, context_text: str, file_upload, response: str
) -> tuple[str, str, str]:
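    """Run both detectors and return three Markdown panels.

    Returns (groundlens panel, HHEM panel, agreement note); on invalid
    input the first element carries a warning and the others are empty.
    """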
    if not question.strip():
        return "⚠️ Enter the question you asked the AI.", "", ""
    if not response.strip():
        return "⚠️ Enter the AI's response.", "", ""

    # Merge context: typed text + uploaded file
    context_parts = []
    if context_text and context_text.strip():
        context_parts.append(context_text.strip())
    if file_upload is not None:
        extracted = process_uploaded_file(file_upload)
        if extracted.startswith("["):
            return f"⚠️ {extracted}", "", ""
        if extracted:
            context_parts.append(extracted)
    context = "\n\n".join(context_parts)

    gl = score_groundlens(question, response, context)
    hhem = score_hhem(question, response, context)

    # groundlens result
    if gl["grounded"]:
        gl_verdict = "🟢 Looks grounded"
        gl_explain = "The AI's answer appears to be based on real information."
    else:
        gl_verdict = "🔴 Possible hallucination"
        gl_explain = (
            "The AI's answer shows signs of being fabricated "
            "or not grounded in the source."
        )

    gl_md = f"""### groundlens

**{gl_verdict}**

{gl_explain}

| | |
|---|---|
| **Method** | {gl["method"]} |
| **Score** | {gl["raw_score"]} (threshold: {gl["threshold"]}) |
| **Time** | {gl["elapsed_ms"]} ms |

*{gl["mode_note"]}*"""

    # HHEM result
    if hhem["grounded"]:
        hhem_verdict = "🟢 Looks consistent"
        hhem_explain = "The classifier considers this answer consistent with the input."
    else:
        hhem_verdict = "🔴 Possible hallucination"
        hhem_explain = "The classifier flagged this answer as potentially hallucinated."

    hhem_md = f"""### Vectara HHEM-2.1-Open

**{hhem_verdict}**

{hhem_explain}

| | |
|---|---|
| **Method** | {hhem["method"]} |
| **Score** | {hhem["raw_score"]} ({hhem["label"]}) |
| **Time** | {hhem["elapsed_ms"]} ms |

*Fine-tuned Flan-T5 classifier.*"""

    # Agreement
    agree = gl["grounded"] == hhem["grounded"]
    if agree and gl["grounded"]:
        agreement_md = "### 🔵 Both methods agree: the answer looks reliable."
    elif agree and not gl["grounded"]:
        agreement_md = "### 🔴 Both methods agree: this answer is likely hallucinated."
    else:
        agreement_md = """### 🟠 The two methods disagree.

This often happens with **subtle factual errors** — the answer sounds right and
uses the correct vocabulary, but gets specific facts wrong. Embedding geometry
(groundlens) measures the shape of the answer, while the classifier (HHEM)
judges its content. When they disagree, it's worth checking the facts manually.

[Learn more about hallucination types →](https://docs.groundlens.dev/theory/hallucination-taxonomy/)"""

    return gl_md, hhem_md, agreement_md


# ─────────────────────────────────────────────────────────────────────────────
# EXAMPLES
# ─────────────────────────────────────────────────────────────────────────────

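# Each row is [question, context, response], the column order expected by
# gr.Examples(inputs=[q_in, ctx_in, r_in]) below. Rows 1–2 include context
# (faithful vs. contradicted summary); rows 3–4 have none (fact vs. fabrication).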
EXAMPLES = [
    [
        "What does the water damage policy cover?",
        "Coverage includes burst pipes and sudden appliance failure up to "
        "$50,000. Flood damage requires a separate NFIP policy. "
        "Deductible is $1,500 per occurrence.",
        "The policy covers burst pipes and sudden appliance failure up to "
        "$50,000 per occurrence, with a $1,500 deductible.",
    ],
    [
        "What does the water damage policy cover?",
        "Coverage includes burst pipes and sudden appliance failure up to "
        "$50,000. Flood damage requires a separate NFIP policy. "
        "Deductible is $1,500 per occurrence.",
        "The policy covers all water damage including floods "
        "with no deductible required.",
    ],
    [
        "What causes seasons on Earth?",
        "",
        "Seasons are caused by Earth's 23.5-degree axial tilt, which "
        "changes how directly sunlight hits each hemisphere.",
    ],
    [
        "What causes seasons on Earth?",
        "",
        "Seasons are regulated by the Atmospheric Regulation Committee, "
        "a UN body established in 1952 that adjusts global temperature "
        "through orbital satellites.",
    ],
]


# ─────────────────────────────────────────────────────────────────────────────
# THEME — dark, matching groundlens.dev
# ─────────────────────────────────────────────────────────────────────────────

theme = gr.themes.Base(
    primary_hue=gr.themes.Color(
        c50="#fff7ed",
        c100="#ffedd5",
        c200="#fed7aa",
        c300="#fdba74",
        c400="#fb923c",
        c500="#fc7604",
        c600="#ea580c",
        c700="#c2410c",
        c800="#9a3412",
        c900="#7c2d12",
        c950="#431407",
    ),
    secondary_hue="slate",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    font_mono=gr.themes.GoogleFont("JetBrains Mono"),
    text_size=gr.themes.sizes.text_lg,
    radius_size=gr.themes.sizes.radius_md,
).set(
    body_background_fill="#0a0a0a",
    body_background_fill_dark="#0a0a0a",
    body_text_color="#e2e8f0",
    body_text_color_dark="#e2e8f0",
    body_text_size="1rem",
    block_background_fill="#141414",
    block_background_fill_dark="#141414",
    block_border_color="#1e293b",
    block_border_color_dark="#1e293b",
    block_label_text_color="#94a3b8",
    block_label_text_color_dark="#94a3b8",
    block_label_text_size="0.95rem",
    block_title_text_color="#e2e8f0",
    block_title_text_color_dark="#e2e8f0",
    input_background_fill="#1e1e1e",
    input_background_fill_dark="#1e1e1e",
    input_border_color="#334155",
    input_border_color_dark="#334155",
    input_text_size="1rem",
    input_placeholder_color="#64748b",
    input_placeholder_color_dark="#64748b",
    button_primary_background_fill="#fc7604",
    button_primary_background_fill_dark="#fc7604",
    button_primary_background_fill_hover="#fb923c",
    button_primary_background_fill_hover_dark="#fb923c",
    button_primary_text_color="#0a0a0a",
    button_primary_text_color_dark="#0a0a0a",
    button_large_text_size="1.1rem",
    border_color_primary="#fc7604",
    border_color_primary_dark="#fc7604",
)


# ─────────────────────────────────────────────────────────────────────────────
# INTERFACE
# ─────────────────────────────────────────────────────────────────────────────

css = """
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
    padding: 1.5rem !important;
}
h1 { color: #fc7604 !important; font-size: 2.2rem !important; font-weight: 700 !important; margin-bottom: 0.2rem !important; }
h3 { font-size: 1.15rem !important; }
.subtitle { color: #94a3b8 !important; font-size: 1.1rem !important; margin-top: 0 !important; }
a { color: #fd9a42 !important; }
a:hover { color: #fec08a !important; }
.step-label { color: #fc7604; font-weight: 600; font-size: 1.05rem; }
.links-bar { font-size: 0.9rem; color: #64748b; margin-top: 0.5rem; }
.links-bar a { color: #64748b !important; }
.links-bar a:hover { color: #fd9a42 !important; }
footer { display: none !important; }

/* Unified context box */
.context-box {
    border: 1px solid #334155 !important;
    border-radius: 8px !important;
    padding: 1rem !important;
    background: #141414 !important;
}
.context-box .block {
    border: none !important;
    background: transparent !important;
    padding: 0 !important;
    box-shadow: none !important;
}
.context-box .wrap {
    gap: 0.75rem !important;
}
.context-box textarea {
    background: #1e1e1e !important;
    border: 1px solid #334155 !important;
    border-radius: 6px !important;
}
.context-divider {
    text-align: center;
    color: #64748b !important;
    font-size: 0.85rem !important;
    margin: 0.25rem 0 !important;
    padding: 0 !important;
}
.context-divider p { margin: 0 !important; }
.context-box .file-upload,
.context-box .upload-button {
    border: 1px dashed #475569 !important;
    border-radius: 6px !important;
    background: #1a1a1a !important;
}
.context-box .file-preview {
    border: none !important;
}

@media (max-width: 768px) {
    .gradio-container { padding: 0.75rem !important; }
    h1 { font-size: 1.6rem !important; }
}
"""


with gr.Blocks(
    title="groundlens — Check if your AI is hallucinating",
    theme=theme,
    css=css,
) as demo:

    gr.Markdown("""
# groundlens

<p class="subtitle">Check if an AI gave you a real answer or made something up.</p>
""")

    gr.Markdown("""
You asked an AI a question and got an answer. Was it real or hallucinated?
Paste both below and we'll check using two independent methods: **groundlens**
(geometric analysis) and **Vectara HHEM** (neural classifier).
""")

    gr.Markdown("""<p class="links-bar">
<a href="https://github.com/groundlens-dev/groundlens">GitHub</a> ·
<a href="https://docs.groundlens.dev">Docs</a> ·
<a href="https://pypi.org/project/groundlens/">PyPI</a> ·
<a href="https://arxiv.org/abs/2512.13771">SGI paper</a> ·
<a href="https://arxiv.org/pdf/2602.13224v3">Taxonomy</a> ·
<a href="https://arxiv.org/abs/2603.13259">Mechanistic paper</a>
</p>""")

    # ── Step 1: Question ──
    gr.Markdown('<p class="step-label">1. What did you ask the AI?</p>')
    q_in = gr.Textbox(
        show_label=False,
        placeholder="e.g. What does our insurance policy cover for water damage?",
        lines=2,
    )

    # ── Step 2: Context ──
    gr.Markdown(
        '<p class="step-label">2. Did you give the AI any source material? (optional)</p>'
    )
    gr.Markdown(
        "If you gave the AI a document, a webpage, an Excel file, or any reference "
        "material to base its answer on, paste the text here or upload the file. "
        "If you just asked a question with no source, skip this step.",
    )

    with gr.Group(elem_classes=["context-box"]):
        ctx_in = gr.Textbox(
            show_label=False,
            placeholder="Paste source text here...",
            lines=4,
            container=False,
        )
        gr.Markdown("— or —", elem_classes=["context-divider"])
        file_in = gr.File(
            label="Upload a file (PDF, Excel, CSV, TXT — max 20 pages / 200 rows)",
            file_types=[".pdf", ".xlsx", ".xls", ".csv", ".txt"],
            file_count="single",
            height=60,
        )

    # ── Step 3: Response ──
    gr.Markdown('<p class="step-label">3. What did the AI answer?</p>')
    r_in = gr.Textbox(
        show_label=False,
        placeholder="Paste the AI's response here...",
        lines=4,
    )

    # ── Evaluate button ──
    run_btn = gr.Button(
        "Check for hallucination",
        variant="primary",
        size="lg",
    )

    # ── Results ──
    with gr.Row(equal_height=True):
        gl_out = gr.Markdown()
        hhem_out = gr.Markdown()

    agreement_out = gr.Markdown()

    # ── Examples ──
    gr.Markdown("---")
    gr.Markdown("### Try an example")

    gr.Examples(
        examples=EXAMPLES,
        inputs=[q_in, ctx_in, r_in],
        label="",
    )

    # ── Footer ──
    gr.Markdown("""
---

<p style="color:#475569; font-size:0.85rem; text-align:center;">
<strong>groundlens</strong> is open source (MIT). Built by
<a href="https://jmarin.info" style="color:#64748b !important;">Javier Marin</a>.
This demo runs the same library available via <code>pip install groundlens</code>.<br>
groundlens is verification triage, not a truth oracle. It tells you which answers
deserve trust and which need a closer look.
</p>
""")

    # ── Event binding ──
    run_btn.click(
        fn=run_comparison,
        inputs=[q_in, ctx_in, file_in, r_in],
        outputs=[gl_out, hhem_out, agreement_out],
    )


if __name__ == "__main__":
    demo.launch()
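
# Run locally with `python app.py`; Gradio serves the UI on
# http://127.0.0.1:7860 by default.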