"""FactEval Gradio Demo – Interactive factuality checker.

Run locally:
    python demo/app.py

Run on Colab:
    Upload facteval/ folder, then run this file.
"""

import html
import json
from typing import Any, Dict, List, Tuple

import gradio as gr

from facteval import analyze, fast_check

# Each example is an [answer, contexts] pair fed to the two input textboxes.
EXAMPLES = [
    [
        "Patient presents with acute appendicitis. Given 500mg Amoxicillin. "
        "Discharge scheduled for tomorrow.",
        "Patient was diagnosed with acute appendicitis and underwent successful "
        "appendectomy. Post-operative care includes IV fluids and rest. "
        "No antibiotics were administered. "
        "Patient will remain under observation for 48 hours.",
    ],
    [
        "Tesla's Q3 revenue reached $25 billion, a 40% year-over-year increase. "
        "The company delivered 500,000 vehicles in the quarter.",
        "Tesla reported Q3 revenue of $23.35 billion, representing a 9% "
        "year-over-year increase. "
        "Vehicle deliveries for the quarter totaled 435,059.",
    ],
    [
        "To start a React project, run `npm init react-app my-app` in your "
        "terminal. This will install React v17 by default.",
        "To create a new React single-page application, the recommended command "
        "is `npx create-react-app my-app`. "
        "This installs the latest stable version of React, currently v18.",
    ],
]

# Lookup tables for claim rendering, built once instead of once per claim.
_LABEL_COLORS = {
    "supported": "#22c55e",
    "contradicted": "#ef4444",
    "unverifiable": "#f59e0b",
}
_LABEL_EMOJIS = {"supported": "✅", "contradicted": "❌", "unverifiable": "❓"}
_DEFAULT_LABEL_COLOR = "#94a3b8"

_DIAG_BADGE_COLORS = {
    "verified": "#22c55e",
    "hallucination": "#ef4444",
    "possible_hallucination": "#f97316",
    "no_evidence": "#6b7280",
    "retrieval_gap": "#8b5cf6",
    "inconclusive": "#f59e0b",
}
_DEFAULT_BADGE_COLOR = "#64748b"


def _render_claim(claim: Dict[str, Any]) -> str:
    """Return the HTML card for a single claim verdict.

    Every piece of text taken from the claim is HTML-escaped, because it is
    derived from user/LLM input and ends up inside a ``gr.HTML`` component.

    Raises:
        KeyError: if ``claim`` has no ``"label"`` or ``"claim"`` key, or has
            neither a usable ``"calibrated_confidence"`` nor ``"confidence"``.
    """
    label = claim["label"]
    color = _LABEL_COLORS.get(label, _DEFAULT_LABEL_COLOR)
    emoji = _LABEL_EMOJIS.get(label, "")

    # Prefer the calibrated confidence; only touch the raw one when the
    # calibrated value is missing or None.
    conf = claim.get("calibrated_confidence")
    if conf is None:
        conf = claim["confidence"]

    diag = claim.get("diagnostics") or {}
    diag_type = str(diag.get("failure_type") or "")
    badge_color = _DIAG_BADGE_COLORS.get(diag_type, _DEFAULT_BADGE_COLOR)

    claim_text = html.escape(str(claim["claim"]))
    reason = html.escape(str(claim.get("reason") or ""))
    retrieval = html.escape(str(diag.get("retrieval_quality", "n/a")))

    badge_html = ""
    if diag_type:
        diag_label = html.escape(diag_type.replace("_", " "))
        badge_html = (
            f'<span style="background: {badge_color}; color: #ffffff; '
            f'font-size: 11px; font-weight: 600; padding: 2px 8px; '
            f'border-radius: 999px; margin-left: 6px;">{diag_label}</span>'
        )

    suggestion = diag.get("suggestion") or ""
    suggestion_html = ""
    if suggestion:
        suggestion_html = (
            '<div style="margin-top: 6px; font-size: 13px; font-style: italic;">'
            f"💡 {html.escape(str(suggestion))}</div>"
        )

    # `is not None` rather than truthiness: an evidence score of 0.0 is a
    # real value and must still be displayed.
    evidence_score = claim.get("evidence_score")
    evidence_text = ""
    if evidence_score is not None:
        evidence_text = f"• Evidence score: {evidence_score:.3f}"

    return f"""
<div style="border-left: 4px solid {color}; padding: 10px 14px; margin-bottom: 10px; border-radius: 4px; background: rgba(148, 163, 184, 0.08);">
  <div style="font-weight: 600;">{emoji} {claim_text} {badge_html}</div>
  <div style="margin-top: 4px; font-size: 14px;">{reason}</div>
  {suggestion_html}
  <div style="margin-top: 6px; font-size: 12px; opacity: 0.75;">
    Confidence: {conf:.1%} {evidence_text} • Retrieval: {retrieval}
  </div>
</div>
"""


def _render_summary(result: Dict[str, Any]) -> str:
    """Return the HTML summary card (claim counts, hallucination rate, timing)."""
    summary = result["summary"]
    tiles = [
        (summary["total_claims"], "Total Claims", "#64748b"),
        (summary["supported"], "Supported", _LABEL_COLORS["supported"]),
        (summary["contradicted"], "Contradicted", _LABEL_COLORS["contradicted"]),
        (summary["unverifiable"], "Unverifiable", _LABEL_COLORS["unverifiable"]),
    ]
    tiles_html = "".join(
        '<div style="text-align: center; padding: 6px;">'
        f'<div style="font-size: 24px; font-weight: 700; color: {tile_color};">'
        f"{value}</div>"
        f'<div style="font-size: 12px; opacity: 0.75;">{caption}</div>'
        "</div>"
        for value, caption, tile_color in tiles
    )
    rate = summary["hallucination_rate"]
    elapsed = result["pipeline_time_seconds"]
    mode = "• 📐 calibrated" if result.get("calibrated") else "• raw scores"

    return f"""
<div style="padding: 16px; border: 1px solid #e2e8f0; border-radius: 8px;">
  <h3 style="margin: 0 0 12px 0;">📊 Summary</h3>
  <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 8px;">
    {tiles_html}
  </div>
  <div style="margin-top: 12px; text-align: center;">
    <div style="font-size: 12px; opacity: 0.75;">Hallucination Rate</div>
    <div style="font-size: 28px; font-weight: 700;">{rate:.0%}</div>
  </div>
  <div style="margin-top: 10px; text-align: center; font-size: 12px; opacity: 0.75;">
    ⏱ {elapsed:.1f}s {mode}
  </div>
</div>
"""


def run_check(
    answer: str, contexts: str, calibrator_path: str = ""
) -> Tuple[str, str, str, str]:
    """Run FactEval pipeline and format results for Gradio.

    Args:
        answer: The text to fact-check.
        contexts: Reference passages, one per line; blank lines are ignored.
        calibrator_path: Optional path forwarded to ``analyze`` as
            ``calibrator_path``; blank means no calibrator (``None``).

    Returns:
        A 4-tuple ``(highlighted_html, details_html, summary_html, raw_json)``
        matching the four output components. When the answer or the contexts
        are empty, the first element is a warning message and the other three
        are empty strings.
    """
    if not answer or not answer.strip():
        return "⚠️ Please enter an answer to check.", "", "", ""

    context_list: List[str] = [
        line.strip() for line in (contexts or "").strip().split("\n") if line.strip()
    ]
    if not context_list:
        return "⚠️ Please enter at least one context passage.", "", "", ""

    # NOTE(security): this path comes straight from the UI and is handed to
    # facteval. The placeholder suggests a pickle file; if facteval unpickles
    # it, only trusted users should be able to reach this field — confirm
    # before exposing the demo publicly.
    cal_path = (calibrator_path or "").strip() or None
    result = analyze(answer, context_list, calibrator_path=cal_path)

    # 1. Highlighted answer (the viral feature).
    # NOTE(review): "highlighted_answer" is presumably pre-rendered HTML from
    # facteval, so it is inserted as-is — confirm that facteval escapes the
    # answer text it wraps. The plain-text fallback is escaped here.
    highlighted = result.get("highlighted_answer", html.escape(answer))
    highlighted_html = f"""
<div style="padding: 16px; border: 1px solid #e2e8f0; border-radius: 8px; line-height: 1.8; font-size: 16px;">
{highlighted}
</div>
"""

    # 2. Per-claim verdicts with reasons.
    cards = "".join(_render_claim(claim) for claim in result["claims"])
    details_html = '<div style="max-height: 600px; overflow-y: auto;">' + cards + "</div>"

    # 3. Summary card.
    summary_html = _render_summary(result)

    # 4. Raw JSON.
    raw_json = json.dumps(result, indent=2, ensure_ascii=False)

    return highlighted_html, details_html, summary_html, raw_json


# ── Gradio Interface ─────────────────────────────────────────────────────────

with gr.Blocks(
    title="FactEval – Hallucination Detector",
    theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate"),
    css="""
    .gradio-container { max-width: 1400px !important; }
    footer { display: none !important; }
    """,
) as demo:
    # Built by concatenation so the heading and the paragraph stay on separate
    # Markdown lines regardless of source indentation.
    gr.Markdown(
        "# 🔍 FactEval – Find Exactly Which Parts Are Hallucinated\n"
        "\n"
        "Paste an LLM-generated answer and reference contexts. FactEval "
        "highlights ✅ **supported**, ❌ **contradicted**, and "
        "❓ **unverifiable** claims.\n"
    )

    with gr.Row():
        # LEFT COLUMN: Inputs & Examples
        with gr.Column(scale=1):
            answer_input = gr.Textbox(
                label="LLM Answer",
                placeholder="Enter the text to fact-check...",
                value=EXAMPLES[0][0],
                lines=4,
            )
            context_input = gr.Textbox(
                label="Reference Contexts (one per line)",
                placeholder="Enter ground truth passages, one per line...",
                value=EXAMPLES[0][1],
                lines=5,
            )
            calibrator_input = gr.Textbox(
                label="Calibrator Path (optional)",
                placeholder="Path to calibrator.pkl",
                lines=1,
            )
            check_btn = gr.Button("🔍 Check Factuality", variant="primary", size="lg")

            gr.Examples(
                examples=EXAMPLES,
                inputs=[answer_input, context_input],
                label="Try these examples",
            )

        # RIGHT COLUMN: Outputs
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Highlighted Answer")
            highlighted_output = gr.HTML()

            with gr.Row():
                with gr.Column(scale=2):
                    gr.Markdown("### 📋 Claim Details")
                    details_output = gr.HTML()
                with gr.Column(scale=1):
                    summary_output = gr.HTML()

            with gr.Accordion("Raw JSON Output", open=False):
                json_output = gr.Code(language="json")

    check_btn.click(
        fn=run_check,
        inputs=[answer_input, context_input, calibrator_input],
        outputs=[highlighted_output, details_output, summary_output, json_output],
    )

    # Run once on page load so the pre-filled first example shows results.
    demo.load(
        fn=run_check,
        inputs=[answer_input, context_input, calibrator_input],
        outputs=[highlighted_output, details_output, summary_output, json_output],
    )


if __name__ == "__main__":
    # NOTE(security): share=True requests a public share link for this app;
    # see the calibrator-path note in run_check before using it with
    # untrusted users.
    demo.launch(share=True)