Sahil al farib committed
Commit 8fb73f8 · 1 Parent(s): b29689f

Deploy FactEval: claim-level hallucination detection with Gradio demo

README.md CHANGED
@@ -1,15 +1,29 @@
1
  ---
2
  title: FactEval
3
- emoji: 🏆
4
- colorFrom: indigo
5
- colorTo: blue
6
  sdk: gradio
7
- sdk_version: 6.14.0
8
- python_version: '3.13'
9
  app_file: app.py
10
  pinned: false
11
  license: mit
12
  short_description: Find exactly which parts of your LLM output are hallucinated
13
  ---
14
 
15
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
  title: FactEval
3
+ emoji: 🔍
4
+ colorFrom: blue
5
+ colorTo: red
6
  sdk: gradio
7
+ sdk_version: 4.44.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  short_description: Find exactly which parts of your LLM output are hallucinated
12
  ---
13
 
14
+ # 🔍 FactEval
15
+
16
+ **Find exactly which parts of your LLM output are hallucinated.**
17
+
18
+ Debug hallucinations in RAG and LLM pipelines with claim-level verification.
19
+
20
+ Paste an LLM-generated answer and reference contexts. FactEval highlights ✅ **supported**, ❌ **contradicted**, and ❓ **unverifiable** claims with human-readable reasons and pipeline diagnostics.
21
+
22
+ ## How it works
23
+
24
+ 1. **Claim Extraction** — Breaks the answer into atomic claims (Qwen2.5-1.5B)
25
+ 2. **Evidence Retrieval** — Finds the most relevant sentences from your contexts (MiniLM + FAISS)
26
+ 3. **NLI Verification** — Checks each claim against evidence (DeBERTa-v3)
27
+ 4. **Calibration** — Produces trustworthy confidence scores (Isotonic Regression)
28
+
29
+ 📦 [GitHub Repository](https://github.com/sahilaf/FactEval)
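
For quick reference, a minimal usage sketch of the Python API this commit adds (the example text is borrowed from the demo's built-in examples; result keys follow `facteval/core.py` below, and actual verdicts depend on the models):

```python
# Minimal sketch of the check() API added in facteval/core.py in this commit.
from facteval import check

result = check(
    answer="Python was created by Guido van Rossum and first released in 2005.",
    contexts=["Python was created by Guido van Rossum and first released in 1991."],
)

# Each claim dict carries a label ("supported" / "contradicted" / "unverifiable"),
# a confidence, the best evidence sentence, and pipeline diagnostics.
for claim in result["claims"]:
    print(claim["label"], "-", claim["claim"])

# Summary counts plus the contradicted-claim fraction reported as hallucination_rate.
print(result["summary"])
```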
app.py ADDED
@@ -0,0 +1,5 @@
1
+ """HF Spaces entry point — imports the Gradio demo from demo/app.py."""
2
+ from demo.app import demo
3
+
4
+ if __name__ == "__main__":
5
+ demo.launch()
demo/app.py ADDED
@@ -0,0 +1,202 @@
1
+ """
2
+ FactEval Gradio Demo – Interactive factuality checker.
3
+
4
+ Run locally: python demo/app.py
5
+ Run on Colab: Upload the facteval/ folder, then run this file.
6
+ """
7
+
8
+ import json
9
+ import gradio as gr
10
+ from facteval import check, verify
11
+
12
+ EXAMPLES = [
13
+ [
14
+ "Paris is the capital of Germany and has 5 million people.",
15
+ "Paris is the capital of France. Paris has approximately 2.2 million inhabitants.\nGermany's capital is Berlin.",
16
+ ],
17
+ [
18
+ "Python was created by Guido van Rossum and first released in 2005.",
19
+ "Python was created by Guido van Rossum and first released in 1991.",
20
+ ],
21
+ [
22
+ "The Amazon rainforest produces 20% of the world's oxygen and spans across nine countries.",
23
+ "The Amazon rainforest produces about 6% of the world's oxygen.\nThe Amazon rainforest spans across nine countries in South America.",
24
+ ],
25
+ [
26
+ "Albert Einstein developed the theory of relativity and won the Nobel Prize in Physics in 1921 for his work on the photoelectric effect.",
27
+ "Albert Einstein developed the theory of relativity. He won the Nobel Prize in Physics in 1921 for his explanation of the photoelectric effect.",
28
+ ],
29
+ ]
30
+
31
+
32
+ def run_check(answer: str, contexts: str, calibrator_path: str = ""):
33
+ """Run FactEval pipeline and format results for Gradio."""
34
+ if not answer.strip():
35
+ return "⚠️ Please enter an answer to check.", "", "", ""
36
+
37
+ context_list = [c.strip() for c in contexts.strip().split("\n") if c.strip()]
38
+ if not context_list:
39
+ return "⚠️ Please enter at least one context passage.", "", "", ""
40
+
41
+ cal_path = calibrator_path.strip() if calibrator_path.strip() else None
42
+ result = check(answer, context_list, calibrator_path=cal_path)
43
+
44
+ # 1. Highlighted answer (the viral feature)
45
+ highlighted_html = f"""
46
+ <div style="font-family: Inter, sans-serif; font-size: 18px; line-height: 2;
47
+ padding: 20px; border-radius: 12px; background: #0f172a; color: #e2e8f0;">
48
+ {result.get("highlighted_answer", answer)}
49
+ </div>
50
+ """
51
+
52
+ # 2. Per-claim verdicts with reasons
53
+ details_parts = []
54
+ for c in result["claims"]:
55
+ label = c["label"]
56
+ colors = {"supported": "#22c55e", "contradicted": "#ef4444", "unverifiable": "#f59e0b"}
57
+ emojis = {"supported": "✅", "contradicted": "❌", "unverifiable": "❓"}
58
+ color = colors.get(label, "#94a3b8")
59
+ emoji = emojis.get(label, "")
60
+ conf = c.get("calibrated_confidence", c["confidence"])
61
+
62
+ diag = c.get("diagnostics", {})
63
+ diag_type = diag.get("failure_type", "")
64
+ diag_badge_colors = {
65
+ "verified": "#22c55e", "hallucination": "#ef4444", "possible_hallucination": "#f97316",
66
+ "no_evidence": "#6b7280", "retrieval_gap": "#8b5cf6", "inconclusive": "#f59e0b",
67
+ }
68
+ badge_color = diag_badge_colors.get(diag_type, "#64748b")
69
+ suggestion = diag.get("suggestion", "")
70
+
71
+ details_parts.append(f"""
72
+ <div style="padding: 12px; margin: 8px 0; border-left: 4px solid {color};
73
+ background: {color}10; border-radius: 0 8px 8px 0; font-family: Inter, sans-serif;">
74
+ <div style="font-weight: 600; font-size: 15px; color: #f1f5f9;">
75
+ {emoji} {c["claim"]}
76
+ <span style="font-size: 11px; padding: 2px 8px; border-radius: 12px;
77
+ background: {badge_color}30; color: {badge_color}; margin-left: 8px;">
78
+ {diag_type.replace("_", " ")}
79
+ </span>
80
+ </div>
81
+ <div style="font-size: 13px; color: #94a3b8; margin-top: 4px;">
82
+ {c.get("reason", "")}
83
+ </div>
84
+ {'<div style="font-size: 12px; color: #f59e0b; margin-top: 4px; font-style: italic;">💡 ' + suggestion + '</div>' if suggestion else ''}
85
+ <div style="font-size: 12px; color: #64748b; margin-top: 4px;">
86
+ Confidence: {conf:.1%}
87
+ {"• Evidence score: " + f"{c['evidence_score']:.3f}" if c.get("evidence_score") else ""}
88
+ • Retrieval: {diag.get("retrieval_quality", "n/a")}
89
+ </div>
90
+ </div>
91
+ """)
92
+
93
+ details_html = '<div>' + ''.join(details_parts) + '</div>'
94
+
95
+ # 3. Summary card
96
+ s = result["summary"]
97
+ summary_html = f"""
98
+ <div style="font-family: Inter, sans-serif; padding: 16px; border-radius: 12px;
99
+ background: linear-gradient(135deg, #1e293b, #334155); color: white;">
100
+ <h3 style="margin: 0 0 12px 0; color: #e2e8f0;">📊 Summary</h3>
101
+ <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 8px;">
102
+ <div style="padding: 8px; background: #ffffff10; border-radius: 8px;">
103
+ <div style="font-size: 24px; font-weight: bold;">{s['total_claims']}</div>
104
+ <div style="font-size: 12px; color: #94a3b8;">Total Claims</div>
105
+ </div>
106
+ <div style="padding: 8px; background: #22c55e20; border-radius: 8px;">
107
+ <div style="font-size: 24px; font-weight: bold; color: #22c55e;">{s['supported']}</div>
108
+ <div style="font-size: 12px; color: #94a3b8;">Supported</div>
109
+ </div>
110
+ <div style="padding: 8px; background: #ef444420; border-radius: 8px;">
111
+ <div style="font-size: 24px; font-weight: bold; color: #ef4444;">{s['contradicted']}</div>
112
+ <div style="font-size: 12px; color: #94a3b8;">Contradicted</div>
113
+ </div>
114
+ <div style="padding: 8px; background: #f59e0b20; border-radius: 8px;">
115
+ <div style="font-size: 24px; font-weight: bold; color: #f59e0b;">{s['unverifiable']}</div>
116
+ <div style="font-size: 12px; color: #94a3b8;">Unverifiable</div>
117
+ </div>
118
+ </div>
119
+ <div style="margin-top: 12px; padding: 8px; background: #ffffff10; border-radius: 8px; text-align: center;">
120
+ <span style="font-size: 14px; color: #94a3b8;">Hallucination Rate</span><br>
121
+ <span style="font-size: 28px; font-weight: bold;
122
+ color: {'#22c55e' if s['hallucination_rate'] < 0.3 else '#ef4444'};">
123
+ {s['hallucination_rate']:.0%}
124
+ </span>
125
+ </div>
126
+ <div style="margin-top: 8px; font-size: 11px; color: #64748b; text-align: right;">
127
+ ⏱ {result['pipeline_time_seconds']:.1f}s
128
+ {'• 📐 calibrated' if result.get('calibrated') else '• raw scores'}
129
+ </div>
130
+ </div>
131
+ """
132
+
133
+ # 4. Raw JSON
134
+ json_output = json.dumps(result, indent=2, ensure_ascii=False)
135
+
136
+ return highlighted_html, details_html, summary_html, json_output
137
+
138
+
139
+ # ── Gradio Interface ─────────────────────────────────────────────────────────
140
+
141
+ with gr.Blocks(
142
+ title="FactEval – Hallucination Detector",
143
+ theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate"),
144
+ css="""
145
+ .gradio-container { max-width: 960px !important; }
146
+ footer { display: none !important; }
147
+ """,
148
+ ) as demo:
149
+ gr.Markdown(
150
+ """
151
+ # 🔍 FactEval – Find Exactly Which Parts Are Hallucinated
152
+ Paste an LLM-generated answer and reference contexts.
153
+ FactEval highlights ✅ **supported**, ❌ **contradicted**, and ❓ **unverifiable** claims.
154
+ """
155
+ )
156
+
157
+ with gr.Row():
158
+ with gr.Column(scale=1):
159
+ answer_input = gr.Textbox(
160
+ label="LLM Answer",
161
+ placeholder="Enter the text to fact-check...",
162
+ lines=4,
163
+ )
164
+ context_input = gr.Textbox(
165
+ label="Reference Contexts (one per line)",
166
+ placeholder="Enter ground truth passages, one per line...",
167
+ lines=5,
168
+ )
169
+ calibrator_input = gr.Textbox(
170
+ label="Calibrator Path (optional)",
171
+ placeholder="Path to calibrator.pkl",
172
+ lines=1,
173
+ )
174
+ check_btn = gr.Button("🔍 Check Factuality", variant="primary", size="lg")
175
+
176
+ gr.Markdown("### 📝 Highlighted Answer")
177
+ highlighted_output = gr.HTML()
178
+
179
+ with gr.Row():
180
+ with gr.Column(scale=2):
181
+ gr.Markdown("### 📋 Claim Details")
182
+ details_output = gr.HTML()
183
+ with gr.Column(scale=1):
184
+ summary_output = gr.HTML()
185
+
186
+ with gr.Accordion("Raw JSON Output", open=False):
187
+ json_output = gr.Code(language="json")
188
+
189
+ check_btn.click(
190
+ fn=run_check,
191
+ inputs=[answer_input, context_input, calibrator_input],
192
+ outputs=[highlighted_output, details_output, summary_output, json_output],
193
+ )
194
+
195
+ gr.Examples(
196
+ examples=EXAMPLES,
197
+ inputs=[answer_input, context_input],
198
+ label="Try these examples",
199
+ )
200
+
201
+ if __name__ == "__main__":
202
+ demo.launch(share=True)
facteval/__init__.py ADDED
@@ -0,0 +1,61 @@
1
+ """FactEval – Find exactly which parts of your LLM output are hallucinated."""
2
+
3
+ # Suppress known harmless warnings from dependencies before any imports
4
+ import os as _os
5
+ import sys as _sys
6
+ import warnings as _warnings
7
+ import logging as _logging
8
+ import contextlib as _contextlib
9
+ import io as _io
10
+
11
+ # Suppress safetensors / accelerate noise
12
+ _os.environ.setdefault("SAFETENSORS_LOG_LEVEL", "error")
13
+ _os.environ.setdefault("ACCELERATE_LOG_LEVEL", "error")
14
+ _logging.getLogger("safetensors").setLevel(_logging.ERROR)
15
+ _logging.getLogger("accelerate").setLevel(_logging.ERROR)
16
+
17
+ # Suppress HF Hub unauthenticated request warnings
18
+ _logging.getLogger("huggingface_hub.utils._http").setLevel(_logging.ERROR)
19
+ _logging.getLogger("huggingface_hub").setLevel(_logging.ERROR)
20
+
21
+ # Suppress transformers info-level noise
22
+ _logging.getLogger("transformers.modeling_utils").setLevel(_logging.ERROR)
23
+ _logging.getLogger("transformers.generation.configuration_utils").setLevel(_logging.ERROR)
24
+
25
+ # Suppress FutureWarning about clean_up_tokenization_spaces
26
+ _warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
27
+
28
+
29
+ @_contextlib.contextmanager
30
+ def suppress_loading_noise():
31
+ """Suppress stdout + stderr noise during model loading (LOAD REPORT, sharding info)."""
32
+ old_stdout, old_stderr = _sys.stdout, _sys.stderr
33
+ _sys.stdout = _io.StringIO()
34
+ _sys.stderr = _io.StringIO()
35
+ try:
36
+ yield
37
+ finally:
38
+ _sys.stdout = old_stdout
39
+ _sys.stderr = old_stderr
40
+
41
+
42
+ # Backward compat alias
43
+ suppress_stdout = suppress_loading_noise
44
+
45
+
46
+ # ── Public API ───────────────────────────────────────────────────────────────
47
+ from facteval.core import check, verify
48
+ from facteval.models import Claim, Evidence, ClaimWithEvidence
49
+ from facteval.verifier import FactLabel, VerificationResult
50
+
51
+ __version__ = "0.1.0"
52
+ __all__ = [
53
+ "check",
54
+ "verify",
55
+ "Claim",
56
+ "Evidence",
57
+ "ClaimWithEvidence",
58
+ "FactLabel",
59
+ "VerificationResult",
60
+ "suppress_loading_noise",
61
+ ]
facteval/calibrator.py ADDED
@@ -0,0 +1,90 @@
1
+ """
2
+ Calibrator – Transforms raw NLI scores into calibrated probabilities.
3
+
4
+ Uses isotonic regression models (fitted in Week 0) to produce trustworthy
5
+ confidence scores and calibration error estimates.
6
+
7
+ Falls back gracefully to raw scores if no calibrator file is available.
8
+ """
9
+
10
+ import logging
11
+ import pickle
12
+ from pathlib import Path
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class Calibrator:
18
+ """Apply isotonic regression calibration to raw NLI probabilities."""
19
+
20
+ def __init__(self, calibrator_path: str | Path | None = None):
21
+ """
22
+ Load a pre-fitted calibrator from a pickle file.
23
+
24
+ Args:
25
+ calibrator_path: Path to the pickle file containing a dict of
26
+ {label_name: IsotonicRegression} objects.
27
+ If None or file doesn't exist, falls back to raw scores.
28
+ """
29
+ self._calibrators: dict | None = None
30
+
31
+ if calibrator_path is not None:
32
+ path = Path(calibrator_path)
33
+ if path.exists():
34
+ with open(path, "rb") as f:
35
+ self._calibrators = pickle.load(f)
36
+ logger.info(
37
+ "Loaded calibrator from %s (labels: %s)",
38
+ path, list(self._calibrators.keys()),
39
+ )
40
+ else:
41
+ logger.warning("Calibrator file not found: %s. Using raw scores.", path)
42
+
43
+ @property
44
+ def is_calibrated(self) -> bool:
45
+ """Whether a calibrator is loaded."""
46
+ return self._calibrators is not None
47
+
48
+ def calibrate(self, raw_scores: dict[str, float]) -> tuple[float, float]:
49
+ """
50
+ Calibrate raw NLI probabilities.
51
+
52
+ Args:
53
+ raw_scores: Dict mapping label names to raw probabilities
54
+ (e.g. {"entailment": 0.95, "neutral": 0.03, "contradiction": 0.02}).
55
+
56
+ Returns:
57
+ (calibrated_confidence, calibration_error)
58
+ - calibrated_confidence: The calibrated probability for the predicted label.
59
+ - calibration_error: Absolute difference between raw and calibrated confidence.
60
+ """
61
+ if not raw_scores:
62
+ return 0.0, 0.0
63
+
64
+ # Find the predicted label (highest raw score)
65
+ predicted_label = max(raw_scores, key=raw_scores.get)
66
+ raw_confidence = raw_scores[predicted_label]
67
+
68
+ if not self.is_calibrated:
69
+ # Fallback: return raw confidence with an estimated error
70
+ return raw_confidence, self._estimate_error(raw_confidence)
71
+
72
+ # Apply isotonic regression for each label
73
+ calibrated_scores = {}
74
+ for label, raw_prob in raw_scores.items():
75
+ if label in self._calibrators:
76
+ cal_prob = float(self._calibrators[label].predict([[raw_prob]])[0])
77
+ calibrated_scores[label] = max(0.0, min(1.0, cal_prob))
78
+ else:
79
+ calibrated_scores[label] = raw_prob
80
+
81
+ calibrated_confidence = calibrated_scores.get(predicted_label, raw_confidence)
82
+ calibration_error = abs(raw_confidence - calibrated_confidence)
83
+
84
+ return round(calibrated_confidence, 4), round(calibration_error, 4)
85
+
86
+ @staticmethod
87
+ def _estimate_error(raw_confidence: float) -> float:
88
+ """Rough error estimate when no calibrator is available."""
89
+ # Higher confidence → lower estimated error, but never zero
90
+ return round(max(0.02, (1.0 - raw_confidence) * 0.3), 4)
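
The loader above expects a pickle holding a dict of `{label_name: IsotonicRegression}` objects. A purely illustrative sketch of producing a compatible `calibrator.pkl` with scikit-learn follows; the synthetic scores are placeholders, and a real calibrator would be fitted on held-out raw NLI probabilities with 0/1 correctness targets:

```python
# Illustrative only: fit one IsotonicRegression per NLI label and pickle them
# in the {label_name: regressor} format that Calibrator(calibrator_path=...) loads.
# The random data stands in for held-out raw NLI scores and correctness labels.
import pickle
import numpy as np
from sklearn.isotonic import IsotonicRegression

rng = np.random.default_rng(0)
calibrators = {}
for label in ("entailment", "neutral", "contradiction"):
    raw_scores = rng.uniform(0.0, 1.0, size=500)                              # placeholder raw probabilities
    correct = (rng.uniform(0.0, 1.0, size=500) < raw_scores).astype(float)    # placeholder 0/1 targets
    iso = IsotonicRegression(out_of_bounds="clip")
    iso.fit(raw_scores, correct)
    calibrators[label] = iso

with open("calibrator.pkl", "wb") as f:
    pickle.dump(calibrators, f)
```

The resulting file can then be passed as `calibrator_path` to `check()` or via the CLI's `--calibrator` flag.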
facteval/claim_extractor.py ADDED
@@ -0,0 +1,138 @@
1
+ """
2
+ Claim Extractor – Breaks text into atomic, verifiable claims.
3
+
4
+ Uses Qwen2.5-1.5B-Instruct (chosen in Week 0 for speed and output quality)
5
+ with the model's chat template to produce clean numbered lists.
6
+ """
7
+
8
+ import re
9
+ import logging
10
+
11
+ import torch
12
+ from transformers import AutoModelForCausalLM, AutoTokenizer
13
+
14
+ from facteval import suppress_stdout
15
+
16
+ from facteval.config import (
17
+ CLAIM_MODEL,
18
+ CLAIM_SYSTEM_PROMPT,
19
+ CLAIM_USER_PROMPT,
20
+ MAX_CLAIMS,
21
+ MAX_NEW_TOKENS,
22
+ )
23
+ from facteval.models import Claim
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ class ClaimExtractor:
29
+ """Extract atomic claims from text using a causal LM with chat prompting."""
30
+
31
+ def __init__(
32
+ self,
33
+ model_name: str = CLAIM_MODEL,
34
+ device: str | None = None,
35
+ dtype: torch.dtype | None = None,
36
+ ):
37
+ self.model_name = model_name
38
+ self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
39
+ self.dtype = dtype or (torch.float16 if self.device == "cuda" else torch.float32)
40
+
41
+ logger.info("Loading claim extractor: %s on %s", model_name, self.device)
42
+ self.tokenizer = AutoTokenizer.from_pretrained(
43
+ model_name, trust_remote_code=True
44
+ )
45
+ with suppress_stdout():
46
+ self.model = AutoModelForCausalLM.from_pretrained(
47
+ model_name,
48
+ dtype=self.dtype,
49
+ device_map="auto" if self.device == "cuda" else None,
50
+ trust_remote_code=True,
51
+ )
52
+ if self.device == "cpu":
53
+ self.model = self.model.to(self.device)
54
+ self.model.eval()
55
+
56
+ # Clear sampling params from generation_config to avoid
57
+ # "generation flags are not valid" warnings with do_sample=False
58
+ gen_cfg = self.model.generation_config
59
+ for attr in ("temperature", "top_p", "top_k"):
60
+ if hasattr(gen_cfg, attr):
61
+ setattr(gen_cfg, attr, None)
62
+
63
+ logger.info("Claim extractor ready.")
64
+
65
+ def extract(
66
+ self,
67
+ text: str,
68
+ max_claims: int = MAX_CLAIMS,
69
+ max_new_tokens: int = MAX_NEW_TOKENS,
70
+ ) -> list[Claim]:
71
+ """
72
+ Extract atomic claims from *text*.
73
+
74
+ Args:
75
+ text: The text to decompose into claims.
76
+ max_claims: Maximum number of claims to return.
77
+ max_new_tokens: Generation length cap (prevents rambling).
78
+
79
+ Returns:
80
+ A deduplicated list of Claim objects.
81
+ """
82
+ if not text or not text.strip():
83
+ return []
84
+
85
+ raw_output = self._generate(text, max_new_tokens)
86
+ claims = self._parse_claims(raw_output, text, max_claims)
87
+ logger.info("Extracted %d claims from %d-char text.", len(claims), len(text))
88
+ return claims
89
+
90
+ # ── Private helpers ──────────────────────────────────────────────────────
91
+
92
+ def _generate(self, text: str, max_new_tokens: int) -> str:
93
+ """Run the LLM to generate claim text."""
94
+ messages = [
95
+ {"role": "system", "content": CLAIM_SYSTEM_PROMPT},
96
+ {"role": "user", "content": CLAIM_USER_PROMPT.format(text=text)},
97
+ ]
98
+ prompt = self.tokenizer.apply_chat_template(
99
+ messages, tokenize=False, add_generation_prompt=True
100
+ )
101
+ inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
102
+
103
+ with torch.no_grad():
104
+ output_ids = self.model.generate(
105
+ **inputs,
106
+ max_new_tokens=max_new_tokens,
107
+ do_sample=False,
108
+ )
109
+
110
+ # Decode only the newly generated tokens
111
+ generated = output_ids[0][inputs["input_ids"].shape[1]:]
112
+ return self.tokenizer.decode(generated, skip_special_tokens=True).strip()
113
+
114
+ @staticmethod
115
+ def _parse_claims(
116
+ raw: str, source_text: str, max_claims: int
117
+ ) -> list[Claim]:
118
+ """Parse numbered/bulleted list into deduplicated Claim objects."""
119
+ seen: set[str] = set()
120
+ claims: list[Claim] = []
121
+
122
+ for line in raw.split("\n"):
123
+ # Strip numbering (e.g. "1.", "1)", "- ", "• ")
124
+ cleaned = re.sub(r"^[\d.\)\-•\s]+", "", line).strip()
125
+ if len(cleaned) <= 5:
126
+ continue
127
+
128
+ # Normalize for dedup (lowercase, collapse whitespace)
129
+ key = re.sub(r"\s+", " ", cleaned.lower())
130
+ if key in seen:
131
+ continue
132
+ seen.add(key)
133
+
134
+ claims.append(Claim(text=cleaned, source_text=source_text))
135
+ if len(claims) >= max_claims:
136
+ break
137
+
138
+ return claims
facteval/cli.py ADDED
@@ -0,0 +1,127 @@
1
+ """
2
+ CLI – Command-line interface for FactEval.
3
+
4
+ Usage:
5
+ facteval check input.json
6
+ facteval check input.json --output output.json
7
+ facteval check --answer "..." --context "ctx1" --context "ctx2"
8
+ """
9
+
10
+ import argparse
11
+ import json
12
+ import sys
13
+ import logging
14
+
15
+
16
+ def main():
17
+ """Entry point for the facteval CLI."""
18
+ parser = argparse.ArgumentParser(
19
+ prog="facteval",
20
+ description="FactEval – Claim-level factuality evaluation with calibrated confidence.",
21
+ )
22
+ subparsers = parser.add_subparsers(dest="command", help="Available commands")
23
+
24
+ # ── facteval check ───────────────────────────────────────────────────────
25
+ check_parser = subparsers.add_parser(
26
+ "check", help="Check an answer for factual accuracy against provided contexts."
27
+ )
28
+ check_parser.add_argument(
29
+ "input_file", nargs="?", default=None,
30
+ help='JSON file with "answer" and "contexts" keys.',
31
+ )
32
+ check_parser.add_argument(
33
+ "--answer", "-a", type=str, default=None,
34
+ help="The answer text to check (alternative to input file).",
35
+ )
36
+ check_parser.add_argument(
37
+ "--context", "-c", action="append", default=None,
38
+ help="Context passage (can be repeated). Alternative to input file.",
39
+ )
40
+ check_parser.add_argument(
41
+ "--output", "-o", type=str, default=None,
42
+ help="Output file path. If not provided, prints to stdout.",
43
+ )
44
+ check_parser.add_argument(
45
+ "--calibrator", type=str, default=None,
46
+ help="Path to a pre-fitted calibrator pickle file.",
47
+ )
48
+ check_parser.add_argument(
49
+ "--top-k", type=int, default=3,
50
+ help="Number of evidence sentences to retrieve per claim (default: 3).",
51
+ )
52
+ check_parser.add_argument(
53
+ "--max-claims", type=int, default=10,
54
+ help="Maximum number of claims to extract (default: 10).",
55
+ )
56
+ check_parser.add_argument(
57
+ "--verbose", "-v", action="store_true",
58
+ help="Enable verbose logging.",
59
+ )
60
+
61
+ args = parser.parse_args()
62
+
63
+ if args.command is None:
64
+ parser.print_help()
65
+ sys.exit(0)
66
+
67
+ if args.command == "check":
68
+ _run_check(args)
69
+
70
+
71
+ def _run_check(args):
72
+ """Execute the check command."""
73
+ # Configure logging
74
+ level = logging.INFO if args.verbose else logging.WARNING
75
+ logging.basicConfig(level=level, format="%(name)s | %(message)s")
76
+
77
+ # Parse input
78
+ answer, contexts = _parse_input(args)
79
+ if answer is None:
80
+ print("Error: Provide either an input JSON file or --answer + --context flags.", file=sys.stderr)
81
+ sys.exit(1)
82
+
83
+ # Import here to avoid slow import on --help
84
+ from facteval.core import check
85
+
86
+ # Run pipeline
87
+ result = check(
88
+ answer=answer,
89
+ contexts=contexts,
90
+ top_k=args.top_k,
91
+ max_claims=args.max_claims,
92
+ calibrator_path=args.calibrator,
93
+ )
94
+
95
+ # Output
96
+ output_json = json.dumps(result, indent=2, ensure_ascii=False)
97
+
98
+ if args.output:
99
+ with open(args.output, "w", encoding="utf-8") as f:
100
+ f.write(output_json)
101
+ print(f"Results saved to {args.output}", file=sys.stderr)
102
+ else:
103
+ print(output_json)
104
+
105
+
106
+ def _parse_input(args) -> tuple[str | None, list[str]]:
107
+ """Parse answer and contexts from file or CLI flags."""
108
+ # Option 1: JSON file
109
+ if args.input_file:
110
+ with open(args.input_file, "r", encoding="utf-8") as f:
111
+ data = json.load(f)
112
+ return data.get("answer"), data.get("contexts", [])
113
+
114
+ # Option 2: CLI flags
115
+ if args.answer:
116
+ return args.answer, args.context or []
117
+
118
+ # Option 3: stdin
119
+ if not sys.stdin.isatty():
120
+ data = json.load(sys.stdin)
121
+ return data.get("answer"), data.get("contexts", [])
122
+
123
+ return None, []
124
+
125
+
126
+ if __name__ == "__main__":
127
+ main()
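
A hedged end-to-end sketch of driving this CLI (assuming dependencies are installed and the repo root is on `PYTHONPATH`; if the `facteval` console script from the docstring isn't installed, the module can be invoked directly):

```python
# Illustrative sketch: write the {"answer": ..., "contexts": [...]} payload that
# _parse_input() expects, then invoke the check subcommand via the module path.
import json
import subprocess

payload = {
    "answer": "Python was created by Guido van Rossum and first released in 2005.",
    "contexts": ["Python was created by Guido van Rossum and first released in 1991."],
}
with open("input.json", "w", encoding="utf-8") as f:
    json.dump(payload, f)

subprocess.run(
    ["python", "-m", "facteval.cli", "check", "input.json", "--output", "result.json"],
    check=True,
)
```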
facteval/config.py ADDED
@@ -0,0 +1,29 @@
1
+ """
2
+ Default configuration for FactEval models and parameters.
3
+ """
4
+
5
+ # ── Model IDs (Hugging Face Hub) ─────────────────────────────────────────────
6
+ # Claim extraction – chosen in Week 0: 1.5B was 3.5x faster with cleaner output
7
+ CLAIM_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
8
+
9
+ # Sentence embeddings for evidence retrieval
10
+ EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
11
+
12
+ # NLI verification (used in Week 2)
13
+ NLI_MODEL = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
14
+
15
+ # ── Retrieval defaults ───────────────────────────────────────────────────────
16
+ DEFAULT_TOP_K = 3
17
+ MIN_EVIDENCE_SCORE = 0.3 # Below this, evidence is too weak to use
18
+
19
+ # ── Claim extraction defaults ────────────────────────────────────────────────
20
+ MAX_NEW_TOKENS = 200
21
+ MAX_CLAIMS = 10
22
+
23
+ CLAIM_SYSTEM_PROMPT = (
24
+ "You are a claim extraction engine. Given a text, break it into atomic, "
25
+ "independently verifiable claims. Each claim states exactly ONE fact. "
26
+ "Return ONLY a numbered list. No explanations, no commentary."
27
+ )
28
+
29
+ CLAIM_USER_PROMPT = "Break this into atomic claims:\n\n{text}"
facteval/core.py ADDED
@@ -0,0 +1,328 @@
1
+ """
2
+ Core – The main check() and verify() functions that wire the FactEval pipeline.
3
+
4
+ Usage:
5
+ from facteval import check, verify
6
+
7
+ # Full pipeline (extract + retrieve + verify)
8
+ result = check(answer, contexts)
9
+
10
+ # Lightweight mode (skip extraction, bring your own claims)
11
+ result = verify(claims=["claim 1", "claim 2"], contexts=docs)
12
+ """
13
+
14
+ import re
15
+ import logging
16
+ import time
17
+ from pathlib import Path
18
+
19
+ import numpy as np
20
+
21
+ from facteval.calibrator import Calibrator
22
+ from facteval.claim_extractor import ClaimExtractor
23
+ from facteval.retriever import EvidenceRetriever
24
+ from facteval.verifier import Verifier, FactLabel
25
+ from facteval.models import Claim
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+ # Module-level singletons (lazy-loaded)
30
+ _extractor: ClaimExtractor | None = None
31
+ _retriever: EvidenceRetriever | None = None
32
+ _verifier: Verifier | None = None
33
+ _calibrator: Calibrator | None = None
34
+ _calibrator_path: str | None = None
35
+
36
+
37
+ def _get_extractor() -> ClaimExtractor:
38
+ global _extractor
39
+ if _extractor is None:
40
+ _extractor = ClaimExtractor()
41
+ return _extractor
42
+
43
+
44
+ def _get_retriever() -> EvidenceRetriever:
45
+ global _retriever
46
+ if _retriever is None:
47
+ _retriever = EvidenceRetriever()
48
+ return _retriever
49
+
50
+
51
+ def _get_verifier() -> Verifier:
52
+ global _verifier
53
+ if _verifier is None:
54
+ _verifier = Verifier()
55
+ return _verifier
56
+
57
+
58
+ def _get_calibrator(path: str | None = None) -> Calibrator:
59
+ global _calibrator, _calibrator_path
60
+ if _calibrator is None or path != _calibrator_path:
61
+ _calibrator = Calibrator(calibrator_path=path)
62
+ _calibrator_path = path
63
+ return _calibrator
64
+
65
+
66
+ # ── Full pipeline ────────────────────────────────────────────────────────────
67
+
68
+ def check(
69
+ answer: str,
70
+ contexts: list[str],
71
+ top_k: int = 3,
72
+ max_claims: int = 10,
73
+ calibrator_path: str | Path | None = None,
74
+ ) -> dict:
75
+ """
76
+ Run the full FactEval pipeline on an answer + contexts.
77
+
78
+ Stages: extract claims → retrieve evidence → NLI verify → calibrate.
79
+
80
+ Args:
81
+ answer: The LLM-generated text to evaluate.
82
+ contexts: List of reference passages (ground truth).
83
+ top_k: Number of evidence sentences to retrieve per claim.
84
+ max_claims: Maximum claims to extract.
85
+ calibrator_path: Path to a pre-fitted calibrator pickle file.
86
+
87
+ Returns:
88
+ A dict with claims, summary, highlighted_answer, and pipeline_time.
89
+ """
90
+ t0 = time.perf_counter()
91
+
92
+ # 1. Extract claims
93
+ extractor = _get_extractor()
94
+ claims = extractor.extract(answer, max_claims=max_claims)
95
+ logger.info("Extracted %d claims.", len(claims))
96
+
97
+ if not claims:
98
+ return {
99
+ "claims": [],
100
+ "summary": _build_summary([]),
101
+ "highlighted_answer": answer,
102
+ "calibrated": False,
103
+ "pipeline_time_seconds": round(time.perf_counter() - t0, 3),
104
+ }
105
+
106
+ # 2–5. Shared pipeline
107
+ return _run_pipeline(claims, contexts, answer, top_k, calibrator_path, t0)
108
+
109
+
110
+ # ── Lightweight mode ─────────────────────────────────────────────────────────
111
+
112
+ def verify(
113
+ claims: list[str],
114
+ contexts: list[str],
115
+ top_k: int = 3,
116
+ calibrator_path: str | Path | None = None,
117
+ ) -> dict:
118
+ """
119
+ Verify pre-extracted claims against contexts. Skips claim extraction.
120
+
121
+ Use this when you already have claims and want faster results
122
+ (avoids the ~1s extraction step and the Qwen model entirely).
123
+
124
+ Args:
125
+ claims: List of claim strings to verify.
126
+ contexts: List of reference passages (ground truth).
127
+ top_k: Number of evidence sentences to retrieve per claim.
128
+ calibrator_path: Path to a pre-fitted calibrator pickle file.
129
+
130
+ Returns:
131
+ Same output format as check().
132
+ """
133
+ t0 = time.perf_counter()
134
+
135
+ claim_objs = [Claim(text=c) for c in claims if c.strip()]
136
+
137
+ if not claim_objs:
138
+ return {
139
+ "claims": [],
140
+ "summary": _build_summary([]),
141
+ "highlighted_answer": "",
142
+ "calibrated": False,
143
+ "pipeline_time_seconds": round(time.perf_counter() - t0, 3),
144
+ }
145
+
146
+ answer = " ".join(claims) # reconstruct for highlighting
147
+ return _run_pipeline(claim_objs, contexts, answer, top_k, calibrator_path, t0)
148
+
149
+
150
+ # ── Shared pipeline ──────────────────────────────────────────────────────────
151
+
152
+ def _run_pipeline(
153
+ claims: list[Claim],
154
+ contexts: list[str],
155
+ answer: str,
156
+ top_k: int,
157
+ calibrator_path: str | Path | None,
158
+ t0: float,
159
+ ) -> dict:
160
+ """Shared pipeline: retrieve → verify → calibrate → diagnose → highlight."""
161
+
162
+ # 2. Retrieve evidence
163
+ retriever = _get_retriever()
164
+ retriever.index(contexts)
165
+ claims_with_evidence = retriever.retrieve_for_claims(claims, top_k=top_k)
166
+
167
+ # 3. Verify (batch NLI)
168
+ verifier = _get_verifier()
169
+ results = verifier.verify_batch(claims_with_evidence)
170
+
171
+ # 4. Calibrate
172
+ calibrator = _get_calibrator(str(calibrator_path) if calibrator_path else None)
173
+ for r in results:
174
+ if r.raw_scores:
175
+ cal_conf, cal_err = calibrator.calibrate(r.raw_scores)
176
+ r.calibrated_confidence = cal_conf
177
+ r.calibration_error = cal_err
178
+
179
+ # 5. Build output with diagnostics
180
+ elapsed = time.perf_counter() - t0
181
+ claim_dicts = [r.to_dict() for r in results]
182
+
183
+ # Add diagnostics to each claim
184
+ for cd in claim_dicts:
185
+ cd["diagnostics"] = _diagnose(cd)
186
+
187
+ return {
188
+ "claims": claim_dicts,
189
+ "summary": _build_summary(results),
190
+ "highlighted_answer": _highlight_answer_semantic(
191
+ answer, claim_dicts, retriever.embedder
192
+ ),
193
+ "calibrated": calibrator.is_calibrated,
194
+ "pipeline_time_seconds": round(elapsed, 3),
195
+ }
196
+
197
+
198
+ # ── Diagnostics ──────────────────────────────────────────────────────────────
199
+
200
+ def _diagnose(claim_dict: dict) -> dict:
201
+ """
202
+ Generate pipeline diagnostics for a claim.
203
+
204
+ Tells the developer *why* a claim got its label —
205
+ was it a retrieval failure or a genuine hallucination?
206
+ """
207
+ label = claim_dict["label"]
208
+ ev_score = claim_dict.get("evidence_score")
209
+ confidence = claim_dict.get("confidence", 0)
210
+
211
+ # Retrieval quality assessment
212
+ if ev_score is None:
213
+ retrieval_quality = "none"
214
+ elif ev_score >= 0.7:
215
+ retrieval_quality = "strong"
216
+ elif ev_score >= 0.4:
217
+ retrieval_quality = "moderate"
218
+ else:
219
+ retrieval_quality = "weak"
220
+
221
+ # Failure type classification
222
+ if label == "supported":
223
+ failure_type = "verified"
224
+ suggestion = None
225
+ elif label == "contradicted":
226
+ if retrieval_quality in ("strong", "moderate"):
227
+ failure_type = "hallucination"
228
+ suggestion = "Claim directly contradicts the evidence. This is a factual error in the LLM output."
229
+ else:
230
+ failure_type = "possible_hallucination"
231
+ suggestion = "Claim contradicts weak evidence. Consider adding more specific context for reliable verification."
232
+ elif ev_score is None:
233
+ failure_type = "no_evidence"
234
+ suggestion = "No relevant context was provided. Add reference passages covering this topic."
235
+ elif retrieval_quality == "weak":
236
+ failure_type = "retrieval_gap"
237
+ suggestion = "Evidence was found but too dissimilar to trust. The context may not cover this claim."
238
+ else:
239
+ failure_type = "inconclusive"
240
+ suggestion = "Evidence exists but is neutral — neither confirms nor denies the claim."
241
+
242
+ d = {
243
+ "failure_type": failure_type,
244
+ "retrieval_quality": retrieval_quality,
245
+ }
246
+ if suggestion:
247
+ d["suggestion"] = suggestion
248
+ return d
249
+
250
+
251
+ # ── Summary ──────────────────────────────────────────────────────────────────
252
+
253
+ def _build_summary(results: list) -> dict:
254
+ """Build summary statistics from verification results."""
255
+ total = len(results)
256
+ supported = sum(1 for r in results if r.label == FactLabel.SUPPORTED)
257
+ contradicted = sum(1 for r in results if r.label == FactLabel.CONTRADICTED)
258
+ unverifiable = total - supported - contradicted
259
+
260
+ return {
261
+ "total_claims": total,
262
+ "supported": supported,
263
+ "contradicted": contradicted,
264
+ "unverifiable": unverifiable,
265
+ "hallucination_rate": round(contradicted / max(total, 1), 4),
266
+ }
267
+
268
+
269
+ # ── Semantic Highlighting ────────────────────────────────────────────────────
270
+
271
+ _LABEL_EMOJI = {"supported": "✅", "contradicted": "❌", "unverifiable": "❓"}
272
+ _LABEL_COLOR = {"supported": "#22c55e", "contradicted": "#ef4444", "unverifiable": "#f59e0b"}
273
+
274
+
275
+ def _highlight_answer_semantic(answer: str, claim_dicts: list[dict], embedder) -> str:
276
+ """
277
+ Map claims to source sentences using embedding similarity (not Jaccard).
278
+
279
+ Uses the retriever's SentenceTransformer to compute cosine similarity
280
+ between each claim and each sentence in the original answer. This handles
281
+ paraphrasing, reordering, and partial overlaps much better than token overlap.
282
+ """
283
+ if not answer.strip() or not claim_dicts:
284
+ return answer
285
+
286
+ # Split answer into sentences with positions
287
+ sentences = []
288
+ for m in re.finditer(r'[^.!?]+[.!?]*', answer):
289
+ text = m.group().strip()
290
+ if text:
291
+ sentences.append(text)
292
+
293
+ if not sentences:
294
+ return answer
295
+
296
+ # Compute embedding similarity
297
+ claim_texts = [c["claim"] for c in claim_dicts]
298
+ claim_labels = [c["label"] for c in claim_dicts]
299
+
300
+ sent_embeddings = embedder.encode(sentences, normalize_embeddings=True)
301
+ claim_embeddings = embedder.encode(claim_texts, normalize_embeddings=True)
302
+
303
+ # Similarity matrix: sentences × claims
304
+ sim_matrix = np.dot(sent_embeddings, claim_embeddings.T)
305
+
306
+ # For each sentence, find best matching claim
307
+ sentence_labels: dict[str, str] = {}
308
+ for i, sent_text in enumerate(sentences):
309
+ best_j = int(sim_matrix[i].argmax())
310
+ best_sim = float(sim_matrix[i, best_j])
311
+
312
+ if best_sim > 0.35: # Semantic similarity threshold
313
+ sentence_labels[sent_text] = claim_labels[best_j]
314
+
315
+ # Build highlighted text (longest matches first to avoid partial replacements)
316
+ highlighted = answer
317
+ for sent_text in sorted(sentence_labels, key=len, reverse=True):
318
+ label = sentence_labels[sent_text]
319
+ color = _LABEL_COLOR.get(label, "#94a3b8")
320
+ emoji = _LABEL_EMOJI.get(label, "")
321
+ highlighted = highlighted.replace(
322
+ sent_text,
323
+ f'<mark style="background:{color}30;padding:2px 4px;border-radius:3px">'
324
+ f'{sent_text} {emoji}</mark>',
325
+ 1,
326
+ )
327
+
328
+ return highlighted
facteval/models.py ADDED
@@ -0,0 +1,46 @@
1
+ """
2
+ Pydantic data models for FactEval's pipeline objects.
3
+ """
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
+ class Claim(BaseModel):
9
+ """A single atomic, verifiable claim extracted from text."""
10
+
11
+ text: str = Field(..., description="The claim statement.")
12
+ source_text: str = Field(
13
+ default="",
14
+ description="The original text this claim was extracted from.",
15
+ )
16
+
17
+ def __str__(self) -> str:
18
+ return self.text
19
+
20
+
21
+ class Evidence(BaseModel):
22
+ """A single piece of evidence retrieved for a claim."""
23
+
24
+ sentence: str = Field(..., description="The evidence sentence.")
25
+ score: float = Field(
26
+ ..., ge=0.0, description="Cosine similarity score (may slightly exceed 1.0 due to float precision)."
27
+ )
28
+ source_context: str = Field(
29
+ default="",
30
+ description="The full context passage this sentence came from.",
31
+ )
32
+
33
+ def __str__(self) -> str:
34
+ return f"[{self.score:.3f}] {self.sentence}"
35
+
36
+
37
+ class ClaimWithEvidence(BaseModel):
38
+ """A claim paired with its retrieved evidence."""
39
+
40
+ claim: Claim
41
+ evidence: list[Evidence] = Field(default_factory=list)
42
+
43
+ @property
44
+ def best_evidence(self) -> Evidence | None:
45
+ """Return the highest-scoring evidence, or None."""
46
+ return self.evidence[0] if self.evidence else None
facteval/retriever.py ADDED
@@ -0,0 +1,151 @@
1
+ """
2
+ Evidence Retriever – FAISS-based semantic search over user-provided contexts.
3
+
4
+ Encodes context sentences with all-MiniLM-L6-v2 and retrieves the top-k
5
+ most similar evidence sentences for each claim.
6
+ """
7
+
8
+ import re
9
+ import logging
10
+
11
+ import numpy as np
12
+ import faiss
13
+ from sentence_transformers import SentenceTransformer
14
+
15
+ from facteval import suppress_stdout
16
+ from facteval.config import DEFAULT_TOP_K, EMBEDDING_MODEL, MIN_EVIDENCE_SCORE
17
+ from facteval.models import Claim, Evidence, ClaimWithEvidence
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class EvidenceRetriever:
23
+ """Build a FAISS index over context sentences and retrieve evidence for claims."""
24
+
25
+ def __init__(
26
+ self,
27
+ model_name: str = EMBEDDING_MODEL,
28
+ device: str | None = None,
29
+ ):
30
+ self.device = device or ("cuda" if __import__("torch").cuda.is_available() else "cpu")
31
+ logger.info("Loading embedding model: %s", model_name)
32
+ with suppress_stdout():
33
+ self.embedder = SentenceTransformer(model_name, device=self.device)
34
+
35
+ # Populated by .index()
36
+ self._sentences: list[str] = []
37
+ self._sentence_to_context: dict[int, str] = {}
38
+ self._index: faiss.IndexFlatIP | None = None
39
+
40
+ def index(self, contexts: list[str]) -> "EvidenceRetriever":
41
+ """
42
+ Build a FAISS index from a list of context passages.
43
+
44
+ Each context is split into individual sentences before indexing.
45
+
46
+ Args:
47
+ contexts: List of context passages (strings).
48
+
49
+ Returns:
50
+ self (for chaining: `retriever.index(ctx).retrieve(claim)`).
51
+ """
52
+ if not contexts:
53
+ logger.warning("No contexts provided; retriever will return empty results.")
54
+ self._sentences = []
55
+ self._index = None
56
+ return self
57
+
58
+ self._sentences = []
59
+ self._sentence_to_context = {}
60
+
61
+ for ctx in contexts:
62
+ for sent in self._split_sentences(ctx):
63
+ idx = len(self._sentences)
64
+ self._sentences.append(sent)
65
+ self._sentence_to_context[idx] = ctx
66
+
67
+ if not self._sentences:
68
+ logger.warning("No sentences extracted from contexts.")
69
+ self._index = None
70
+ return self
71
+
72
+ logger.info("Indexing %d evidence sentences.", len(self._sentences))
73
+ embeddings = self.embedder.encode(
74
+ self._sentences, convert_to_numpy=True, normalize_embeddings=True
75
+ ).astype(np.float32)
76
+
77
+ dim = embeddings.shape[1]
78
+ self._index = faiss.IndexFlatIP(dim) # Cosine similarity (normalized)
79
+ self._index.add(embeddings)
80
+
81
+ return self
82
+
83
+ def retrieve(
84
+ self,
85
+ claim: Claim | str,
86
+ top_k: int = DEFAULT_TOP_K,
87
+ min_score: float = MIN_EVIDENCE_SCORE,
88
+ ) -> list[Evidence]:
89
+ """
90
+ Retrieve the top-k most relevant evidence sentences for a claim.
91
+
92
+ Args:
93
+ claim: A Claim object or plain string.
94
+ top_k: Number of evidence sentences to return.
95
+ min_score: Minimum cosine similarity to include.
96
+
97
+ Returns:
98
+ List of Evidence objects, sorted by score descending.
99
+ """
100
+ if self._index is None or not self._sentences:
101
+ return []
102
+
103
+ query_text = claim.text if isinstance(claim, Claim) else claim
104
+ q_emb = self.embedder.encode(
105
+ [query_text], convert_to_numpy=True, normalize_embeddings=True
106
+ ).astype(np.float32)
107
+
108
+ scores, indices = self._index.search(q_emb, top_k)
109
+ results: list[Evidence] = []
110
+
111
+ for score, idx in zip(scores[0], indices[0]):
112
+ if idx < 0 or idx >= len(self._sentences):
113
+ continue
114
+ clamped_score = float(min(max(score, 0.0), 1.0))
115
+ if clamped_score < min_score:
116
+ continue
117
+ results.append(
118
+ Evidence(
119
+ sentence=self._sentences[idx],
120
+ score=clamped_score,
121
+ source_context=self._sentence_to_context.get(idx, ""),
122
+ )
123
+ )
124
+
125
+ return results
126
+
127
+ def retrieve_for_claims(
128
+ self,
129
+ claims: list[Claim],
130
+ top_k: int = DEFAULT_TOP_K,
131
+ min_score: float = MIN_EVIDENCE_SCORE,
132
+ ) -> list[ClaimWithEvidence]:
133
+ """
134
+ Batch-retrieve evidence for a list of claims.
135
+
136
+ Returns:
137
+ List of ClaimWithEvidence objects.
138
+ """
139
+ return [
140
+ ClaimWithEvidence(
141
+ claim=claim,
142
+ evidence=self.retrieve(claim, top_k=top_k, min_score=min_score),
143
+ )
144
+ for claim in claims
145
+ ]
146
+
147
+ @staticmethod
148
+ def _split_sentences(text: str) -> list[str]:
149
+ """Split text into sentences on sentence-ending punctuation."""
150
+ raw = re.split(r"(?<=[.!?])\s+", text)
151
+ return [s.strip() for s in raw if s.strip() and len(s.strip()) > 3]
facteval/verifier.py ADDED
@@ -0,0 +1,235 @@
1
+ """
2
+ Verifier – NLI-based factual verification of claims against evidence.
3
+
4
+ Uses DeBERTa-v3 fine-tuned on MNLI+FEVER+ANLI to classify each
5
+ claim/evidence pair as entailment, contradiction, or neutral.
6
+ Maps NLI labels to FactEval labels: supported, contradicted, unverifiable.
7
+ """
8
+
9
+ import logging
10
+ from enum import Enum
11
+
12
+ import torch
13
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
14
+
15
+ from facteval import suppress_stdout
16
+ from facteval.config import NLI_MODEL, MIN_EVIDENCE_SCORE
17
+ from facteval.models import Claim, Evidence, ClaimWithEvidence
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ class FactLabel(str, Enum):
23
+ """FactEval verdict labels."""
24
+ SUPPORTED = "supported"
25
+ CONTRADICTED = "contradicted"
26
+ UNVERIFIABLE = "unverifiable"
27
+
28
+
29
+ # Map DeBERTa NLI labels → FactEval labels
30
+ _NLI_TO_FACT = {
31
+ "entailment": FactLabel.SUPPORTED,
32
+ "contradiction": FactLabel.CONTRADICTED,
33
+ "neutral": FactLabel.UNVERIFIABLE,
34
+ }
35
+
36
+
37
+ class VerificationResult:
38
+ """Result of verifying a single claim."""
39
+
40
+ def __init__(
41
+ self,
42
+ claim: str,
43
+ label: FactLabel,
44
+ confidence: float,
45
+ evidence: str | None,
46
+ evidence_score: float | None,
47
+ raw_scores: dict[str, float],
48
+ reason: str = "",
49
+ calibrated_confidence: float | None = None,
50
+ calibration_error: float | None = None,
51
+ ):
52
+ self.claim = claim
53
+ self.label = label
54
+ self.confidence = confidence
55
+ self.evidence = evidence
56
+ self.evidence_score = evidence_score
57
+ self.raw_scores = raw_scores
58
+ self.reason = reason
59
+ self.calibrated_confidence = calibrated_confidence
60
+ self.calibration_error = calibration_error
61
+
62
+ def to_dict(self) -> dict:
63
+ d = {
64
+ "claim": self.claim,
65
+ "label": self.label.value,
66
+ "confidence": round(self.confidence, 4),
67
+ "reason": self.reason,
68
+ "evidence": self.evidence,
69
+ "evidence_score": round(self.evidence_score, 4) if self.evidence_score else None,
70
+ "raw_nli_scores": {k: round(v, 4) for k, v in self.raw_scores.items()},
71
+ }
72
+ if self.calibrated_confidence is not None:
73
+ d["calibrated_confidence"] = round(self.calibrated_confidence, 4)
74
+ if self.calibration_error is not None:
75
+ d["calibration_error"] = round(self.calibration_error, 4)
76
+ return d
77
+
78
+
79
+ class Verifier:
80
+ """Verify claims against evidence using NLI."""
81
+
82
+ def __init__(
83
+ self,
84
+ model_name: str = NLI_MODEL,
85
+ device: str | None = None,
86
+ ):
87
+ self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
88
+ logger.info("Loading NLI model: %s on %s", model_name, self.device)
89
+
90
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
91
+ with suppress_stdout():
92
+ self.model = AutoModelForSequenceClassification.from_pretrained(
93
+ model_name
94
+ ).to(self.device)
95
+ self.model.eval()
96
+
97
+ self.id2label = self.model.config.id2label
98
+ logger.info("Verifier ready. Labels: %s", self.id2label)
99
+
100
+ def verify(
101
+ self,
102
+ claim_with_evidence: ClaimWithEvidence,
103
+ min_evidence_score: float = MIN_EVIDENCE_SCORE,
104
+ ) -> VerificationResult:
105
+ """
106
+ Verify a single claim against its retrieved evidence.
107
+
108
+ If no evidence meets the min_score threshold, returns 'unverifiable'
109
+ with zero confidence.
110
+ """
111
+ claim_text = claim_with_evidence.claim.text
112
+ best = claim_with_evidence.best_evidence
113
+
114
+ # Fallback: no usable evidence
115
+ if best is None or best.score < min_evidence_score:
116
+ logger.debug("No evidence for claim: %s", claim_text)
117
+ return VerificationResult(
118
+ claim=claim_text,
119
+ label=FactLabel.UNVERIFIABLE,
120
+ confidence=0.0,
121
+ evidence=None,
122
+ evidence_score=None,
123
+ raw_scores={},
124
+ reason="No relevant evidence found in the provided context.",
125
+ )
126
+
127
+ # Run NLI: premise=evidence, hypothesis=claim
128
+ return self._run_nli(claim_text, best.sentence, best.score)
129
+
130
+ def verify_batch(
131
+ self,
132
+ claims_with_evidence: list[ClaimWithEvidence],
133
+ min_evidence_score: float = MIN_EVIDENCE_SCORE,
134
+ ) -> list[VerificationResult]:
135
+ """
136
+ Verify a batch of claims using batched NLI inference.
137
+
138
+ Claims without evidence are immediately marked unverifiable.
139
+ Remaining claims are processed in a single forward pass for speed.
140
+ """
141
+ results: list[VerificationResult | None] = [None] * len(claims_with_evidence)
142
+ nli_pairs: list[tuple[int, str, str, float]] = []
143
+
144
+ for i, cwe in enumerate(claims_with_evidence):
145
+ claim_text = cwe.claim.text
146
+ best = cwe.best_evidence
147
+
148
+ if best is None or best.score < min_evidence_score:
149
+ results[i] = VerificationResult(
150
+ claim=claim_text,
151
+ label=FactLabel.UNVERIFIABLE,
152
+ confidence=0.0,
153
+ evidence=None,
154
+ evidence_score=None,
155
+ raw_scores={},
156
+ reason="No relevant evidence found in the provided context.",
157
+ )
158
+ else:
159
+ nli_pairs.append((i, claim_text, best.sentence, best.score))
160
+
161
+ # Batch NLI inference for all claims with evidence
162
+ if nli_pairs:
163
+ indices, claims, evidences, scores = zip(*nli_pairs)
164
+ inputs = self.tokenizer(
165
+ list(evidences), list(claims),
166
+ return_tensors="pt",
167
+ padding=True,
168
+ truncation=True,
169
+ max_length=512,
170
+ ).to(self.device)
171
+
172
+ with torch.no_grad():
173
+ logits = self.model(**inputs).logits
174
+
175
+ all_probs = torch.softmax(logits, dim=-1).cpu()
176
+
177
+ for idx, probs_t, claim, evidence, ev_score in zip(
178
+ indices, all_probs, claims, evidences, scores
179
+ ):
180
+ probs = probs_t.tolist()
181
+ label_probs = {self.id2label[i]: float(p) for i, p in enumerate(probs)}
182
+ predicted_nli = self.id2label[probs_t.argmax().item()]
183
+ fact_label = _NLI_TO_FACT.get(predicted_nli, FactLabel.UNVERIFIABLE)
184
+
185
+ results[idx] = VerificationResult(
186
+ claim=claim,
187
+ label=fact_label,
188
+ confidence=max(probs),
189
+ evidence=evidence,
190
+ evidence_score=ev_score,
191
+ raw_scores=label_probs,
192
+ reason=self._make_reason(fact_label, evidence),
193
+ )
194
+
195
+ return results
196
+
197
+ def _run_nli(
198
+ self, claim: str, evidence: str, evidence_score: float
199
+ ) -> VerificationResult:
200
+ """Run NLI inference on a single claim/evidence pair."""
201
+ inputs = self.tokenizer(
202
+ evidence, claim,
203
+ return_tensors="pt",
204
+ truncation=True,
205
+ max_length=512,
206
+ ).to(self.device)
207
+
208
+ with torch.no_grad():
209
+ logits = self.model(**inputs).logits
210
+
211
+ probs = torch.softmax(logits, dim=-1).squeeze().cpu().tolist()
212
+ label_probs = {self.id2label[i]: float(p) for i, p in enumerate(probs)}
213
+ predicted_nli = self.id2label[logits.argmax().item()]
214
+ fact_label = _NLI_TO_FACT.get(predicted_nli, FactLabel.UNVERIFIABLE)
215
+
216
+ return VerificationResult(
217
+ claim=claim,
218
+ label=fact_label,
219
+ confidence=max(probs),
220
+ evidence=evidence,
221
+ evidence_score=evidence_score,
222
+ raw_scores=label_probs,
223
+ reason=self._make_reason(fact_label, evidence),
224
+ )
225
+
226
+ @staticmethod
227
+ def _make_reason(label: FactLabel, evidence: str) -> str:
228
+ """Generate a human-readable reason for the verdict."""
229
+ ev_short = evidence[:80] + "..." if len(evidence) > 80 else evidence
230
+ if label == FactLabel.SUPPORTED:
231
+ return f"Supported by evidence: \"{ev_short}\""
232
+ elif label == FactLabel.CONTRADICTED:
233
+ return f"Contradicts evidence: \"{ev_short}\""
234
+ else:
235
+ return f"Evidence is neutral — neither confirms nor denies: \"{ev_short}\""
requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ torch>=2.0
2
+ transformers>=4.36
3
+ sentence-transformers>=2.2
4
+ faiss-cpu>=1.7
5
+ scikit-learn>=1.3
6
+ pydantic>=2.0
7
+ accelerate>=0.25
8
+ numpy>=1.24
9
+ gradio>=4.0