# src/lexenvs/services/claim_extractor.py
"""LLM-based claim extractor for factual fidelity scoring.
Extracts structured factual claims from an agent's answer so they can be
verified against the card database. Uses an OpenAI-compatible API (supports
OpenRouter, OpenAI, or any compatible endpoint).
"""
from __future__ import annotations
import json
import logging
from typing import Any
logger = logging.getLogger(__name__)
# Prompt prepended to the agent's answer before it is sent to the LLM.
# NOTE: this text is runtime behavior, not documentation — extract_claims()
# parses the model's reply against the JSON schema described here, so any
# wording change here must stay in sync with the validation in that method.
EXTRACTION_PROMPT = """\
Extract all factual claims about credit cards from the following agent answer.
Return a JSON object with this exact structure:
{
"per_card_claims": [
{
"card_name": "exact card name as mentioned",
"claimed_annual_fee": <number or null if not mentioned>,
"claimed_earning_rates": {"category": rate_as_number, ...},
"claimed_credits": [{"name": "credit_name", "claimed_value": <annual dollar value>}],
"claimed_perks": [{"name": "perk_name", "claimed_value": <annual dollar value>}],
"claimed_signup_bonus_points": <number or null if not mentioned>,
"claimed_signup_bonus_value_usd": <number or null if not mentioned>
}
]
}
Rules:
- Only extract explicit numerical claims (earning rates, fees, dollar values, point amounts)
- For earning rates, use the multiplier (e.g., "4x on dining" → {"dining": 4})
- For credits, use annual value
(e.g., "$10/month Uber credit" → {"name": "uber_cash", "claimed_value": 120})
- Normalize credit/perk names to snake_case
(e.g., "Uber Cash" → "uber_cash", "airline fee credit" → "airline_fee_credit")
- If a card is mentioned but no numerical claims are made about it, skip it
- Do not infer or calculate — only extract what is explicitly stated
- Return ONLY the JSON object, no other text
Agent answer:
"""
class ClaimExtractor:
    """Extracts structured claims from agent answers using an LLM.

    Uses the OpenAI-compatible chat-completions API format, which works with
    OpenRouter, OpenAI, and other compatible providers.

    The underlying ``httpx.Client`` holds a connection pool; call
    :meth:`close` (or use the instance as a context manager) when finished
    to release the pooled connections.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "anthropic/claude-haiku-4.5",
        base_url: str = "https://openrouter.ai/api/v1",
    ) -> None:
        """Create a client for an OpenAI-compatible endpoint.

        Args:
            api_key: Bearer token sent in the Authorization header.
            model: Model identifier included in every request body.
            base_url: Root URL of the OpenAI-compatible API.
        """
        # Lazy import so the module can be imported without httpx installed.
        import httpx

        self.model = model
        self.client = httpx.Client(
            base_url=base_url,
            headers={
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json",
            },
            timeout=30.0,
        )

    def close(self) -> None:
        """Release the HTTP connection pool. Safe to call more than once."""
        self.client.close()

    def __enter__(self) -> "ClaimExtractor":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        # Ensure the httpx connection pool is released on context exit.
        self.close()

    def extract_claims(self, agent_answer: str) -> dict[str, Any] | None:
        """Extract factual claims from an agent's answer.

        Args:
            agent_answer: Free-form answer text to analyze.

        Returns:
            The structured claims dict (guaranteed to contain a
            ``per_card_claims`` key), or None on any failure. Failures —
            HTTP errors, unparseable JSON, or a missing/invalid structure —
            are logged at WARNING level rather than raised.
        """
        try:
            resp = self.client.post(
                "/chat/completions",
                json={
                    "model": self.model,
                    "max_tokens": 2000,
                    # Deterministic decoding keeps extraction reproducible.
                    "temperature": 0,
                    "messages": [
                        {
                            "role": "user",
                            "content": EXTRACTION_PROMPT + agent_answer,
                        }
                    ],
                },
            )
            resp.raise_for_status()
            text = resp.json()["choices"][0]["message"]["content"].strip()
            text = self._strip_code_fences(text)
            claims = json.loads(text)
            if not isinstance(claims, dict) or "per_card_claims" not in claims:
                logger.warning("LLM returned invalid claims structure")
                return None
            return claims
        except json.JSONDecodeError as e:
            logger.warning("Failed to parse LLM claims output as JSON: %s", e)
            return None
        except Exception as e:
            # Broad catch at this boundary is deliberate: extraction is
            # best-effort, and any network/schema failure should degrade to
            # "no claims" rather than crash the scoring pipeline.
            logger.warning("Claim extraction failed: %s", e)
            return None

    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """Remove markdown ``` fence lines the model may wrap the JSON in."""
        if not text.startswith("```"):
            return text
        return "\n".join(
            ln for ln in text.split("\n") if not ln.strip().startswith("```")
        )