# lexenvs-harbor/src/lexenvs/services/objective_evaluator_service.py
"""Objective evaluator service.
Evaluates agent answers using the deterministic EV calculator and factual
fidelity scorer. No expert-provided reference values — ground truth is
computed from the card database and user profile.
"""
from __future__ import annotations
import logging
import re
from difflib import SequenceMatcher
from typing import Any
from lexenvs.config.constants import (
compute_composite_reward,
compute_f1_score,
compute_structure_bonus,
)
from lexenvs.schemas.card_database import CardDatabase, UserProfile
from lexenvs.schemas.task import DimensionResult, TaskDefinition
from lexenvs.services.claim_extractor import ClaimExtractor
from lexenvs.services.ev_calculator_service import EVCalculatorService
from lexenvs.services.factual_fidelity_scorer import FactualFidelityScorer
from lexenvs.utils import extract_json_block
logger = logging.getLogger(__name__)
# Tiered structure bonuses for valid JSON (RL gradient signal).
STRUCTURE_BONUS_BASIC = 0.03
STRUCTURE_BONUS_PARTIAL = 0.06
STRUCTURE_BONUS_FULL = 0.10
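# A hedged sketch of how the tiers are intended to apply (the actual gating
# lives in compute_structure_bonus, imported above; this mapping is assumed):
#   no parseable JSON block               -> 0.0
#   valid JSON, required fields missing   -> STRUCTURE_BONUS_BASIC / _PARTIAL
#   valid JSON with all required fields   -> STRUCTURE_BONUS_FULL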
# Common abbreviations/aliases for card matching in prose
CARD_ALIASES: dict[str, list[str]] = {
"american express platinum": ["amex platinum", "amex plat", "the platinum card"],
"american express gold": ["amex gold"],
"american express blue business plus": ["amex bbb", "blue business plus", "bbp"],
"chase sapphire preferred": ["csp", "sapphire preferred"],
"chase sapphire reserve": ["csr", "sapphire reserve"],
"chase ink business preferred": ["cip", "ink preferred", "ink business preferred"],
"chase ink business cash": ["cic", "ink cash", "ink business cash"],
"chase ink business unlimited": ["ciu", "ink unlimited", "ink business unlimited"],
"capital one venture x": ["venture x"],
"capital one venture": ["venture card"],
"citi staa": ["staa"],
"chase united explorer": ["united explorer"],
"chase united club": ["united club infinite"],
}
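# Alias lookup is a plain dict access over lowercased card names, e.g.:
#   CARD_ALIASES.get("chase sapphire reserve")   # -> ["csr", "sapphire reserve"]
#   CARD_ALIASES.get("some unknown card", [])    # -> [] (no aliases registered)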
class ObjectiveEvaluatorService:
"""Evaluates agent answers using computed ground truth.
Scoring dimensions (all automated):
- card_selection: F1 of recommended cards vs. optimal set
- ev_accuracy: how close agent's total_ev_usd is to computed truth
- factual_fidelity: accuracy of factual claims about cards
- constraint_compliance: did the agent respect user/issuer constraints
"""
def __init__(
self,
card_db: CardDatabase,
claim_extractor: ClaimExtractor | None = None,
) -> None:
self.card_db = card_db
self.ev_calc = EVCalculatorService(card_db)
self.fidelity_scorer = FactualFidelityScorer(card_db)
self.claim_extractor = claim_extractor
def evaluate(
self,
task: TaskDefinition,
agent_answer: str,
) -> tuple[float, list[DimensionResult], dict[str, object]]:
"""Evaluate an agent's answer using objective scoring.
The task's scoring dimensions must include objective-mode config:
- ev_accuracy.reference.user_profile: UserProfile dict
        - ev_accuracy.reference.expected_card_ids: list of card IDs
- constraint_compliance.checks.expected_cards: list of card names
- factual_fidelity.reference.extracted_claims: claims from extractor LLM
Returns:
(reward, dimension_results, metadata)
"""
parsed_json = extract_json_block(agent_answer)
structure_bonus, json_valid, json_has_required = compute_structure_bonus(
parsed_json, STRUCTURE_BONUS_BASIC, STRUCTURE_BONUS_PARTIAL, STRUCTURE_BONUS_FULL
)
dimension_results: list[DimensionResult] = []
hard_constraint_violated = False
for dim_name, dim in task.scoring.dimensions.items():
score = self._score_dimension(
dim_name, dim, parsed_json, task, agent_answer=agent_answer
)
if score is not None:
if (
dim.hard_constraint
and task.scoring.hard_constraint_failure_zeroes_dimension
and score == 0.0
):
hard_constraint_violated = True
dimension_results.append(
DimensionResult(
dimension=dim_name,
score=score,
weight=dim.weight,
status="scored",
)
)
else:
dimension_results.append(
DimensionResult(
dimension=dim_name,
score=None,
weight=dim.weight,
status="pending_reference",
)
)
scored = [(r.score, r.weight) for r in dimension_results if r.score is not None]
reward = compute_composite_reward(scored, structure_bonus, hard_constraint_violated)
metadata: dict[str, object] = {
"json_valid": json_valid,
"json_has_required_fields": json_has_required,
"hard_constraint_violated": hard_constraint_violated,
"evaluation_mode": "objective",
}
# Add computed reference EV to metadata for transparency
ev_dim = task.scoring.dimensions.get("ev_accuracy")
if ev_dim and ev_dim.reference:
profile_data = ev_dim.reference.get("user_profile")
card_ids = ev_dim.reference.get("expected_card_ids")
if profile_data and card_ids:
profile = UserProfile(**profile_data)
portfolio = self.ev_calc.compute_portfolio_ev(card_ids, profile)
metadata["computed_reference_ev_usd"] = portfolio.total_ev_usd
metadata["computed_ev_breakdown"] = {
"signup_bonuses_usd": portfolio.total_signup_bonuses_usd,
"ongoing_rewards_usd": portfolio.total_ongoing_rewards_usd,
"credits_usd": portfolio.total_credits_usd,
"perks_usd": portfolio.total_perks_usd,
"annual_fees_usd": portfolio.total_annual_fees_usd,
}
if parsed_json and isinstance(parsed_json, dict):
metadata["parsed_fields"] = list(parsed_json.keys())
return reward, dimension_results, metadata
@staticmethod
def _prose_effort_score(answer: str) -> float:
"""Score answer effort/quality heuristically. Returns 0.0-1.0.
Rewards longer, more detailed answers that show reasoning.
This gives gradient signal even before card matching kicks in.
"""
if not answer or not answer.strip():
return 0.0
word_count = len(answer.split())
# Length tiers: <10 words=0, 10-50=0.2, 50-150=0.5, 150-400=0.8, 400+=1.0
if word_count < 10:
return 0.0
if word_count < 50:
return 0.2
if word_count < 150:
return 0.5
if word_count < 400:
return 0.8
return 1.0
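    # Worked examples of the tiers above (illustrative inputs only):
    #   _prose_effort_score("Open the CSR.")              -> 0.0 (3 words)
    #   _prose_effort_score(<a ~200-word recommendation>) -> 0.8 (150-400 tier)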
@staticmethod
def _fuzzy_card_match_prose(expected_card: str, answer_lower: str) -> float:
"""Check if a card name or any of its aliases appear in prose.
Returns a match score: 1.0 for exact/alias match, 0.5 for fuzzy
substring match, 0.0 for no match. Uses both alias lookup and
sliding-window fuzzy matching so creative abbreviations still score.
"""
card_lower = expected_card.lower()
if card_lower in answer_lower:
return 1.0
# Check aliases
aliases = CARD_ALIASES.get(card_lower, [])
if any(alias in answer_lower for alias in aliases):
return 1.0
# Fuzzy sliding window: check each word-window of the answer
# against the card name. This catches partial names like
# "Sapphire Preferred" matching "Chase Sapphire Preferred".
card_words = card_lower.split()
answer_words = answer_lower.split()
window_size = len(card_words)
if window_size == 0 or len(answer_words) < window_size:
return 0.0
best_sim = 0.0
for i in range(len(answer_words) - window_size + 1):
window = " ".join(answer_words[i : i + window_size])
sim = SequenceMatcher(None, card_lower, window).ratio()
best_sim = max(best_sim, sim)
if best_sim >= 0.8:
return 0.5
return 0.0
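    # Worked example (illustrative; exact ratios depend on difflib's
    # longest-matching-block heuristic):
    #   expected "chase sapphire preferred" vs. an answer containing the typo
    #   "chase sapphire preffered": no exact or alias hit, but the 3-word
    #   window scores a SequenceMatcher ratio of ~0.96 >= 0.8, so 0.5 is returned.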
def _score_dimension(
self,
dim_name: str,
dim: Any,
parsed_json: dict[str, Any] | None,
task: TaskDefinition,
agent_answer: str = "",
) -> float | None:
"""Score a single dimension."""
if parsed_json is None:
# No JSON — use prose-based fallbacks for reward variance
if dim_name == "card_selection" and agent_answer:
return self._score_card_selection_from_prose(dim, agent_answer)
if dim_name == "constraint_compliance" and agent_answer:
return self._score_constraint_compliance_from_prose(dim, agent_answer)
if dim_name == "ev_accuracy" and agent_answer:
return self._score_ev_accuracy_from_prose(dim, agent_answer)
if dim_name == "factual_fidelity" and agent_answer:
return self._score_factual_fidelity(dim, {}, agent_answer=agent_answer)
return 0.0
if dim_name == "ev_accuracy":
return self._score_ev_accuracy(dim, parsed_json)
if dim_name == "card_selection":
return self._score_card_selection(dim, parsed_json)
if dim_name == "constraint_compliance":
return self._score_constraint_compliance(dim, parsed_json)
if dim_name == "factual_fidelity":
return self._score_factual_fidelity(dim, parsed_json, agent_answer=agent_answer)
        # Remaining dimensions (e.g. human-judged) are not auto-scorable here;
        # evaluate() reports a None score as "pending_reference".
        return None
def _score_ev_accuracy(self, dim: Any, parsed_json: dict[str, Any]) -> float:
"""Score EV accuracy against computed ground truth."""
ref = dim.reference
if not ref:
return 0.0
profile_data = ref.get("user_profile")
card_ids = ref.get("expected_card_ids")
if not profile_data or not card_ids:
# Fall back to static reference_ev_usd if provided
ref_ev = ref.get("reference_ev_usd")
if ref_ev is None:
return 0.0
return self._ev_accuracy_score(
ref_ev, parsed_json.get("total_ev_usd"), ref.get("ev_tolerance_pct", 0.05)
)
# Compute ground truth from card database
profile = UserProfile(**profile_data)
portfolio = self.ev_calc.compute_portfolio_ev(card_ids, profile)
ref_ev = portfolio.total_ev_usd
tolerance = ref.get("ev_tolerance_pct", 0.05)
return self._ev_accuracy_score(ref_ev, parsed_json.get("total_ev_usd"), tolerance)
def _ev_accuracy_score(self, ref_ev: float, agent_ev: Any, tolerance: float) -> float:
"""Linear decay EV accuracy scoring."""
if agent_ev is None:
return 0.0
try:
agent_ev = float(agent_ev)
ref_ev = float(ref_ev)
except (TypeError, ValueError):
return 0.0
if ref_ev == 0:
return 1.0 if agent_ev == 0 else 0.0
error_pct = abs(agent_ev - ref_ev) / abs(ref_ev)
if error_pct <= tolerance:
return 1.0
max_error = tolerance * 5 # wider decay for more partial credit
if error_pct >= max_error:
return 0.0
return float(1.0 - (error_pct - tolerance) / (max_error - tolerance))
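    # Worked example of the linear decay (illustrative numbers):
    #   ref_ev=1000, agent_ev=1150, tolerance=0.05
    #   error_pct = 0.15, max_error = 0.25
    #   score = 1 - (0.15 - 0.05) / (0.25 - 0.05) = 0.5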
def _score_card_selection(self, dim: Any, parsed_json: dict[str, Any]) -> float:
"""Score card selection using F1 against expected cards with fuzzy matching."""
source = dim.checks or dim.reference
if not source:
return 0.0
expected_cards = source.get("expected_cards")
agent_cards = parsed_json.get("recommended_cards")
        # Agent JSON is untrusted: guard against non-list shapes and coerce entries
        if expected_cards is None or not isinstance(agent_cards, list):
            return 0.0
        expected_set = {str(c).lower().strip() for c in expected_cards}
        agent_set = {str(c).lower().strip() for c in agent_cards}
return compute_f1_score(expected_set, agent_set)
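    # Assuming compute_f1_score is the standard set F1 (defined in
    # lexenvs.config.constants, not shown here), a worked example:
    #   expected = {"csr", "amex gold", "venture x"}
    #   agent    = {"csr", "amex gold", "bbp"}
    #   precision = recall = 2/3, so F1 = 2/3 ~= 0.667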
def _score_card_selection_from_prose(self, dim: Any, agent_answer: str) -> float:
"""Fallback: check if expected card names appear in prose.
Uses alias matching (e.g. "CSR" -> "Chase Sapphire Reserve").
Blends card match score (0.7 weight) with effort score (0.3 weight)
so the model gets gradient signal for writing detailed answers even
before it nails the exact right cards.
Capped at 0.7 to incentivize proper JSON output.
"""
source = dim.checks or dim.reference
if not source:
return self._prose_effort_score(agent_answer) * 0.15
expected_cards = source.get("expected_cards")
if not expected_cards:
return self._prose_effort_score(agent_answer) * 0.15
answer_lower = agent_answer.lower()
matches = sum(self._fuzzy_card_match_prose(c, answer_lower) for c in expected_cards)
card_score = min(matches / len(expected_cards), 1.0)
effort = self._prose_effort_score(agent_answer)
# Blend: 70% card matching, 30% effort — capped at 0.7
return (card_score * 0.7 + effort * 0.3) * 0.7
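    # Worked example of the blend and cap (illustrative numbers):
    #   every expected card matched (card_score=1.0), ~200-word answer (effort=0.8)
    #   (1.0 * 0.7 + 0.8 * 0.3) * 0.7 = 0.94 * 0.7 = 0.658
    # Even a perfect prose answer stays below the 1.0 available to valid JSON.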
def _score_constraint_compliance_from_prose(self, dim: Any, agent_answer: str) -> float:
"""Fallback: check if expected cards appear in prose.
Uses alias matching. Blends card match with effort score.
Capped at 0.6 to incentivize JSON output.
"""
source = dim.checks or dim.reference
if not source:
return self._prose_effort_score(agent_answer) * 0.15
expected_cards = source.get("expected_cards")
if not expected_cards:
return self._prose_effort_score(agent_answer) * 0.15
answer_lower = agent_answer.lower()
matches = sum(self._fuzzy_card_match_prose(c, answer_lower) for c in expected_cards)
card_score = min(matches / len(expected_cards), 1.0)
effort = self._prose_effort_score(agent_answer)
return (card_score * 0.7 + effort * 0.3) * 0.6
def _score_ev_accuracy_from_prose(self, dim: Any, agent_answer: str) -> float:
"""Fallback: look for dollar amounts in prose and compare to reference.
Gives partial credit for mentioning any dollar amounts (shows the model
is attempting EV calculation). Higher credit for amounts close to the
reference. Capped at 0.5 to incentivize structured JSON output.
"""
ref = dim.reference
if not ref:
return self._prose_effort_score(agent_answer) * 0.1
# Base credit for effort
effort = self._prose_effort_score(agent_answer)
# Try to extract dollar amounts from prose
amounts = re.findall(r"\$[\d,]+(?:\.\d{1,2})?", agent_answer)
if not amounts:
# No dollar amounts but wrote something — small credit for effort
return effort * 0.1
# Get reference EV
profile_data = ref.get("user_profile")
card_ids = ref.get("expected_card_ids")
ref_ev = None
if profile_data and card_ids:
profile = UserProfile(**profile_data)
portfolio = self.ev_calc.compute_portfolio_ev(card_ids, profile)
ref_ev = portfolio.total_ev_usd
else:
ref_ev = ref.get("reference_ev_usd")
if ref_ev is None or ref_ev == 0:
return 0.15 # Credit for attempting EV calculation
# Check if any extracted amount is close to reference
best_score = 0.15 # Base credit for mentioning dollar amounts
for amt_str in amounts:
try:
amt = float(amt_str.replace("$", "").replace(",", ""))
error_pct = abs(amt - ref_ev) / abs(ref_ev)
if error_pct <= 0.05:
best_score = max(best_score, 0.5)
elif error_pct <= 0.15:
best_score = max(best_score, 0.4)
elif error_pct <= 0.30:
best_score = max(best_score, 0.3)
elif error_pct <= 0.50:
best_score = max(best_score, 0.2)
except ValueError:
continue
return best_score
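    # The dollar-amount regex in action (verifiable with re.findall):
    #   re.findall(r"\$[\d,]+(?:\.\d{1,2})?", "worth $1,234.56 after a $95 fee")
    #   -> ["$1,234.56", "$95"]
    # Each hit is parsed and compared to the reference EV via the error bands above.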
def _score_constraint_compliance(self, dim: Any, parsed_json: dict[str, Any]) -> float:
"""Score constraint compliance (card selection + housing)."""
source = dim.checks or dim.reference
if not source:
return 0.0
scores: list[tuple[float, float]] = []
# Card matching via F1 with fuzzy matching
expected_cards = source.get("expected_cards")
agent_cards = parsed_json.get("recommended_cards")
        if expected_cards is not None and isinstance(agent_cards, list):
            expected_set = {str(c).lower().strip() for c in expected_cards}
            agent_set = {str(c).lower().strip() for c in agent_cards}
if not expected_set and not agent_set:
scores.append((1.0, 0.7))
elif expected_set:
f1 = compute_f1_score(expected_set, agent_set)
scores.append((f1, 0.7))
# Skip when expected_set is empty — task doesn't require specific cards
# Housing option match
expected_housing = source.get("expected_housing_option")
agent_housing = parsed_json.get("housing_option")
if expected_housing is not None:
match = (
agent_housing is not None
and str(agent_housing).lower().strip() == str(expected_housing).lower().strip()
)
scores.append((1.0 if match else 0.0, 0.3))
if not scores:
return 0.0
total_weight = sum(w for _, w in scores)
return sum(s * w for s, w in scores) / total_weight if total_weight else 0.0
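    # Worked example of the weighted average (illustrative numbers):
    #   card F1 = 0.8 at weight 0.7, housing match = 1.0 at weight 0.3
    #   score = (0.8 * 0.7 + 1.0 * 0.3) / (0.7 + 0.3) = 0.86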
def _score_factual_fidelity(
self,
dim: Any,
parsed_json: dict[str, Any],
agent_answer: str = "",
) -> float | None:
"""Score factual fidelity of agent claims.
Extracts claims from the agent's answer using an LLM (if available),
then verifies them against the card database. Falls back to
pre-extracted claims in the task reference if no extractor is configured.
"""
ref = dim.reference or {}
# Try LLM extraction first (preferred — works for any answer)
extracted_claims = None
if self.claim_extractor and agent_answer:
extracted_claims = self.claim_extractor.extract_claims(agent_answer)
if extracted_claims:
n = len(extracted_claims.get("per_card_claims", []))
logger.debug("Extracted %d card claims via LLM", n)
# Fall back to pre-extracted claims in task reference
if not extracted_claims:
extracted_claims = ref.get("extracted_claims")
if not extracted_claims:
# No extractor and no pre-extracted claims — not scorable
return None
        # Reference EV for normalization (falls back to 1000 when missing, null, or zero)
        ev_dim_ref = ref.get("reference_ev_usd") or 1000
result = self.fidelity_scorer.score_claims(extracted_claims, ev_dim_ref)
return result.score
def _compute_reward(
self,
results: list[DimensionResult],
structure_bonus: float,
hard_constraint_violated: bool,
) -> float:
"""Compute composite reward."""
scored = [(r.score, r.weight) for r in results if r.score is not None]
return compute_composite_reward(scored, structure_bonus, hard_constraint_violated)
def create_objective_evaluator_service(
card_db: CardDatabase,
) -> ObjectiveEvaluatorService:
"""Factory for svcs registration."""
return ObjectiveEvaluatorService(card_db)
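# Example wiring (a minimal sketch; assumes a populated CardDatabase and a
# TaskDefinition carrying the objective-mode scoring config described in
# evaluate(), both constructed elsewhere in lexenvs):
#
#   service = create_objective_evaluator_service(card_db)
#   reward, dimension_results, metadata = service.evaluate(task, agent_answer)
#   metadata["evaluation_mode"]  # -> "objective"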