# lexenvs-harbor/src/lexenvs/services/objective_evaluator_service.py
"""Objective evaluator service.
Evaluates agent answers using the deterministic EV calculator and factual
fidelity scorer. No expert-provided reference values — ground truth is
computed from the card database and user profile.
"""
from __future__ import annotations
import logging
import re
from difflib import SequenceMatcher
from typing import Any
from lexenvs.config.constants import (
compute_composite_reward,
compute_f1_score,
compute_structure_bonus,
)
from lexenvs.schemas.card_database import CardDatabase, UserProfile
from lexenvs.schemas.task import DimensionResult, TaskDefinition
from lexenvs.services.claim_extractor import ClaimExtractor
from lexenvs.services.ev_calculator_service import EVCalculatorService
from lexenvs.services.factual_fidelity_scorer import FactualFidelityScorer
from lexenvs.utils import extract_json_block
logger = logging.getLogger(__name__)
# Tiered structure bonuses for valid JSON (RL gradient signal).
STRUCTURE_BONUS_BASIC = 0.03
STRUCTURE_BONUS_PARTIAL = 0.06
STRUCTURE_BONUS_FULL = 0.10
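# A hedged sketch of how the tiers are intended to apply (the actual gating
# lives in compute_structure_bonus, imported above; this mapping is assumed):
#   no parseable JSON block               -> 0.0
#   valid JSON, required fields missing   -> STRUCTURE_BONUS_BASIC / _PARTIAL
#   valid JSON with all required fields   -> STRUCTURE_BONUS_FULL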
# Common abbreviations/aliases for card matching in prose
CARD_ALIASES: dict[str, list[str]] = {
"american express platinum": ["amex platinum", "amex plat", "the platinum card"],
"american express gold": ["amex gold"],
"american express blue business plus": ["amex bbb", "blue business plus", "bbp"],
"chase sapphire preferred": ["csp", "sapphire preferred"],
"chase sapphire reserve": ["csr", "sapphire reserve"],
"chase ink business preferred": ["cip", "ink preferred", "ink business preferred"],
"chase ink business cash": ["cic", "ink cash", "ink business cash"],
"chase ink business unlimited": ["ciu", "ink unlimited", "ink business unlimited"],
"capital one venture x": ["venture x"],
"capital one venture": ["venture card"],
"citi staa": ["staa"],
"chase united explorer": ["united explorer"],
"chase united club": ["united club infinite"],
}
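# Alias lookup is a plain dict access over lowercased card names, e.g.:
#   CARD_ALIASES.get("chase sapphire reserve")   # -> ["csr", "sapphire reserve"]
#   CARD_ALIASES.get("some unknown card", [])    # -> [] (no aliases registered)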
class ObjectiveEvaluatorService:
"""Evaluates agent answers using computed ground truth.
Scoring dimensions (all automated):
- card_selection: F1 of recommended cards vs. optimal set
- ev_accuracy: how close agent's total_ev_usd is to computed truth
- factual_fidelity: accuracy of factual claims about cards
- constraint_compliance: did the agent respect user/issuer constraints
"""
def __init__(
self,
card_db: CardDatabase,
claim_extractor: ClaimExtractor | None = None,
) -> None:
self.card_db = card_db
self.ev_calc = EVCalculatorService(card_db)
self.fidelity_scorer = FactualFidelityScorer(card_db)
self.claim_extractor = claim_extractor
def evaluate(
self,
task: TaskDefinition,
agent_answer: str,
) -> tuple[float, list[DimensionResult], dict[str, object]]:
"""Evaluate an agent's answer using objective scoring.
The task's scoring dimensions must include objective-mode config:
- ev_accuracy.reference.user_profile: UserProfile dict
        - ev_accuracy.reference.expected_card_ids: list of card IDs
- constraint_compliance.checks.expected_cards: list of card names
- factual_fidelity.reference.extracted_claims: claims from extractor LLM
Returns:
(reward, dimension_results, metadata)
"""
parsed_json = extract_json_block(agent_answer)
structure_bonus, json_valid, json_has_required = compute_structure_bonus(
parsed_json, STRUCTURE_BONUS_BASIC, STRUCTURE_BONUS_PARTIAL, STRUCTURE_BONUS_FULL
)
dimension_results: list[DimensionResult] = []
hard_constraint_violated = False
for dim_name, dim in task.scoring.dimensions.items():
score = self._score_dimension(
dim_name, dim, parsed_json, task, agent_answer=agent_answer
)
if score is not None:
if (
dim.hard_constraint
and task.scoring.hard_constraint_failure_zeroes_dimension
and score == 0.0
):
hard_constraint_violated = True
dimension_results.append(
DimensionResult(
dimension=dim_name,
score=score,
weight=dim.weight,
status="scored",
)
)
else:
dimension_results.append(
DimensionResult(
dimension=dim_name,
score=None,
weight=dim.weight,
status="pending_reference",
)
)
scored = [(r.score, r.weight) for r in dimension_results if r.score is not None]
reward = compute_composite_reward(scored, structure_bonus, hard_constraint_violated)
metadata: dict[str, object] = {
"json_valid": json_valid,
"json_has_required_fields": json_has_required,
"hard_constraint_violated": hard_constraint_violated,
"evaluation_mode": "objective",
}
# Add computed reference EV to metadata for transparency
ev_dim = task.scoring.dimensions.get("ev_accuracy")
if ev_dim and ev_dim.reference:
profile_data = ev_dim.reference.get("user_profile")
card_ids = ev_dim.reference.get("expected_card_ids")
if profile_data and card_ids:
profile = UserProfile(**profile_data)
portfolio = self.ev_calc.compute_portfolio_ev(card_ids, profile)
metadata["computed_reference_ev_usd"] = portfolio.total_ev_usd
metadata["computed_ev_breakdown"] = {
"signup_bonuses_usd": portfolio.total_signup_bonuses_usd,
"ongoing_rewards_usd": portfolio.total_ongoing_rewards_usd,
"credits_usd": portfolio.total_credits_usd,
"perks_usd": portfolio.total_perks_usd,
"annual_fees_usd": portfolio.total_annual_fees_usd,
}
if parsed_json and isinstance(parsed_json, dict):
metadata["parsed_fields"] = list(parsed_json.keys())
return reward, dimension_results, metadata
@staticmethod
def _prose_effort_score(answer: str) -> float:
"""Score answer effort/quality heuristically. Returns 0.0-1.0.
Rewards longer, more detailed answers that show reasoning.
This gives gradient signal even before card matching kicks in.
"""
if not answer or not answer.strip():
return 0.0
word_count = len(answer.split())
# Length tiers: <10 words=0, 10-50=0.2, 50-150=0.5, 150-400=0.8, 400+=1.0
if word_count < 10:
return 0.0
if word_count < 50:
return 0.2
if word_count < 150:
return 0.5
if word_count < 400:
return 0.8
return 1.0
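    # Worked examples of the tiers above (illustrative inputs only):
    #   _prose_effort_score("Open the CSR.")              -> 0.0 (3 words)
    #   _prose_effort_score(<a ~200-word recommendation>) -> 0.8 (150-400 tier)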
@staticmethod
def _fuzzy_card_match_prose(expected_card: str, answer_lower: str) -> float:
"""Check if a card name or any of its aliases appear in prose.
Returns a match score: 1.0 for exact/alias match, 0.5 for fuzzy
substring match, 0.0 for no match. Uses both alias lookup and
sliding-window fuzzy matching so creative abbreviations still score.
"""
card_lower = expected_card.lower()
if card_lower in answer_lower:
return 1.0
# Check aliases
aliases = CARD_ALIASES.get(card_lower, [])
if any(alias in answer_lower for alias in aliases):
return 1.0
# Fuzzy sliding window: check each word-window of the answer
# against the card name. This catches partial names like
# "Sapphire Preferred" matching "Chase Sapphire Preferred".
card_words = card_lower.split()
answer_words = answer_lower.split()
window_size = len(card_words)
if window_size == 0 or len(answer_words) < window_size:
return 0.0
best_sim = 0.0
for i in range(len(answer_words) - window_size + 1):
window = " ".join(answer_words[i : i + window_size])
sim = SequenceMatcher(None, card_lower, window).ratio()
best_sim = max(best_sim, sim)
if best_sim >= 0.8:
return 0.5
return 0.0
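    # Worked example (illustrative; exact ratios depend on difflib's
    # longest-matching-block heuristic):
    #   expected "chase sapphire preferred" vs. an answer containing the typo
    #   "chase sapphire preffered": no exact or alias hit, but the 3-word
    #   window scores a SequenceMatcher ratio of ~0.96 >= 0.8, so 0.5 is returned.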
def _score_dimension(
self,
dim_name: str,
dim: Any,
parsed_json: dict[str, Any] | None,
task: TaskDefinition,
agent_answer: str = "",
) -> float | None:
"""Score a single dimension."""
if parsed_json is None:
# No JSON — use prose-based fallbacks for reward variance
if dim_name == "card_selection" and agent_answer:
return self._score_card_selection_from_prose(dim, agent_answer)
if dim_name == "constraint_compliance" and agent_answer:
return self._score_constraint_compliance_from_prose(dim, agent_answer)
if dim_name == "ev_accuracy" and agent_answer:
return self._score_ev_accuracy_from_prose(dim, agent_answer)
if dim_name == "factual_fidelity" and agent_answer:
return self._score_factual_fidelity(dim, {}, agent_answer=agent_answer)
return 0.0
if dim_name == "ev_accuracy":
return self._score_ev_accuracy(dim, parsed_json)
if dim_name == "card_selection":
return self._score_card_selection(dim, parsed_json)
if dim_name == "constraint_compliance":
return self._score_constraint_compliance(dim, parsed_json)
if dim_name == "factual_fidelity":
return self._score_factual_fidelity(dim, parsed_json, agent_answer=agent_answer)
        # Remaining dimensions (e.g. human-judged) are not auto-scorable here;
        # evaluate() reports a None score as "pending_reference".
        return None
def _score_ev_accuracy(self, dim: Any, parsed_json: dict[str, Any]) -> float:
"""Score EV accuracy against computed ground truth."""
ref = dim.reference
if not ref:
return 0.0
profile_data = ref.get("user_profile")
card_ids = ref.get("expected_card_ids")
if not profile_data or not card_ids:
# Fall back to static reference_ev_usd if provided
ref_ev = ref.get("reference_ev_usd")
if ref_ev is None:
return 0.0
return self._ev_accuracy_score(
ref_ev, parsed_json.get("total_ev_usd"), ref.get("ev_tolerance_pct", 0.05)
)
# Compute ground truth from card database
profile = UserProfile(**profile_data)
portfolio = self.ev_calc.compute_portfolio_ev(card_ids, profile)
ref_ev = portfolio.total_ev_usd
tolerance = ref.get("ev_tolerance_pct", 0.05)
return self._ev_accuracy_score(ref_ev, parsed_json.get("total_ev_usd"), tolerance)
def _ev_accuracy_score(self, ref_ev: float, agent_ev: Any, tolerance: float) -> float:
"""Linear decay EV accuracy scoring."""
if agent_ev is None:
return 0.0
try:
agent_ev = float(agent_ev)
ref_ev = float(ref_ev)
except (TypeError, ValueError):
return 0.0
if ref_ev == 0:
return 1.0 if agent_ev == 0 else 0.0
error_pct = abs(agent_ev - ref_ev) / abs(ref_ev)
if error_pct <= tolerance:
return 1.0
max_error = tolerance * 5 # wider decay for more partial credit
if error_pct >= max_error:
return 0.0
return float(1.0 - (error_pct - tolerance) / (max_error - tolerance))
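    # Worked example of the linear decay (illustrative numbers):
    #   ref_ev=1000, agent_ev=1150, tolerance=0.05
    #   error_pct = 0.15, max_error = 0.25
    #   score = 1 - (0.15 - 0.05) / (0.25 - 0.05) = 0.5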
def _score_card_selection(self, dim: Any, parsed_json: dict[str, Any]) -> float:
"""Score card selection using F1 against expected cards with fuzzy matching."""
source = dim.checks or dim.reference
if not source:
return 0.0
expected_cards = source.get("expected_cards")
agent_cards = parsed_json.get("recommended_cards")
        # Agent JSON is untrusted: guard against non-list shapes and coerce entries
        if expected_cards is None or not isinstance(agent_cards, list):
            return 0.0
        expected_set = {str(c).lower().strip() for c in expected_cards}
        agent_set = {str(c).lower().strip() for c in agent_cards}
return compute_f1_score(expected_set, agent_set)
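    # Assuming compute_f1_score is the standard set F1 (defined in
    # lexenvs.config.constants, not shown here), a worked example:
    #   expected = {"csr", "amex gold", "venture x"}
    #   agent    = {"csr", "amex gold", "bbp"}
    #   precision = recall = 2/3, so F1 = 2/3 ~= 0.667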
def _score_card_selection_from_prose(self, dim: Any, agent_answer: str) -> float:
"""Fallback: check if expected card names appear in prose.
Uses alias matching (e.g. "CSR" -> "Chase Sapphire Reserve").
Blends card match score (0.7 weight) with effort score (0.3 weight)
so the model gets gradient signal for writing detailed answers even
before it nails the exact right cards.
Capped at 0.7 to incentivize proper JSON output.
"""
source = dim.checks or dim.reference
if not source:
return self._prose_effort_score(agent_answer) * 0.15
expected_cards = source.get("expected_cards")
if not expected_cards:
return self._prose_effort_score(agent_answer) * 0.15
answer_lower = agent_answer.lower()
matches = sum(self._fuzzy_card_match_prose(c, answer_lower) for c in expected_cards)
card_score = min(matches / len(expected_cards), 1.0)
effort = self._prose_effort_score(agent_answer)
# Blend: 70% card matching, 30% effort — capped at 0.7
return (card_score * 0.7 + effort * 0.3) * 0.7
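    # Worked example of the blend and cap (illustrative numbers):
    #   every expected card matched (card_score=1.0), ~200-word answer (effort=0.8)
    #   (1.0 * 0.7 + 0.8 * 0.3) * 0.7 = 0.94 * 0.7 = 0.658
    # Even a perfect prose answer stays below the 1.0 available to valid JSON.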
def _score_constraint_compliance_from_prose(self, dim: Any, agent_answer: str) -> float:
"""Fallback: check if expected cards appear in prose.
Uses alias matching. Blends card match with effort score.
Capped at 0.6 to incentivize JSON output.
"""
source = dim.checks or dim.reference
if not source:
return self._prose_effort_score(agent_answer) * 0.15
expected_cards = source.get("expected_cards")
if not expected_cards:
return self._prose_effort_score(agent_answer) * 0.15
answer_lower = agent_answer.lower()
matches = sum(self._fuzzy_card_match_prose(c, answer_lower) for c in expected_cards)
card_score = min(matches / len(expected_cards), 1.0)
effort = self._prose_effort_score(agent_answer)
return (card_score * 0.7 + effort * 0.3) * 0.6
def _score_ev_accuracy_from_prose(self, dim: Any, agent_answer: str) -> float:
"""Fallback: look for dollar amounts in prose and compare to reference.
Gives partial credit for mentioning any dollar amounts (shows the model
is attempting EV calculation). Higher credit for amounts close to the
reference. Capped at 0.5 to incentivize structured JSON output.
"""
ref = dim.reference
if not ref:
return self._prose_effort_score(agent_answer) * 0.1
# Base credit for effort
effort = self._prose_effort_score(agent_answer)
# Try to extract dollar amounts from prose
amounts = re.findall(r"\$[\d,]+(?:\.\d{1,2})?", agent_answer)
if not amounts:
# No dollar amounts but wrote something — small credit for effort
return effort * 0.1
# Get reference EV
profile_data = ref.get("user_profile")
card_ids = ref.get("expected_card_ids")
ref_ev = None
if profile_data and card_ids:
profile = UserProfile(**profile_data)
portfolio = self.ev_calc.compute_portfolio_ev(card_ids, profile)
ref_ev = portfolio.total_ev_usd
else:
ref_ev = ref.get("reference_ev_usd")
if ref_ev is None or ref_ev == 0:
return 0.15 # Credit for attempting EV calculation
# Check if any extracted amount is close to reference
best_score = 0.15 # Base credit for mentioning dollar amounts
for amt_str in amounts:
try:
amt = float(amt_str.replace("$", "").replace(",", ""))
error_pct = abs(amt - ref_ev) / abs(ref_ev)
if error_pct <= 0.05:
best_score = max(best_score, 0.5)
elif error_pct <= 0.15:
best_score = max(best_score, 0.4)
elif error_pct <= 0.30:
best_score = max(best_score, 0.3)
elif error_pct <= 0.50:
best_score = max(best_score, 0.2)
except ValueError:
continue
return best_score
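    # The dollar-amount regex in action (verifiable with re.findall):
    #   re.findall(r"\$[\d,]+(?:\.\d{1,2})?", "worth $1,234.56 after a $95 fee")
    #   -> ["$1,234.56", "$95"]
    # Each hit is parsed and compared to the reference EV via the error bands above.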
def _score_constraint_compliance(self, dim: Any, parsed_json: dict[str, Any]) -> float:
"""Score constraint compliance (card selection + housing)."""
source = dim.checks or dim.reference
if not source:
return 0.0
scores: list[tuple[float, float]] = []
# Card matching via F1 with fuzzy matching
expected_cards = source.get("expected_cards")
agent_cards = parsed_json.get("recommended_cards")
        if expected_cards is not None and isinstance(agent_cards, list):
            expected_set = {str(c).lower().strip() for c in expected_cards}
            agent_set = {str(c).lower().strip() for c in agent_cards}
if not expected_set and not agent_set:
scores.append((1.0, 0.7))
elif expected_set:
f1 = compute_f1_score(expected_set, agent_set)
scores.append((f1, 0.7))
# Skip when expected_set is empty — task doesn't require specific cards
# Housing option match
expected_housing = source.get("expected_housing_option")
agent_housing = parsed_json.get("housing_option")
if expected_housing is not None:
match = (
agent_housing is not None
and str(agent_housing).lower().strip() == str(expected_housing).lower().strip()
)
scores.append((1.0 if match else 0.0, 0.3))
if not scores:
return 0.0
total_weight = sum(w for _, w in scores)
return sum(s * w for s, w in scores) / total_weight if total_weight else 0.0
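    # Worked example of the weighted average (illustrative numbers):
    #   card F1 = 0.8 at weight 0.7, housing match = 1.0 at weight 0.3
    #   score = (0.8 * 0.7 + 1.0 * 0.3) / (0.7 + 0.3) = 0.86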
def _score_factual_fidelity(
self,
dim: Any,
parsed_json: dict[str, Any],
agent_answer: str = "",
) -> float | None:
"""Score factual fidelity of agent claims.
Extracts claims from the agent's answer using an LLM (if available),
then verifies them against the card database. Falls back to
pre-extracted claims in the task reference if no extractor is configured.
"""
ref = dim.reference or {}
# Try LLM extraction first (preferred — works for any answer)
extracted_claims = None
if self.claim_extractor and agent_answer:
extracted_claims = self.claim_extractor.extract_claims(agent_answer)
if extracted_claims:
n = len(extracted_claims.get("per_card_claims", []))
logger.debug("Extracted %d card claims via LLM", n)
# Fall back to pre-extracted claims in task reference
if not extracted_claims:
extracted_claims = ref.get("extracted_claims")
if not extracted_claims:
# No extractor and no pre-extracted claims — not scorable
return None
        # Reference EV for normalization (falls back to 1000 when missing, null, or zero)
        ev_dim_ref = ref.get("reference_ev_usd") or 1000
result = self.fidelity_scorer.score_claims(extracted_claims, ev_dim_ref)
return result.score
def _compute_reward(
self,
results: list[DimensionResult],
structure_bonus: float,
hard_constraint_violated: bool,
) -> float:
"""Compute composite reward."""
scored = [(r.score, r.weight) for r in results if r.score is not None]
return compute_composite_reward(scored, structure_bonus, hard_constraint_violated)
def create_objective_evaluator_service(
card_db: CardDatabase,
) -> ObjectiveEvaluatorService:
"""Factory for svcs registration."""
return ObjectiveEvaluatorService(card_db)
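# Example wiring (a minimal sketch; assumes a populated CardDatabase and a
# TaskDefinition carrying the objective-mode scoring config described in
# evaluate(), both constructed elsewhere in lexenvs):
#
#   service = create_objective_evaluator_service(card_db)
#   reward, dimension_results, metadata = service.evaluate(task, agent_answer)
#   metadata["evaluation_mode"]  # -> "objective"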