| """Objective evaluator service. | |
| Evaluates agent answers using the deterministic EV calculator and factual | |
| fidelity scorer. No expert-provided reference values — ground truth is | |
| computed from the card database and user profile. | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import re | |
| from difflib import SequenceMatcher | |
| from typing import Any | |
| from lexenvs.config.constants import ( | |
| compute_composite_reward, | |
| compute_f1_score, | |
| compute_structure_bonus, | |
| ) | |
| from lexenvs.schemas.card_database import CardDatabase, UserProfile | |
| from lexenvs.schemas.task import DimensionResult, TaskDefinition | |
| from lexenvs.services.claim_extractor import ClaimExtractor | |
| from lexenvs.services.ev_calculator_service import EVCalculatorService | |
| from lexenvs.services.factual_fidelity_scorer import FactualFidelityScorer | |
| from lexenvs.utils import extract_json_block | |
| logger = logging.getLogger(__name__) | |
| # Tiered structure bonuses for valid JSON (RL gradient signal). | |
| STRUCTURE_BONUS_BASIC = 0.03 | |
| STRUCTURE_BONUS_PARTIAL = 0.06 | |
| STRUCTURE_BONUS_FULL = 0.10 | |
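
# Tier semantics live in compute_structure_bonus; the split is presumably
# BASIC for any parseable JSON, PARTIAL when some required fields are
# present, and FULL when all required fields are present (an assumption;
# see lexenvs.config.constants for the authoritative logic).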

# Common abbreviations/aliases for card matching in prose.
CARD_ALIASES: dict[str, list[str]] = {
    "american express platinum": ["amex platinum", "amex plat", "the platinum card"],
    "american express gold": ["amex gold"],
    "american express blue business plus": ["amex bbb", "blue business plus", "bbp"],
    "chase sapphire preferred": ["csp", "sapphire preferred"],
    "chase sapphire reserve": ["csr", "sapphire reserve"],
    "chase ink business preferred": ["cip", "ink preferred", "ink business preferred"],
    "chase ink business cash": ["cic", "ink cash", "ink business cash"],
    "chase ink business unlimited": ["ciu", "ink unlimited", "ink business unlimited"],
    "capital one venture x": ["venture x"],
    "capital one venture": ["venture card"],
    "citi staa": ["staa"],
    "chase united explorer": ["united explorer"],
    "chase united club": ["united club infinite"],
}
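
# Example: given this table, prose like "I'd pair the CSR with Amex Gold"
# credits both "chase sapphire reserve" (via the "csr" alias) and
# "american express gold" (via "amex gold") in _fuzzy_card_match_prose.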


class ObjectiveEvaluatorService:
    """Evaluates agent answers using computed ground truth.

    Scoring dimensions (all automated):
    - card_selection: F1 of recommended cards vs. optimal set
    - ev_accuracy: how close agent's total_ev_usd is to computed truth
    - factual_fidelity: accuracy of factual claims about cards
    - constraint_compliance: did the agent respect user/issuer constraints
    """

    def __init__(
        self,
        card_db: CardDatabase,
        claim_extractor: ClaimExtractor | None = None,
    ) -> None:
        self.card_db = card_db
        self.ev_calc = EVCalculatorService(card_db)
        self.fidelity_scorer = FactualFidelityScorer(card_db)
        self.claim_extractor = claim_extractor

    def evaluate(
        self,
        task: TaskDefinition,
        agent_answer: str,
    ) -> tuple[float, list[DimensionResult], dict[str, object]]:
        """Evaluate an agent's answer using objective scoring.

        The task's scoring dimensions must include objective-mode config:
        - ev_accuracy.reference.user_profile: UserProfile dict
        - ev_accuracy.reference.expected_card_ids: list of card IDs
        - constraint_compliance.checks.expected_cards: list of card names
        - factual_fidelity.reference.extracted_claims: claims from extractor LLM

        Returns:
            (reward, dimension_results, metadata)
        """
        parsed_json = extract_json_block(agent_answer)
        structure_bonus, json_valid, json_has_required = compute_structure_bonus(
            parsed_json, STRUCTURE_BONUS_BASIC, STRUCTURE_BONUS_PARTIAL, STRUCTURE_BONUS_FULL
        )

        dimension_results: list[DimensionResult] = []
        hard_constraint_violated = False
        for dim_name, dim in task.scoring.dimensions.items():
            score = self._score_dimension(
                dim_name, dim, parsed_json, task, agent_answer=agent_answer
            )
            if score is not None:
                if (
                    dim.hard_constraint
                    and task.scoring.hard_constraint_failure_zeroes_dimension
                    and score == 0.0
                ):
                    hard_constraint_violated = True
                dimension_results.append(
                    DimensionResult(
                        dimension=dim_name,
                        score=score,
                        weight=dim.weight,
                        status="scored",
                    )
                )
            else:
                dimension_results.append(
                    DimensionResult(
                        dimension=dim_name,
                        score=None,
                        weight=dim.weight,
                        status="pending_reference",
                    )
                )

        scored = [(r.score, r.weight) for r in dimension_results if r.score is not None]
        reward = compute_composite_reward(scored, structure_bonus, hard_constraint_violated)

        metadata: dict[str, object] = {
            "json_valid": json_valid,
            "json_has_required_fields": json_has_required,
            "hard_constraint_violated": hard_constraint_violated,
            "evaluation_mode": "objective",
        }

        # Add computed reference EV to metadata for transparency.
        ev_dim = task.scoring.dimensions.get("ev_accuracy")
        if ev_dim and ev_dim.reference:
            profile_data = ev_dim.reference.get("user_profile")
            card_ids = ev_dim.reference.get("expected_card_ids")
            if profile_data and card_ids:
                profile = UserProfile(**profile_data)
                portfolio = self.ev_calc.compute_portfolio_ev(card_ids, profile)
                metadata["computed_reference_ev_usd"] = portfolio.total_ev_usd
                metadata["computed_ev_breakdown"] = {
                    "signup_bonuses_usd": portfolio.total_signup_bonuses_usd,
                    "ongoing_rewards_usd": portfolio.total_ongoing_rewards_usd,
                    "credits_usd": portfolio.total_credits_usd,
                    "perks_usd": portfolio.total_perks_usd,
                    "annual_fees_usd": portfolio.total_annual_fees_usd,
                }
        if parsed_json and isinstance(parsed_json, dict):
            metadata["parsed_fields"] = list(parsed_json.keys())

        return reward, dimension_results, metadata

    def _prose_effort_score(self, answer: str) -> float:
        """Score answer effort/quality heuristically. Returns 0.0-1.0.

        Rewards longer, more detailed answers that show reasoning. This
        gives gradient signal even before card matching kicks in.
        """
        if not answer or not answer.strip():
            return 0.0
        word_count = len(answer.split())
        # Length tiers: <10 words=0, 10-49=0.2, 50-149=0.5, 150-399=0.8, 400+=1.0
        if word_count < 10:
            return 0.0
        if word_count < 50:
            return 0.2
        if word_count < 150:
            return 0.5
        if word_count < 400:
            return 0.8
        return 1.0

    def _fuzzy_card_match_prose(self, expected_card: str, answer_lower: str) -> float:
        """Check if a card name or any of its aliases appear in prose.

        Returns a match score: 1.0 for exact/alias match, 0.5 for fuzzy
        substring match, 0.0 for no match. Uses both alias lookup and
        sliding-window fuzzy matching so creative abbreviations still score.
        """
        card_lower = expected_card.lower()
        if card_lower in answer_lower:
            return 1.0
        # Check aliases.
        aliases = CARD_ALIASES.get(card_lower, [])
        if any(alias in answer_lower for alias in aliases):
            return 1.0
        # Fuzzy sliding window: check each word-window of the answer
        # against the card name. This catches partial names like
        # "Sapphire Preferred" matching "Chase Sapphire Preferred".
        card_words = card_lower.split()
        answer_words = answer_lower.split()
        window_size = len(card_words)
        if window_size == 0 or len(answer_words) < window_size:
            return 0.0
        best_sim = 0.0
        for i in range(len(answer_words) - window_size + 1):
            window = " ".join(answer_words[i : i + window_size])
            sim = SequenceMatcher(None, card_lower, window).ratio()
            best_sim = max(best_sim, sim)
        if best_sim >= 0.8:
            return 0.5
        return 0.0
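
    # Worked example of the window path, assuming an expected card
    # "Chase Freedom Unlimited" with no CARD_ALIASES entry: for an answer
    # mentioning "the Freedom Unlimited", the three-word window
    # "the freedom unlimited" scores a SequenceMatcher ratio of about 0.91
    # against "chase freedom unlimited", clearing the 0.8 threshold for
    # 0.5 credit.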

    def _score_dimension(
        self,
        dim_name: str,
        dim: Any,
        parsed_json: dict[str, Any] | None,
        task: TaskDefinition,
        agent_answer: str = "",
    ) -> float | None:
        """Score a single dimension."""
        if parsed_json is None:
            # No JSON; use prose-based fallbacks for reward variance.
            if dim_name == "card_selection" and agent_answer:
                return self._score_card_selection_from_prose(dim, agent_answer)
            if dim_name == "constraint_compliance" and agent_answer:
                return self._score_constraint_compliance_from_prose(dim, agent_answer)
            if dim_name == "ev_accuracy" and agent_answer:
                return self._score_ev_accuracy_from_prose(dim, agent_answer)
            if dim_name == "factual_fidelity" and agent_answer:
                return self._score_factual_fidelity(dim, {}, agent_answer=agent_answer)
            return 0.0
        if dim_name == "ev_accuracy":
            return self._score_ev_accuracy(dim, parsed_json)
        if dim_name == "card_selection":
            return self._score_card_selection(dim, parsed_json)
        if dim_name == "constraint_compliance":
            return self._score_constraint_compliance(dim, parsed_json)
        if dim_name == "factual_fidelity":
            return self._score_factual_fidelity(dim, parsed_json, agent_answer=agent_answer)
        # Human-judged and unrecognized dimensions are not auto-scorable.
        return None

    def _score_ev_accuracy(self, dim: Any, parsed_json: dict[str, Any]) -> float:
        """Score EV accuracy against computed ground truth."""
        ref = dim.reference
        if not ref:
            return 0.0
        profile_data = ref.get("user_profile")
        card_ids = ref.get("expected_card_ids")
        if not profile_data or not card_ids:
            # Fall back to static reference_ev_usd if provided.
            ref_ev = ref.get("reference_ev_usd")
            if ref_ev is None:
                return 0.0
            return self._ev_accuracy_score(
                ref_ev, parsed_json.get("total_ev_usd"), ref.get("ev_tolerance_pct", 0.05)
            )
        # Compute ground truth from the card database.
        profile = UserProfile(**profile_data)
        portfolio = self.ev_calc.compute_portfolio_ev(card_ids, profile)
        ref_ev = portfolio.total_ev_usd
        tolerance = ref.get("ev_tolerance_pct", 0.05)
        return self._ev_accuracy_score(ref_ev, parsed_json.get("total_ev_usd"), tolerance)

    def _ev_accuracy_score(self, ref_ev: float, agent_ev: Any, tolerance: float) -> float:
        """Linear-decay EV accuracy scoring."""
        if agent_ev is None:
            return 0.0
        try:
            agent_ev = float(agent_ev)
            ref_ev = float(ref_ev)
        except (TypeError, ValueError):
            return 0.0
        if ref_ev == 0:
            return 1.0 if agent_ev == 0 else 0.0
        error_pct = abs(agent_ev - ref_ev) / abs(ref_ev)
        if error_pct <= tolerance:
            return 1.0
        max_error = tolerance * 5  # wider decay for more partial credit
        if error_pct >= max_error:
            return 0.0
        return float(1.0 - (error_pct - tolerance) / (max_error - tolerance))
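
    # Worked example: with ref_ev=1000 and tolerance=0.05, max_error is
    # 0.25. An agent_ev of 1150 (15% error) scores
    # 1 - (0.15 - 0.05) / (0.25 - 0.05) = 0.5; anything within 5% scores
    # 1.0 and anything off by 25% or more scores 0.0.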

    def _score_card_selection(self, dim: Any, parsed_json: dict[str, Any]) -> float:
        """Score card selection: F1 of agent cards vs. expected cards.

        Names are lowercased and stripped before comparison; alias and
        fuzzy matching are only used by the prose fallback.
        """
        source = dim.checks or dim.reference
        if not source:
            return 0.0
        expected_cards = source.get("expected_cards")
        agent_cards = parsed_json.get("recommended_cards")
        if expected_cards is None or agent_cards is None:
            return 0.0
        expected_set = {c.lower().strip() for c in expected_cards}
        agent_set = {c.lower().strip() for c in agent_cards}
        return compute_f1_score(expected_set, agent_set)
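
    # Worked example, assuming compute_f1_score implements standard set F1:
    # expected {"chase sapphire preferred", "amex gold", "venture x"} vs.
    # agent {"chase sapphire preferred", "amex gold", "csr"} gives
    # precision = recall = 2/3, so F1 = 2/3 ~= 0.67.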

    def _score_card_selection_from_prose(self, dim: Any, agent_answer: str) -> float:
        """Fallback: check if expected card names appear in prose.

        Uses alias matching (e.g. "CSR" -> "Chase Sapphire Reserve").
        Blends card match score (0.7 weight) with effort score (0.3 weight)
        so the model gets gradient signal for writing detailed answers even
        before it nails the exact right cards.

        Capped at 0.7 to incentivize proper JSON output.
        """
        source = dim.checks or dim.reference
        if not source:
            return self._prose_effort_score(agent_answer) * 0.15
        expected_cards = source.get("expected_cards")
        if not expected_cards:
            return self._prose_effort_score(agent_answer) * 0.15
        answer_lower = agent_answer.lower()
        matches = sum(self._fuzzy_card_match_prose(c, answer_lower) for c in expected_cards)
        card_score = min(matches / len(expected_cards), 1.0)
        effort = self._prose_effort_score(agent_answer)
        # Blend: 70% card matching, 30% effort; capped at 0.7.
        return (card_score * 0.7 + effort * 0.3) * 0.7
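
    # Worked example: matching two of three expected cards exactly
    # (card_score = 2/3) with a detailed answer (effort = 0.8) yields
    # (0.6667 * 0.7 + 0.8 * 0.3) * 0.7 ~= 0.49; a perfect prose answer
    # tops out at 0.7, keeping structured JSON strictly more rewarding.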

    def _score_constraint_compliance_from_prose(self, dim: Any, agent_answer: str) -> float:
        """Fallback: check if expected cards appear in prose.

        Uses alias matching and blends card match with effort score, like
        the card_selection fallback. Capped at 0.6 to incentivize JSON
        output.
        """
        source = dim.checks or dim.reference
        if not source:
            return self._prose_effort_score(agent_answer) * 0.15
        expected_cards = source.get("expected_cards")
        if not expected_cards:
            return self._prose_effort_score(agent_answer) * 0.15
        answer_lower = agent_answer.lower()
        matches = sum(self._fuzzy_card_match_prose(c, answer_lower) for c in expected_cards)
        card_score = min(matches / len(expected_cards), 1.0)
        effort = self._prose_effort_score(agent_answer)
        return (card_score * 0.7 + effort * 0.3) * 0.6

    def _score_ev_accuracy_from_prose(self, dim: Any, agent_answer: str) -> float:
        """Fallback: look for dollar amounts in prose and compare to reference.

        Gives partial credit for mentioning any dollar amounts (shows the
        model is attempting EV calculation). Higher credit for amounts close
        to the reference. Capped at 0.5 to incentivize structured JSON output.
        """
        ref = dim.reference
        if not ref:
            return self._prose_effort_score(agent_answer) * 0.1
        # Base credit for effort.
        effort = self._prose_effort_score(agent_answer)
        # Try to extract dollar amounts from prose.
        amounts = re.findall(r"\$[\d,]+(?:\.\d{1,2})?", agent_answer)
        if not amounts:
            # No dollar amounts but wrote something: small credit for effort.
            return effort * 0.1
        # Get reference EV.
        profile_data = ref.get("user_profile")
        card_ids = ref.get("expected_card_ids")
        ref_ev = None
        if profile_data and card_ids:
            profile = UserProfile(**profile_data)
            portfolio = self.ev_calc.compute_portfolio_ev(card_ids, profile)
            ref_ev = portfolio.total_ev_usd
        else:
            ref_ev = ref.get("reference_ev_usd")
        if ref_ev is None or ref_ev == 0:
            return 0.15  # credit for attempting EV calculation
        # Check if any extracted amount is close to the reference.
        best_score = 0.15  # base credit for mentioning dollar amounts
        for amt_str in amounts:
            try:
                amt = float(amt_str.replace("$", "").replace(",", ""))
                error_pct = abs(amt - ref_ev) / abs(ref_ev)
                if error_pct <= 0.05:
                    best_score = max(best_score, 0.5)
                elif error_pct <= 0.15:
                    best_score = max(best_score, 0.4)
                elif error_pct <= 0.30:
                    best_score = max(best_score, 0.3)
                elif error_pct <= 0.50:
                    best_score = max(best_score, 0.2)
            except ValueError:
                continue
        return best_score
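
    # Worked example: with a reference EV of $1,200, an answer mentioning
    # "$1,000" has a 16.7% error, which lands in the <= 0.30 band for a
    # score of 0.3; any dollar figure at all guarantees the 0.15 base credit.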

    def _score_constraint_compliance(self, dim: Any, parsed_json: dict[str, Any]) -> float:
        """Score constraint compliance (card selection + housing)."""
        source = dim.checks or dim.reference
        if not source:
            return 0.0
        scores: list[tuple[float, float]] = []
        # Card matching via F1 on normalized names.
        expected_cards = source.get("expected_cards")
        agent_cards = parsed_json.get("recommended_cards")
        if expected_cards is not None and agent_cards is not None:
            expected_set = {c.lower().strip() for c in expected_cards}
            agent_set = {c.lower().strip() for c in agent_cards}
            if not expected_set and not agent_set:
                scores.append((1.0, 0.7))
            elif expected_set:
                f1 = compute_f1_score(expected_set, agent_set)
                scores.append((f1, 0.7))
            # Skip when expected_set is empty: task doesn't require specific cards.
        # Housing option match.
        expected_housing = source.get("expected_housing_option")
        agent_housing = parsed_json.get("housing_option")
        if expected_housing is not None:
            match = (
                agent_housing is not None
                and str(agent_housing).lower().strip() == str(expected_housing).lower().strip()
            )
            scores.append((1.0 if match else 0.0, 0.3))
        if not scores:
            return 0.0
        total_weight = sum(w for _, w in scores)
        return sum(s * w for s, w in scores) / total_weight if total_weight else 0.0
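
    # Worked example: card F1 of 0.8 (weight 0.7) plus a correct housing
    # option (weight 0.3) scores (0.8 * 0.7 + 1.0 * 0.3) / 1.0 = 0.86.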

    def _score_factual_fidelity(
        self,
        dim: Any,
        parsed_json: dict[str, Any],
        agent_answer: str = "",
    ) -> float | None:
        """Score factual fidelity of agent claims.

        Extracts claims from the agent's answer using an LLM (if available),
        then verifies them against the card database. Falls back to
        pre-extracted claims in the task reference if no extractor is
        configured.
        """
        ref = dim.reference or {}
        # Try LLM extraction first (preferred: works for any answer).
        extracted_claims = None
        if self.claim_extractor and agent_answer:
            extracted_claims = self.claim_extractor.extract_claims(agent_answer)
            if extracted_claims:
                n = len(extracted_claims.get("per_card_claims", []))
                logger.debug("Extracted %d card claims via LLM", n)
        # Fall back to pre-extracted claims in the task reference.
        if not extracted_claims:
            extracted_claims = ref.get("extracted_claims")
        if not extracted_claims:
            # No extractor and no pre-extracted claims: not scorable.
            return None
        # Reference EV for normalization (defaults to 1000 when missing, null, or zero).
        ev_dim_ref = ref.get("reference_ev_usd") or 1000
        result = self.fidelity_scorer.score_claims(extracted_claims, ev_dim_ref)
        return result.score

    def _compute_reward(
        self,
        results: list[DimensionResult],
        structure_bonus: float,
        hard_constraint_violated: bool,
    ) -> float:
        """Compute composite reward."""
        scored = [(r.score, r.weight) for r in results if r.score is not None]
        return compute_composite_reward(scored, structure_bonus, hard_constraint_violated)


def create_objective_evaluator_service(
    card_db: CardDatabase,
) -> ObjectiveEvaluatorService:
    """Factory for svcs registration."""
    return ObjectiveEvaluatorService(card_db)
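

# Example wiring (a sketch; how CardDatabase and TaskDefinition instances are
# built depends on the schemas module, so the loader calls below are
# assumptions, not the real API):
#
#     card_db = load_card_database()            # hypothetical loader
#     task = load_task("portfolio_task_01")     # hypothetical loader
#     evaluator = create_objective_evaluator_service(card_db)
#     reward, dims, meta = evaluator.evaluate(task, agent_answer="...")
#     # meta includes json_valid, hard_constraint_violated, and, when the
#     # task carries an EV reference, computed_reference_ev_usd.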