Spaces:
Sleeping
Sleeping
| """Shared constants and utilities for evaluation services.""" | |
| from __future__ import annotations | |
| from difflib import SequenceMatcher | |
# Keys that must all be present in the agent's structured JSON output for it
# to count as having the full required structure.
# A frozenset keeps this shared constant immutable: importers can still use
# `issubset`, `in`, iteration and set algebra, but cannot mutate it by accident.
REQUIRED_JSON_FIELDS = frozenset(
    {
        "recommended_cards",
        "total_ev_usd",
        "ev_breakdown",
        "constraints_respected",
    }
)

# Multiplier applied to the reward when a hard constraint is violated.
# Scaling down (rather than zeroing out) preserves some gradient signal.
HARD_CONSTRAINT_PENALTY = 0.1
def fuzzy_match_cards(expected: set[str], agent: set[str]) -> float:
    """Count matching cards, giving partial credit for near-matches.

    Matching runs in two phases so the result does not depend on set
    iteration order (which varies between runs for strings):

    1. Every card present in both collections is an exact match worth 1.0.
    2. Each remaining expected card, visited in sorted order, is greedily
       paired with the most similar unused agent card. A pair whose
       ``SequenceMatcher`` ratio is >= 0.8 is worth 0.5 and consumes that
       agent card.

    Args:
        expected: Card names that should have been recommended.
        agent: Card names the agent actually recommended. Not mutated.

    Returns:
        The "true positive" count as a float. It may be fractional
        (e.g. 1.5) because fuzzy matches earn half credit.
    """
    expected_cards = set(expected)
    agent_cards = set(agent)

    # Phase 1: exact matches first, so a fuzzy match can never steal an
    # agent card that is the exact counterpart of another expected card.
    exact = expected_cards & agent_cards
    tp = float(len(exact))

    # Phase 2: greedy fuzzy matching over what is left. Sorting both sides
    # makes the pairing (and tie-breaking) reproducible.
    candidates = sorted(agent_cards - exact)
    for exp_card in sorted(expected_cards - exact):
        best_score = 0.0
        best_match = None
        for ag_card in candidates:
            sim = SequenceMatcher(None, exp_card, ag_card).ratio()
            if sim > best_score:
                best_score = sim
                best_match = ag_card
        if best_match is not None and best_score >= 0.8:
            tp += 0.5
            # Each agent card can satisfy at most one expected card.
            candidates.remove(best_match)
    return tp
def compute_f1_score(expected_set: set[str], agent_set: set[str]) -> float:
    """Return the F1 score between expected and agent card sets.

    The true-positive count comes from ``fuzzy_match_cards``, so it may be
    fractional. Precision divides it by the number of agent cards, recall
    by the number of expected cards.

    Edge cases: two empty sets score 1.0; exactly one empty set scores 0.0.
    """
    # Neither side has any cards: treated as perfect agreement.
    if not (expected_set or agent_set):
        return 1.0
    # Exactly one side is empty: nothing can match.
    if not (expected_set and agent_set):
        return 0.0

    matched = fuzzy_match_cards(expected_set, agent_set)
    precision = matched / len(agent_set)
    recall = matched / len(expected_set)

    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2 * precision * recall / denominator
def compute_composite_reward(
    scored_dimensions: list[tuple[float, float]],
    structure_bonus: float,
    hard_constraint_violated: bool,
) -> float:
    """Combine per-dimension scores into a single reward clamped to [0, 1].

    Args:
        scored_dimensions: ``(score, weight)`` pairs for the scored dimensions.
        structure_bonus: Bonus added on top of the weighted average, or used
            on its own when no dimension was scored.
        hard_constraint_violated: When true, the reward is multiplied by
            ``HARD_CONSTRAINT_PENALTY`` instead of being zeroed.

    Strategy:
        - No scored dimensions -> the structure bonus alone.
        - Weights summing to zero -> 0.0 (the structure bonus is not added).
        - Otherwise -> weighted average of the scores plus the structure bonus.
        - The hard-constraint penalty is applied afterwards, before clamping.
    """
    if scored_dimensions:
        weight_total = sum(weight for _, weight in scored_dimensions)
        if weight_total == 0:
            raw = 0.0
        else:
            weighted = sum(score * weight for score, weight in scored_dimensions)
            raw = weighted / weight_total + structure_bonus
    else:
        raw = structure_bonus

    if hard_constraint_violated:
        raw = raw * HARD_CONSTRAINT_PENALTY

    # Clamp into [0, 1].
    if raw < 0.0:
        return 0.0
    if raw > 1.0:
        return 1.0
    return raw
def compute_structure_bonus(
    parsed_json: dict | None,  # type: ignore[type-arg]
    bonus_basic: float,
    bonus_partial: float,
    bonus_full: float,
) -> tuple[float, bool, bool]:
    """Pick the structure bonus tier for the agent's parsed JSON output.

    Tiers, for dict input:
        - all of ``REQUIRED_JSON_FIELDS`` present -> ``bonus_full``
        - both "recommended_cards" and "total_ev_usd" present -> ``bonus_partial``
        - any other dict -> ``bonus_basic``

    Non-dict input earns no bonus.

    Returns:
        ``(structure_bonus, json_valid, json_has_required_fields)``, where
        ``json_valid`` is true for any input other than ``None``.
    """
    json_valid = parsed_json is not None

    # None, or a parsed value that is not a JSON object: no bonus at all.
    if not isinstance(parsed_json, dict):
        return 0.0, json_valid, False

    if REQUIRED_JSON_FIELDS.issubset(parsed_json.keys()):
        return bonus_full, True, True

    if "recommended_cards" in parsed_json and "total_ev_usd" in parsed_json:
        return bonus_partial, True, False

    return bonus_basic, True, False