import math
import random
import re
from typing import Any, Dict, Tuple

# Substrings whose presence the mock judge counts as evidence of structure.
_STEP_MARKERS = ('step', 'first', 'then', 'because', 'therefore', 'equals', '=', '+', '-')

# Phrases that count as an attempted self-reflection.
_REFLECTION_PHRASES = ("what could be wrong", "wait,", "let me check", "alternatively")

# "so" must be matched as a whole word; a plain substring test also fires on
# words such as "also", "reason" or "solve".
_SO_WORD_RE = re.compile(r"\bso\b")


class VerifierSystem:
    """Collection of lightweight verifiers that score a model response.

    ``verify`` is the entry point; it combines the individual checks into a
    ``(C, Q, P, R)`` tuple: correctness, reasoning quality, process
    supervision and reflection.
    """

    def __init__(self):
        pass

    def check_exact_match(self, prediction: str, ground_truth: str) -> bool:
        """1. Exact match verifier.

        Compares both strings after stripping surrounding whitespace and
        lower-casing them.
        """
        return prediction.strip().lower() == ground_truth.strip().lower()

    def check_numeric_tolerance(self, prediction: str, ground_truth: str, tol: float = 1e-4) -> bool:
        """2. Numeric tolerance checker.

        Parses both strings as floats and compares them with ``math.isclose``
        using ``tol`` as both the relative and the absolute tolerance.

        Returns:
            True when both values parse and are close; False when they differ
            or when a ``ValueError`` is raised (unparseable input, or a
            negative ``tol`` rejected by ``math.isclose``).
        """
        try:
            pred_val = float(prediction.strip())
            gt_val = float(ground_truth.strip())
            return math.isclose(pred_val, gt_val, rel_tol=tol, abs_tol=tol)
        except ValueError:
            return False

    def check_python_execution(self, prediction: str, ground_truth: str) -> bool:
        """3. Python execution (evaluate simple expressions).

        Evaluates ``prediction`` as a Python expression (e.g. ``"2+3"``) with
        only ``math`` in scope and compares the result with ``ground_truth``:
        numerically when the ground truth parses as a float, otherwise as
        stripped, lower-cased strings.

        Returns:
            True on a match; False on a mismatch, on any evaluation error, or
            when the expression contains a double underscore.
        """
        try:
            expression = prediction.strip()

            # SECURITY: `eval` is NOT a sandbox, even with `__builtins__`
            # disabled. Dunder attribute chains such as
            # `().__class__.__bases__[0].__subclasses__()` are the usual way
            # out, and no arithmetic expression needs them, so reject them up
            # front. Expensive expressions (e.g. `9**9**9**9`) can still hang
            # the process; do not feed fully untrusted input through this
            # method without an outer timeout / process isolation.
            if "__" in expression:
                return False

            restricted_globals = {"__builtins__": None, "math": math}
            pred_eval = eval(expression, restricted_globals, {})
            try:
                gt_eval = float(ground_truth.strip())
                return math.isclose(float(pred_eval), gt_eval, rel_tol=1e-4, abs_tol=1e-4)
            except ValueError:
                # Either side is not numeric: fall back to a string comparison.
                return str(pred_eval).strip().lower() == ground_truth.strip().lower()
        except Exception:
            # Best-effort verifier: any failure to evaluate or convert the
            # prediction simply means "not verified".
            return False

    def mock_llm_judge(self, reasoning: str, prediction: str, ground_truth: str) -> float:
        """4. LLM judge (mock or placeholder scoring reasoning quality).

        Heuristic stand-in for a real judge: longer reasoning that contains
        step-like markers scores higher. ``prediction`` and ``ground_truth``
        are accepted for interface compatibility but are not used.

        Returns:
            Reasoning quality score Q (0.0 to 1.0), rounded to two decimals.
        """
        # Length bonus (up to 0.4): 0.01 per whitespace-separated token.
        word_count = len(reasoning.split())
        score = min(0.4, word_count * 0.01)

        # Structure bonus (up to 0.6): 0.1 per distinct marker present.
        # Markers are matched as substrings because several are symbols.
        lower_reasoning = reasoning.lower()
        marker_count = sum(1 for marker in _STEP_MARKERS if marker in lower_reasoning)
        score += min(0.6, marker_count * 0.1)

        return round(min(1.0, score), 2)

    def check_process_supervision(self, reasoning: str) -> float:
        """
        [PAPER TRACEABILITY: Process Supervision (Lightweight PRM)]
        E. PROCESS SUPERVISION (STEP-AWARE REWARD)
        Validates reasoning steps (basic heuristics).
        Penalizes logical jumps and rewards structured step-by-step reasoning.

        Returns:
            A score clamped to [-1.0, 1.0]: +0.5 for explicit "step 1" /
            "step 2" structure, else +0.3 for "first" followed by "then" or
            "next"; -0.5 when the reasoning has fewer than 10 words yet
            contains "=" or the word "so".
        """
        lower_r = reasoning.lower()
        score = 0.0

        # Check stepwise structure
        if "step 1" in lower_r and "step 2" in lower_r:
            score += 0.5
        elif "first" in lower_r and ("then" in lower_r or "next" in lower_r):
            score += 0.3

        # Penalize missing steps if it's very short but claims a conclusion.
        # "so" is matched as a whole word so that "also", "reason", "solve",
        # etc. do not trigger the penalty.
        is_short = len(lower_r.split()) < 10
        claims_result = "=" in lower_r or _SO_WORD_RE.search(lower_r) is not None
        if is_short and claims_result:
            score -= 0.5  # Logical jump penalty

        return max(-1.0, min(1.0, score))

    def check_reflection(self, reasoning: str, c: float) -> float:
        """
        [PAPER TRACEABILITY: Reflection Module]
        H. REFLECTION MODULE
        Model generates "What could be wrong?"
        Penalize if contradiction with final answer, reward correct self-correction.

        Args:
            reasoning: The reasoning text to scan for reflection phrases.
            c: Correctness score of the final answer.

        Returns:
            0.0 when no reflection phrase is present; otherwise 1.0 when
            ``c >= 1.0`` and -0.5 when it is not.
        """
        lower_r = reasoning.lower()
        if not any(phrase in lower_r for phrase in _REFLECTION_PHRASES):
            return 0.0

        # Reflection attempted
        if c >= 1.0:
            return 1.0  # Correct self-correction / successful verification
        return -0.5  # Contradiction or failed correction

    def check_numerical_integration(self, prediction: str, sympy_f: Any) -> bool:
        """
        [PAPER TRACEABILITY: Section 3.1.3 Solution Verification]
        Numerical multi-point quadrature verification.
        Instead of evaluating integrals, we differentiate the prediction F_pred(x)
        and compare it to the ground truth integrand f(x) at 5 random points.

        Args:
            prediction: Predicted antiderivative as text; an optional
                "Answer:" prefix and a "+ C" / "+C" constant are removed
                before parsing.
            sympy_f: Ground-truth integrand; must support ``.subs`` and
                ``.evalf`` (presumably a SymPy expression in ``x``).

        Returns:
            True when the derivative of the prediction matches ``sympy_f`` at
            all sampled points; False on any mismatch or on any error while
            parsing or evaluating.
        """
        # Imported lazily so the other verifiers work without SymPy installed.
        import sympy as sp

        x = sp.Symbol('x')
        try:
            # Clean prediction string
            clean_pred = prediction.strip()
            if "Answer:" in clean_pred:
                clean_pred = clean_pred.split("Answer:")[-1].strip()
            clean_pred = clean_pred.replace("+ C", "").replace("+C", "").strip()

            # SECURITY: `parse_expr` evaluates its input; treat the prediction
            # as untrusted and isolate accordingly.
            F_pred = sp.parse_expr(clean_pred)
            f_pred = sp.diff(F_pred, x)

            # Evaluate at 5 random points
            # NOTE(review): points are drawn from [-5, 5] regardless of the
            # integrand's domain; a non-real value makes `float()` raise and
            # the check report False. Confirm this is acceptable for
            # integrands such as sqrt(x) or log(x).
            for _ in range(5):
                test_point = random.uniform(-5, 5)
                p_val = float(f_pred.subs(x, test_point).evalf())
                t_val = float(sympy_f.subs(x, test_point).evalf())

                # Paper uses 10^-2 relative tolerance
                if not math.isclose(p_val, t_val, rel_tol=1e-2, abs_tol=1e-2):
                    return False
            return True
        except Exception:
            # Best-effort verifier: unparseable or non-evaluable predictions
            # are treated as incorrect.
            return False

    def verify(self, reasoning: str, prediction: str, ground_truth: str, sympy_f: Any = None) -> Tuple[float, float, float, float]:
        """
        Run all verifiers.
        Returns Correctness (C), Reasoning Quality (Q), Process Supervision (P), and Reflection (R).

        Correctness is 1.0 as soon as one check passes, tried in this order:
        exact match, numerical integration (only when ``sympy_f`` is given),
        numeric tolerance, Python expression evaluation. Otherwise it is 0.0.
        """
        if self.check_exact_match(prediction, ground_truth):
            c = 1.0
        elif sympy_f is not None and self.check_numerical_integration(prediction, sympy_f):
            c = 1.0
        elif self.check_numeric_tolerance(prediction, ground_truth):
            c = 1.0
        elif self.check_python_execution(prediction, ground_truth):
            c = 1.0
        else:
            c = 0.0

        q = self.mock_llm_judge(reasoning, prediction, ground_truth)
        p = self.check_process_supervision(reasoning)
        r = self.check_reflection(reasoning, c)

        return c, q, p, r