import math
import random
from typing import Any, Dict, List, Tuple


class RewardSystem:
    def __init__(self, max_len: int = 1000):
        self.max_len = max_len

    def compute_diversity(self, current_answer: str,
                          history: List[Dict[str, Any]]) -> float:
        """
        D = diversity (difference from past attempts).

        If the answer repeats a previous attempt, returns a steep
        exponential penalty: D = -exp(1.0) (approx. -2.718).
        Otherwise returns D = 1.0.
        """
        if not history:
            return 1.0
        cur_ans_clean = current_answer.strip().lower()
        for attempt in history:
            prev_ans = attempt.get('final_answer', '').strip().lower()
            if prev_ans == cur_ans_clean:
                return -math.exp(1.0)  # approx. -2.718, steep penalty
        # If unique, give the full diversity bonus
        return 1.0

    def compute_efficiency(self, action_string: str) -> float:
        """
        E = efficiency. Uses a Gaussian penalty curve:

            E = exp(-(len_ratio)^2) - 1

        which smoothly penalizes overly verbose answers, decaying
        towards -1.0 as the length drifts from the optimum.
        """
        approx_tokens = len(action_string) / 4.0  # rough chars-per-token heuristic
        optimal_tokens = 50.0  # assumed ideal length
        # Normalized deviation from the optimal length
        ratio = (approx_tokens - optimal_tokens) / optimal_tokens
        # Smooth Gaussian-like decay towards -1.0
        return math.exp(-(ratio ** 2)) - 1.0

    def compute_exploration_bonus(self, action_string: str,
                                  times_seen: int) -> float:
        """
        [PAPER TRACEABILITY: Section G, Exploration via Entropy Bonus]

        Estimates output diversity from the unique-character ratio and
        decays the bonus as the same problem is revisited:

            X = entropy_bonus / sqrt(1 + times_seen)
        """
        # Simple structural entropy estimate: unique-character distribution
        length = len(action_string)
        if length > 0:
            unique_ratio = len(set(action_string)) / length
            entropy_bonus = math.log1p(unique_ratio)  # non-linear scaling
        else:
            entropy_bonus = 0.0
        return entropy_bonus / math.sqrt(1.0 + times_seen)

    def detect_trivial_output(self, action_string: str) -> bool:
        """Anti-reward-hacking: detect trivial, near-constant outputs."""
        # Very short outputs carry no usable signal
        if len(action_string) < 2:
            return True
        # Long outputs built from fewer than 3 distinct characters are
        # almost certainly a single character (or pair) repeated
        unique_chars = len(set(action_string))
        if unique_chars < 3 and len(action_string) > 10:
            return True
        return False

    def compute_reward(self, correctness: float, reasoning_quality: float,
                       process_supervision: float, reflection_score: float,
                       action_str: str, final_answer: str,
                       history: List[Dict[str, Any]],
                       times_seen_problem: int) -> Tuple[float, Dict[str, float]]:
        """
        [PAPER TRACEABILITY: DeepSeekMath-inspired reward composite]

        Simplified composite reward, strictly bounded in [0, 1]:

            R = 0.4*C + 0.3*Q_norm + 0.2*P_norm + 0.1*R_norm

        D (diversity), E (efficiency), X (exploration), and noise are
        still computed and logged in the components dict for
        diagnostics, but no longer enter R.
        """
        if self.detect_trivial_output(action_str):
            # Anti-hacking: trivial outputs are strongly penalized
            components = {
                "total_reward": -1.0,
                "C_correctness": 0.0,
                "Q_reasoning": 0.0,
                "P_process_supervision": 0.0,
                "R_reflection": 0.0,
                "D_diversity": 0.0,
                "E_efficiency": -1.0,
                "X_exploration": 0.0,
                "noise": 0.0,
            }
            return -1.0, components

        c = correctness
        q = reasoning_quality
        d = self.compute_diversity(final_answer, history)
        # If the answer repeats a past attempt, zero C to prevent hacking
        if d < 0:
            c = 0.0
        e = self.compute_efficiency(action_str)
        x = self.compute_exploration_bonus(action_str, times_seen_problem)
        noise = random.gauss(0, 0.05)  # logged only; not added to the reward

        # Smoothly squash reasoning quality with tanh to bound its impact
        q_smooth = math.tanh(q)

        # Normalize the remaining signals into the [0, 1] domain
        p_norm = (process_supervision + 1.0) / 2.0  # scales [-1, 1] to [0, 1]
        r_norm = (reflection_score + 0.5) / 1.5     # scales [-0.5, 1.0] to [0, 1]
        q_norm = min(1.0, max(0.0, q_smooth))

        # Simplified composite reward equation (strictly bounded [0, 1]):
        # base coefficients sum exactly to 1.0; noise is excluded to keep
        # the bound.
        total_r = (0.4 * c) + (0.3 * q_norm) + (0.2 * p_norm) + (0.1 * r_norm)

        components = {
            "total_reward": total_r,
            "C_correctness": c,
            "Q_reasoning": q_smooth,
            "P_process_supervision": process_supervision,
            "R_reflection": reflection_score,
            "D_diversity": d,
            "E_efficiency": e,
            "X_exploration": x,
            "noise": noise,
        }
        return total_r, components
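

# --- Usage sketch (illustrative only, not part of the training loop) ---
# Demonstrates one scoring call end to end. Every input value below is
# hypothetical: in a real run, correctness / reasoning_quality /
# process_supervision / reflection_score would come from graders,
# `history` from prior rollouts, and `times_seen_problem` from a visit
# counter.
if __name__ == "__main__":
    rs = RewardSystem()
    history = [{"final_answer": "42"}]  # one earlier, different attempt

    total, parts = rs.compute_reward(
        correctness=1.0,
        reasoning_quality=0.8,
        process_supervision=0.5,   # grader score in [-1, 1]
        reflection_score=0.25,     # grader score in [-0.5, 1.0]
        action_str="Let x be the unknown; solving 2x + 3 = 17 gives x = 7.",
        final_answer="7",
        history=history,
        times_seen_problem=2,
    )

    print(f"total reward = {total:.4f}")
    for name, value in parts.items():
        print(f"  {name}: {value:+.4f}")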