# AutoMathReasoner/env/rewards.py
import random
import math
from typing import Dict, Any, List, Tuple
class RewardSystem:
    """Composite reward shaping for math-reasoning attempts: combines
    correctness, reasoning quality, process supervision, and reflection
    into one bounded scalar, with auxiliary diagnostics for diversity,
    efficiency, and exploration."""

    def __init__(self, max_len: int = 1000):
        self.max_len = max_len  # Note: not currently referenced by any reward component below
def compute_diversity(self, current_answer: str, history: List[Dict[str, Any]]) -> float:
"""
D = diversity (difference from past attempts)
If repeated answer, returns a steep exponential penalty: D = -exp(1.0).
Otherwise, returns D = 1.0.
"""
if not history:
return 1.0
cur_ans_clean = current_answer.strip().lower()
for attempt in history:
prev_ans = attempt.get('final_answer', '').strip().lower()
if prev_ans == cur_ans_clean:
                return -math.exp(1.0)  # Approx. -2.718: a steep penalty for repeating an answer
# If unique, give full diversity bonus
return 1.0
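
    # A quick worked check of the diversity term (hypothetical answers, chosen only
    # for illustration; whitespace and case are stripped before comparison):
    #   rs = RewardSystem()
    #   rs.compute_diversity("42", [{"final_answer": "41"}])   # -> 1.0 (unique answer)
    #   rs.compute_diversity("42", [{"final_answer": " 42 "}]) # -> -math.exp(1.0) ≈ -2.718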
def compute_efficiency(self, action_string: str) -> float:
"""
E = efficiency. We use a Gaussian penalty curve:
E = exp(- (len_ratio)^2 ) - 1
This smoothly penalizes overly verbose answers.
"""
        approx_tokens = len(action_string) / 4.0  # Rough heuristic: ~4 characters per token
optimal_tokens = 50.0 # Assumed ideal length
        # Normalized deviation from the assumed optimal length
        ratio = (approx_tokens - optimal_tokens) / optimal_tokens
# Smooth gaussian-like decay towards -1.0
e = math.exp(- (ratio ** 2)) - 1.0
return e
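
    # Worked values for the efficiency curve (illustrative lengths, ~4 chars/token):
    #   200-char action -> ~50 tokens,  ratio  0.0 -> E = exp(0) - 1     =  0.0 (optimal)
    #   400-char action -> ~100 tokens, ratio  1.0 -> E = exp(-1) - 1    ≈ -0.632
    #   40-char action  -> ~10 tokens,  ratio -0.8 -> E = exp(-0.64) - 1 ≈ -0.473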
def compute_exploration_bonus(self, action_string: str, times_seen: int) -> float:
"""
[PAPER TRACEABILITY: Exploration via Entropy Bonus]
G. EXPLORATION VIA ENTROPY BONUS
Computes output diversity (token variance) and adds bonus.
X = (entropy_bonus) / sqrt(1 + times_seen_problem)
"""
# Simple structural entropy estimation (unique character distribution variance)
length = len(action_string)
if length > 0:
unique_ratio = len(set(action_string)) / length
entropy_bonus = math.log1p(unique_ratio) # Non-linear scaling
else:
entropy_bonus = 0.0
return entropy_bonus / math.sqrt(1.0 + times_seen)
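
    # Worked values for the exploration bonus (illustrative string):
    #   "abcde" -> unique_ratio = 1.0, entropy_bonus = log1p(1.0) ≈ 0.693
    #     times_seen = 0 -> 0.693 / sqrt(1) ≈ 0.693
    #     times_seen = 3 -> 0.693 / sqrt(4) ≈ 0.347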
def detect_trivial_output(self, action_string: str) -> bool:
"""Anti-reward hacking: detect trivial constant outputs"""
        # Trivial if extremely short, or long but built from very few distinct characters
if len(action_string) < 2:
return True
unique_chars = len(set(action_string))
if unique_chars < 3 and len(action_string) > 10:
return True
return False
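
    # Examples of the trivial-output guard (illustrative strings):
    #   "x"              -> True  (fewer than 2 characters)
    #   "ababababababab" -> True  (14 chars but only 2 distinct characters)
    #   "x = 7"          -> False (only strings longer than 10 chars are entropy-checked)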
def compute_reward(self,
correctness: float,
reasoning_quality: float,
process_supervision: float,
reflection_score: float,
action_str: str,
final_answer: str,
history: List[Dict[str, Any]],
times_seen_problem: int) -> Tuple[float, Dict[str, float]]:
"""
[PAPER TRACEABILITY: DeepSeekMath-inspired reward composite]
R = 0.4*C + 0.2*Q_smooth + 0.15*D + 0.1*E + 0.1*P + 0.1*R + 0.15*X + noise
"""
        if self.detect_trivial_output(action_str):
            # Anti-reward-hacking: trivial constant outputs are strongly penalized.
            # Keys mirror the main components dict so consumers see a uniform schema.
            components = {"total_reward": -1.0, "C_correctness": 0.0, "Q_reasoning": 0.0,
                          "P_process_supervision": 0.0, "R_reflection": 0.0, "D_diversity": 0.0,
                          "E_efficiency": -1.0, "X_exploration": 0.0, "noise": 0.0}
            return -1.0, components
c = correctness
q = reasoning_quality
d = self.compute_diversity(final_answer, history)
# If repeated answer, C is zeroed to prevent hacking
if d < 0:
c = 0.0
e = self.compute_efficiency(action_str)
x = self.compute_exploration_bonus(action_str, times_seen_problem)
        noise = random.gauss(0, 0.05)  # Diagnostic only; excluded from the composite to keep it bounded
# Smoothly squish reasoning quality using tanh to bound its impact
q_smooth = math.tanh(q)
        # Normalize the remaining components into the [0, 1] domain
p_norm = (process_supervision + 1.0) / 2.0 # Scales [-1, 1] to [0, 1]
r_norm = (reflection_score + 0.5) / 1.5 # Scales [-0.5, 1.0] to [0, 1]
q_norm = min(1.0, max(0.0, q_smooth))
        # Simplified composite reward equation, strictly bounded in [0, 1]:
        # the coefficients sum exactly to 1.0, and noise is excluded to preserve the bounds.
        total_r = (0.4 * c) + (0.3 * q_norm) + (0.2 * p_norm) + (0.1 * r_norm)
components = {
"total_reward": total_r,
"C_correctness": c,
"Q_reasoning": q_smooth,
"P_process_supervision": process_supervision,
"R_reflection": reflection_score,
"D_diversity": d,
"E_efficiency": e,
"X_exploration": x,
"noise": noise
}
return total_r, components
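

# Minimal usage sketch (illustrative scores and strings, not from any dataset;
# the input ranges follow the normalization comments in compute_reward):
if __name__ == "__main__":
    rs = RewardSystem()
    history = [{"final_answer": "12"}]  # One hypothetical earlier attempt
    total, parts = rs.compute_reward(
        correctness=1.0,
        reasoning_quality=0.8,    # tanh(0.8) ≈ 0.664 after smoothing
        process_supervision=0.5,  # -> p_norm = 0.75
        reflection_score=0.25,    # -> r_norm = 0.5
        action_str="Step 1: factor the quadratic. Step 2: solve each root. Answer: 7",
        final_answer="7",         # Differs from history, so D = 1.0 and C is kept
        history=history,
        times_seen_problem=2,
    )
    print(f"total reward: {total:.3f}")  # 0.4*1.0 + 0.3*0.664 + 0.2*0.75 + 0.1*0.5 ≈ 0.799
    for name, value in parts.items():
        print(f"  {name}: {value:+.3f}")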