# AutoMathReasoner/env/rewards.py
import random
import math
from typing import Dict, Any, List, Tuple
class RewardSystem:
    """Composite reward shaping for math-reasoning attempts: combines
    correctness, reasoning quality, process supervision, and reflection
    into one bounded scalar, with auxiliary diagnostics for diversity,
    efficiency, and exploration."""

    def __init__(self, max_len: int = 1000):
        self.max_len = max_len  # Note: not currently referenced by any reward component below
def compute_diversity(self, current_answer: str, history: List[Dict[str, Any]]) -> float:
"""
D = diversity (difference from past attempts)
If repeated answer, returns a steep exponential penalty: D = -exp(1.0).
Otherwise, returns D = 1.0.
"""
if not history:
return 1.0
cur_ans_clean = current_answer.strip().lower()
for attempt in history:
prev_ans = attempt.get('final_answer', '').strip().lower()
if prev_ans == cur_ans_clean:
                return -math.exp(1.0)  # Approx. -2.718: a steep penalty for repeating an answer
# If unique, give full diversity bonus
return 1.0
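
    # A quick worked check of the diversity term (hypothetical answers, chosen only
    # for illustration; whitespace and case are stripped before comparison):
    #   rs = RewardSystem()
    #   rs.compute_diversity("42", [{"final_answer": "41"}])   # -> 1.0 (unique answer)
    #   rs.compute_diversity("42", [{"final_answer": " 42 "}]) # -> -math.exp(1.0) ≈ -2.718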
def compute_efficiency(self, action_string: str) -> float:
"""
E = efficiency. We use a Gaussian penalty curve:
E = exp(- (len_ratio)^2 ) - 1
This smoothly penalizes overly verbose answers.
"""
        approx_tokens = len(action_string) / 4.0  # Rough heuristic: ~4 characters per token
optimal_tokens = 50.0 # Assumed ideal length
        # Normalized deviation from the assumed optimal length
        ratio = (approx_tokens - optimal_tokens) / optimal_tokens
# Smooth gaussian-like decay towards -1.0
e = math.exp(- (ratio ** 2)) - 1.0
return e
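
    # Worked values for the efficiency curve (illustrative lengths, ~4 chars/token):
    #   200-char action -> ~50 tokens,  ratio  0.0 -> E = exp(0) - 1     =  0.0 (optimal)
    #   400-char action -> ~100 tokens, ratio  1.0 -> E = exp(-1) - 1    ≈ -0.632
    #   40-char action  -> ~10 tokens,  ratio -0.8 -> E = exp(-0.64) - 1 ≈ -0.473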
def compute_exploration_bonus(self, action_string: str, times_seen: int) -> float:
"""
[PAPER TRACEABILITY: Exploration via Entropy Bonus]
G. EXPLORATION VIA ENTROPY BONUS
Computes output diversity (token variance) and adds bonus.
X = (entropy_bonus) / sqrt(1 + times_seen_problem)
"""
# Simple structural entropy estimation (unique character distribution variance)
length = len(action_string)
if length > 0:
unique_ratio = len(set(action_string)) / length
entropy_bonus = math.log1p(unique_ratio) # Non-linear scaling
else:
entropy_bonus = 0.0
return entropy_bonus / math.sqrt(1.0 + times_seen)
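
    # Worked values for the exploration bonus (illustrative string):
    #   "abcde" -> unique_ratio = 1.0, entropy_bonus = log1p(1.0) ≈ 0.693
    #     times_seen = 0 -> 0.693 / sqrt(1) ≈ 0.693
    #     times_seen = 3 -> 0.693 / sqrt(4) ≈ 0.347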
def detect_trivial_output(self, action_string: str) -> bool:
"""Anti-reward hacking: detect trivial constant outputs"""
        # Trivial if extremely short, or long but built from very few distinct characters
if len(action_string) < 2:
return True
unique_chars = len(set(action_string))
if unique_chars < 3 and len(action_string) > 10:
return True
return False
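
    # Examples of the trivial-output guard (illustrative strings):
    #   "x"              -> True  (fewer than 2 characters)
    #   "ababababababab" -> True  (14 chars but only 2 distinct characters)
    #   "x = 7"          -> False (only strings longer than 10 chars are entropy-checked)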
def compute_reward(self,
correctness: float,
reasoning_quality: float,
process_supervision: float,
reflection_score: float,
action_str: str,
final_answer: str,
history: List[Dict[str, Any]],
times_seen_problem: int) -> Tuple[float, Dict[str, float]]:
"""
[PAPER TRACEABILITY: DeepSeekMath-inspired reward composite]
R = 0.4*C + 0.2*Q_smooth + 0.15*D + 0.1*E + 0.1*P + 0.1*R + 0.15*X + noise
"""
        if self.detect_trivial_output(action_str):
            # Anti-reward-hacking: trivial constant outputs are strongly penalized.
            # Keys mirror the main components dict so consumers see a uniform schema.
            components = {"total_reward": -1.0, "C_correctness": 0.0, "Q_reasoning": 0.0,
                          "P_process_supervision": 0.0, "R_reflection": 0.0, "D_diversity": 0.0,
                          "E_efficiency": -1.0, "X_exploration": 0.0, "noise": 0.0}
            return -1.0, components
c = correctness
q = reasoning_quality
d = self.compute_diversity(final_answer, history)
# If repeated answer, C is zeroed to prevent hacking
if d < 0:
c = 0.0
e = self.compute_efficiency(action_str)
x = self.compute_exploration_bonus(action_str, times_seen_problem)
        noise = random.gauss(0, 0.05)  # Diagnostic only; excluded from the composite to keep it bounded
# Smoothly squish reasoning quality using tanh to bound its impact
q_smooth = math.tanh(q)
        # Normalize the remaining components into the [0, 1] domain
p_norm = (process_supervision + 1.0) / 2.0 # Scales [-1, 1] to [0, 1]
r_norm = (reflection_score + 0.5) / 1.5 # Scales [-0.5, 1.0] to [0, 1]
q_norm = min(1.0, max(0.0, q_smooth))
        # Simplified composite reward equation, strictly bounded in [0, 1]:
        # the coefficients sum exactly to 1.0, and noise is excluded to preserve the bounds.
        total_r = (0.4 * c) + (0.3 * q_norm) + (0.2 * p_norm) + (0.1 * r_norm)
components = {
"total_reward": total_r,
"C_correctness": c,
"Q_reasoning": q_smooth,
"P_process_supervision": process_supervision,
"R_reflection": reflection_score,
"D_diversity": d,
"E_efficiency": e,
"X_exploration": x,
"noise": noise
}
return total_r, components
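

# Minimal usage sketch (illustrative scores and strings, not from any dataset;
# the input ranges follow the normalization comments in compute_reward):
if __name__ == "__main__":
    rs = RewardSystem()
    history = [{"final_answer": "12"}]  # One hypothetical earlier attempt
    total, parts = rs.compute_reward(
        correctness=1.0,
        reasoning_quality=0.8,    # tanh(0.8) ≈ 0.664 after smoothing
        process_supervision=0.5,  # -> p_norm = 0.75
        reflection_score=0.25,    # -> r_norm = 0.5
        action_str="Step 1: factor the quadratic. Step 2: solve each root. Answer: 7",
        final_answer="7",         # Differs from history, so D = 1.0 and C is kept
        history=history,
        times_seen_problem=2,
    )
    print(f"total reward: {total:.3f}")  # 0.4*1.0 + 0.3*0.664 + 0.2*0.75 + 0.1*0.5 ≈ 0.799
    for name, value in parts.items():
        print(f"  {name}: {value:+.3f}")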