import random
import math
from typing import Dict, Any, List, Tuple

class RewardSystem:
    def __init__(self, max_len: int = 1000):
        self.max_len = max_len  # Maximum action length; not referenced by the scoring methods below

    def compute_diversity(self, current_answer: str, history: List[Dict[str, Any]]) -> float:
        """
        D = diversity (difference from past attempts)
        If repeated answer, returns a steep exponential penalty: D = -exp(1.0).
        Otherwise, returns D = 1.0.
        """
        if not history:
            return 1.0
            
        cur_ans_clean = current_answer.strip().lower()
        
        for attempt in history:
            prev_ans = attempt.get('final_answer', '').strip().lower()
            if prev_ans == cur_ans_clean:
                return -math.exp(1.0)  # Approximately -2.72; steep penalty for a repeated answer
                
        # If unique, give full diversity bonus
        return 1.0
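
    # Worked example for compute_diversity: with history = [{"final_answer":
    # "Paris"}], a current answer of " paris " normalizes to the same string,
    # so D = -e ≈ -2.72; any unseen answer scores D = 1.0.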

    def compute_efficiency(self, action_string: str) -> float:
        """
        E = efficiency. We use a Gaussian penalty curve:
        E = exp(- (len_ratio)^2 ) - 1
        This smoothly penalizes overly verbose answers.
        """
        approx_tokens = len(action_string) / 4.0  # Heuristic: roughly 4 characters per token
        optimal_tokens = 50.0  # Assumed ideal length
        
        # Relative deviation from the assumed optimal length
        ratio = (approx_tokens - optimal_tokens) / optimal_tokens
        
        # Smooth gaussian-like decay towards -1.0
        e = math.exp(- (ratio ** 2)) - 1.0
        return e
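
    # Worked example for compute_efficiency (assuming ~4 chars per token and
    # a 50-token optimum): a 200-char action gives ratio = 0 and E = 0 (no
    # penalty), while a 400-char action gives ratio = 1 and E = exp(-1) - 1 ≈ -0.63.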
        
    def compute_exploration_bonus(self, action_string: str, times_seen: int) -> float:
        """
        [PAPER TRACEABILITY: Exploration via Entropy Bonus]
        G. EXPLORATION VIA ENTROPY BONUS
        Computes output diversity (token variance) and adds bonus.
        X = (entropy_bonus) / sqrt(1 + times_seen_problem)
        """
        # Simple structural entropy estimate based on the unique-character ratio
        length = len(action_string)
        if length > 0:
            unique_ratio = len(set(action_string)) / length
            entropy_bonus = math.log1p(unique_ratio)  # Non-linear scaling
        else:
            entropy_bonus = 0.0
            
        return entropy_bonus / math.sqrt(1.0 + times_seen)
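
    # Worked example for compute_exploration_bonus: an all-unique-character
    # string has unique_ratio = 1, so entropy_bonus = log1p(1) ≈ 0.693;
    # X ≈ 0.693 on first exposure (times_seen = 0) and ≈ 0.347 after three
    # repeats (times_seen = 3, since sqrt(1 + 3) = 2).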

    def detect_trivial_output(self, action_string: str) -> bool:
        """Anti-reward hacking: detect trivial constant outputs"""
        # If the output is just a single character repeated or very low entropy
        if len(action_string) < 2:
            return True
        unique_chars = len(set(action_string))
        if unique_chars < 3 and len(action_string) > 10:
            return True
        return False
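
    # Worked example for detect_trivial_output: "" (length < 2) and
    # "aaaaaaaaaaaa" (1 unique character, length 12) are both flagged as
    # trivial; a normal sentence passes the check.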

    def compute_reward(self, 
                      correctness: float, 
                      reasoning_quality: float,
                      process_supervision: float,
                      reflection_score: float,
                      action_str: str, 
                      final_answer: str,
                      history: List[Dict[str, Any]],
                      times_seen_problem: int) -> Tuple[float, Dict[str, float]]:
        """
        [PAPER TRACEABILITY: DeepSeekMath-inspired reward composite]
        R = 0.4*C + 0.2*Q_smooth + 0.15*D + 0.1*E + 0.1*P + 0.1*R + 0.15*X + noise
        """
        if self.detect_trivial_output(action_str):
            # Anti-reward hacking: trivial constant outputs are strongly penalized
            components = {"total_reward": -1.0, "C_correctness": 0.0,
                          "Q_reasoning": 0.0, "P_process_supervision": 0.0,
                          "R_reflection": 0.0, "D_diversity": 0.0,
                          "E_efficiency": -1.0, "X_exploration": 0.0,
                          "noise": 0.0}
            return -1.0, components
            
        c = correctness
        q = reasoning_quality
        d = self.compute_diversity(final_answer, history)
        
        # If repeated answer, C is zeroed to prevent hacking
        if d < 0:
            c = 0.0
            
        e = self.compute_efficiency(action_str)
        x = self.compute_exploration_bonus(action_str, times_seen_problem)
        
        noise = random.gauss(0, 0.05)  # Diagnostic only; excluded from the bounded total
        
        # Smoothly squish reasoning quality using tanh to bound its impact
        q_smooth = math.tanh(q)
        
        # Normalize component scores into the [0, 1] domain
        p_norm = (process_supervision + 1.0) / 2.0  # Scales [-1, 1] to [0, 1]
        r_norm = (reflection_score + 0.5) / 1.5     # Scales [-0.5, 1.0] to [0, 1]
        q_norm = min(1.0, max(0.0, q_smooth))
        
        # Composite reward, strictly bounded in [0, 1]: the coefficients sum
        # exactly to 1.0, and noise is excluded so the bound holds.
        total_r = (0.4 * c) + (0.3 * q_norm) + (0.2 * p_norm) + (0.1 * r_norm)
        components = {
            "total_reward": total_r,
            "C_correctness": c,
            "Q_reasoning": q_smooth,
            "P_process_supervision": process_supervision,
            "R_reflection": reflection_score,
            "D_diversity": d,
            "E_efficiency": e,
            "X_exploration": x,
            "noise": noise
        }
        
        return total_r, components
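
# Minimal usage sketch. The inputs below are hypothetical, illustrative
# values, not scores prescribed by the reward design.
if __name__ == "__main__":
    rs = RewardSystem()
    history = [{"final_answer": "42"}]
    total, parts = rs.compute_reward(
        correctness=1.0,
        reasoning_quality=0.8,
        process_supervision=0.5,   # expected in [-1, 1]; normalized internally
        reflection_score=0.25,     # expected in [-0.5, 1.0]; normalized internally
        action_str="Step 1: factor the expression. Step 2: simplify and check.",
        final_answer="x + 1",
        history=history,
        times_seen_problem=2,
    )
    print(f"total reward: {total:.3f}")
    for name, value in parts.items():
        print(f"  {name}: {value:+.3f}")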