import math
import random
from typing import Any, Dict, List, Tuple


class RewardSystem:
    def __init__(self, max_len: int = 1000):
        self.max_len = max_len

    def compute_diversity(self, current_answer: str,
                          history: List[Dict[str, Any]]) -> float:
        """
        D = diversity (difference from past attempts).

        If the answer repeats a previous attempt, returns a steep
        exponential penalty: D = -exp(1.0) (approx. -2.718).
        Otherwise returns D = 1.0.
        """
        if not history:
            return 1.0
        cur_ans_clean = current_answer.strip().lower()
        for attempt in history:
            prev_ans = attempt.get('final_answer', '').strip().lower()
            if prev_ans == cur_ans_clean:
                return -math.exp(1.0)  # approx. -2.718, steep penalty
        # If unique, give the full diversity bonus
        return 1.0

    def compute_efficiency(self, action_string: str) -> float:
        """
        E = efficiency. Uses a Gaussian penalty curve:

            E = exp(-(len_ratio)^2) - 1

        which smoothly penalizes overly verbose answers, decaying
        towards -1.0 as the length drifts from the optimum.
        """
        approx_tokens = len(action_string) / 4.0  # rough chars-per-token heuristic
        optimal_tokens = 50.0  # assumed ideal length
        # Normalized deviation from the optimal length
        ratio = (approx_tokens - optimal_tokens) / optimal_tokens
        # Smooth Gaussian-like decay towards -1.0
        return math.exp(-(ratio ** 2)) - 1.0

    def compute_exploration_bonus(self, action_string: str,
                                  times_seen: int) -> float:
        """
        [PAPER TRACEABILITY: Section G, Exploration via Entropy Bonus]

        Estimates output diversity from the unique-character ratio and
        decays the bonus as the same problem is revisited:

            X = entropy_bonus / sqrt(1 + times_seen)
        """
        # Simple structural entropy estimate: unique-character distribution
        length = len(action_string)
        if length > 0:
            unique_ratio = len(set(action_string)) / length
            entropy_bonus = math.log1p(unique_ratio)  # non-linear scaling
        else:
            entropy_bonus = 0.0
        return entropy_bonus / math.sqrt(1.0 + times_seen)

    def detect_trivial_output(self, action_string: str) -> bool:
        """Anti-reward-hacking: detect trivial, near-constant outputs."""
        # Very short outputs carry no usable signal
        if len(action_string) < 2:
            return True
        # Long outputs built from fewer than 3 distinct characters are
        # almost certainly a single character (or pair) repeated
        unique_chars = len(set(action_string))
        if unique_chars < 3 and len(action_string) > 10:
            return True
        return False

    def compute_reward(self, correctness: float, reasoning_quality: float,
                       process_supervision: float, reflection_score: float,
                       action_str: str, final_answer: str,
                       history: List[Dict[str, Any]],
                       times_seen_problem: int) -> Tuple[float, Dict[str, float]]:
        """
        [PAPER TRACEABILITY: DeepSeekMath-inspired reward composite]

        Simplified composite reward, strictly bounded in [0, 1]:

            R = 0.4*C + 0.3*Q_norm + 0.2*P_norm + 0.1*R_norm

        D (diversity), E (efficiency), X (exploration), and noise are
        still computed and logged in the components dict for
        diagnostics, but no longer enter R.
        """
        if self.detect_trivial_output(action_str):
            # Anti-hacking: trivial outputs are strongly penalized
            components = {
                "total_reward": -1.0,
                "C_correctness": 0.0,
                "Q_reasoning": 0.0,
                "P_process_supervision": 0.0,
                "R_reflection": 0.0,
                "D_diversity": 0.0,
                "E_efficiency": -1.0,
                "X_exploration": 0.0,
                "noise": 0.0,
            }
            return -1.0, components

        c = correctness
        q = reasoning_quality
        d = self.compute_diversity(final_answer, history)
        # If the answer repeats a past attempt, zero C to prevent hacking
        if d < 0:
            c = 0.0
        e = self.compute_efficiency(action_str)
        x = self.compute_exploration_bonus(action_str, times_seen_problem)
        noise = random.gauss(0, 0.05)  # logged only; not added to the reward

        # Smoothly squash reasoning quality with tanh to bound its impact
        q_smooth = math.tanh(q)

        # Normalize the remaining signals into the [0, 1] domain
        p_norm = (process_supervision + 1.0) / 2.0  # scales [-1, 1] to [0, 1]
        r_norm = (reflection_score + 0.5) / 1.5     # scales [-0.5, 1.0] to [0, 1]
        q_norm = min(1.0, max(0.0, q_smooth))

        # Simplified composite reward equation (strictly bounded [0, 1]):
        # base coefficients sum exactly to 1.0; noise is excluded to keep
        # the bound.
        total_r = (0.4 * c) + (0.3 * q_norm) + (0.2 * p_norm) + (0.1 * r_norm)

        components = {
            "total_reward": total_r,
            "C_correctness": c,
            "Q_reasoning": q_smooth,
            "P_process_supervision": process_supervision,
            "R_reflection": reflection_score,
            "D_diversity": d,
            "E_efficiency": e,
            "X_exploration": x,
            "noise": noise,
        }
        return total_r, components
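

# --- Usage sketch (illustrative only, not part of the training loop) ---
# Demonstrates one scoring call end to end. Every input value below is
# hypothetical: in a real run, correctness / reasoning_quality /
# process_supervision / reflection_score would come from graders,
# `history` from prior rollouts, and `times_seen_problem` from a visit
# counter.
if __name__ == "__main__":
    rs = RewardSystem()
    history = [{"final_answer": "42"}]  # one earlier, different attempt

    total, parts = rs.compute_reward(
        correctness=1.0,
        reasoning_quality=0.8,
        process_supervision=0.5,   # grader score in [-1, 1]
        reflection_score=0.25,     # grader score in [-0.5, 1.0]
        action_str="Let x be the unknown; solving 2x + 3 = 17 gives x = 7.",
        final_answer="7",
        history=history,
        times_seen_problem=2,
    )

    print(f"total reward = {total:.4f}")
    for name, value in parts.items():
        print(f"  {name}: {value:+.4f}")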