Spaces:
Sleeping
Sleeping
AutoMathReasoner
Browse files- .gitignore +0 -2
- env/__init__.py +1 -0
- env/environment.py +124 -0
- env/generator.py +121 -0
- env/models.py +31 -0
- env/rewards.py +120 -0
- env/verifier.py +117 -0
.gitignore
CHANGED
|
@@ -6,8 +6,6 @@ __pycache__/
|
|
| 6 |
# Virtual environments
|
| 7 |
.venv/
|
| 8 |
venv/
|
| 9 |
-
env/
|
| 10 |
-
ENV/
|
| 11 |
env.bak/
|
| 12 |
venv.bak/
|
| 13 |
|
|
|
|
| 6 |
# Virtual environments
|
| 7 |
.venv/
|
| 8 |
venv/
|
|
|
|
|
|
|
| 9 |
env.bak/
|
| 10 |
venv.bak/
|
| 11 |
|
env/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Environment package
|
env/environment.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from uuid import uuid4
|
| 3 |
+
from collections import deque
|
| 4 |
+
from typing import Dict, Any, List
|
| 5 |
+
|
| 6 |
+
from openenv.core.env_server.interfaces import Environment
|
| 7 |
+
from openenv.core.env_server.types import State
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from .models import AutomathreasonerAction, AutomathreasonerObservation
|
| 11 |
+
from .generator import TaskGenerationEngine
|
| 12 |
+
from .verifier import VerifierSystem
|
| 13 |
+
from .rewards import RewardSystem
|
| 14 |
+
except ImportError:
|
| 15 |
+
from env.models import AutomathreasonerAction, AutomathreasonerObservation
|
| 16 |
+
from env.generator import TaskGenerationEngine
|
| 17 |
+
from env.verifier import VerifierSystem
|
| 18 |
+
from env.rewards import RewardSystem
|
| 19 |
+
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
class AutomathreasonerEnvironment(Environment):
    """Single-problem math environment with an accuracy-driven curriculum.

    Flow per episode: ``reset()`` draws a new problem from the generator at
    the current difficulty band; each ``step()`` verifies the agent's
    reasoning/answer, computes a composite reward, and ends the episode on a
    correct answer or after ``max_steps`` attempts. Finished-episode outcomes
    feed a rolling window that raises/lowers difficulty on the next reset.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self.generator = TaskGenerationEngine()
        self.verifier = VerifierSystem()
        self.reward_system = RewardSystem(max_len=2000)

        # Curriculum tracking
        self.difficulty_level = 2.0  # Starting difficulty
        # Outcomes of the last 20 *finished* episodes (1 correct, 0 incorrect).
        self.rolling_results = deque(maxlen=20)

        # Current problem state
        self.current_problem = ""
        self.current_solution = ""
        self.times_seen_problem = 0
        self.history: List[Dict[str, Any]] = []
        self.max_steps = 3

    def _update_curriculum(self):
        """Update difficulty based on rolling accuracy.

        Requires at least 5 recorded episodes. Accuracy above 0.7 raises the
        difficulty by 0.5; below 0.6 lowers it, floored at 1.0.
        """
        if len(self.rolling_results) >= 5:
            accuracy = sum(self.rolling_results) / len(self.rolling_results)
            if accuracy > 0.7:
                self.difficulty_level += 0.5
            elif accuracy < 0.6:
                self.difficulty_level = max(1.0, self.difficulty_level - 0.5)
            logger.info(f"Curriculum Updated: Accuracy={accuracy:.2f}, New Difficulty={self.difficulty_level}")

    def reset(self) -> AutomathreasonerObservation:
        """Reset environment to a new problem (after a curriculum update)."""
        self._update_curriculum()

        self._state = State(episode_id=str(uuid4()), step_count=0)
        task = self.generator.generate_task(target_difficulty_band=self.difficulty_level)

        self.current_problem = task['problem']
        self.current_solution = task['solution']
        # The generator returns its own continuous difficulty score; we'll expose the target difficulty band
        self.times_seen_problem = 0
        self.history = []

        return AutomathreasonerObservation(
            problem_text=self.current_problem,
            difficulty_level=self.difficulty_level,
            history=[],
            reward=0.0,
            done=False
        )

    def step(self, action: AutomathreasonerAction) -> AutomathreasonerObservation:  # type: ignore[override]
        """Verify one attempt, compute its reward, and report episode progress."""
        self._state.step_count += 1

        # Verification: correctness, reasoning quality, process supervision, reflection.
        c, q, p_sup, r_ref = self.verifier.verify(action.reasoning, action.final_answer, self.current_solution)

        # Reward
        action_str = f"{action.reasoning} \n {action.final_answer}"
        total_r, components = self.reward_system.compute_reward(
            correctness=c,
            reasoning_quality=q,
            process_supervision=p_sup,
            reflection_score=r_ref,
            action_str=action_str,
            final_answer=action.final_answer,
            history=self.history,
            times_seen_problem=self.times_seen_problem
        )

        self.times_seen_problem += 1

        # Update history.
        # BUGFIX: also store the answer under 'final_answer' — the reward
        # system's diversity check (RewardSystem.compute_diversity) looks up
        # that key, so storing only 'prediction' made the repeated-answer
        # penalty dead code. 'prediction' is kept for observation consumers.
        attempt = {
            "prediction": action.final_answer,
            "final_answer": action.final_answer,
            "correctness": c
        }
        self.history.append(attempt)
        # Keep only last 3 attempts for observation
        obs_history = self.history[-3:]

        is_correct = (c == 1.0)
        done = is_correct or self._state.step_count >= self.max_steps

        if done:
            self.rolling_results.append(1 if is_correct else 0)

        return AutomathreasonerObservation(
            problem_text=self.current_problem,
            difficulty_level=self.difficulty_level,
            history=obs_history,
            reward=total_r,
            done=done,
            metadata={
                "reward_components": components,
                "ground_truth": self.current_solution if done else "HIDDEN",  # Only reveal on done or not at all
                "is_correct": is_correct
            }
        )

    @property
    def state(self) -> State:
        """Current episode bookkeeping (episode id and step count)."""
        return self._state
|
env/generator.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from typing import Dict, Any, Tuple
|
| 3 |
+
|
| 4 |
+
class TaskGenerationEngine:
    """Procedural generator for math problems (arithmetic, algebra, word).

    Difficulty is scored as D = steps_required + number_complexity +
    operations_count; ``generate_task`` maps a target difficulty band onto a
    complexity parameter and samples a problem type.
    """

    def __init__(self):
        # NOTE: these template lists are currently unused — the generate_*
        # methods build their strings inline with f-strings. They are kept
        # as public attributes for backward compatibility / reference.
        self.arithmetic_templates = [
            "What is {a} + {b}?",
            "Calculate {a} - {b}.",
            "Find the product of {a} and {b}.",
            "What is {a} divided by {b}?"
        ]
        self.algebra_templates = [
            "Solve for x: {a}x + {b} = {c}",
            "If {a}y - {b} = {c}, what is y?"
        ]
        self.word_problem_templates = [
            "John has {a} apples. He buys {b} more. Then he gives away {c}. How many apples does John have now?",
            "A train travels at {a} km/h for {b} hours. How far does it travel?"
        ]

    def _score_difficulty(self, steps: int, complexity: int, operations: int) -> float:
        """D = steps_required + number_complexity + operations_count"""
        return float(steps + complexity + operations)

    def generate_arithmetic(self, complexity: int) -> Tuple[str, float, str]:
        """Generate a single-operation arithmetic problem.

        Returns (problem_text, difficulty, answer). Division operands are
        scaled so the quotient is always an exact integer.
        """
        a = random.randint(1 * complexity, 10 * complexity)
        b = random.randint(1 * complexity, 10 * complexity)
        op = random.choice(['+', '-', '*', '/'])

        operations = 1
        steps = 1

        if op == '+':
            problem = f"What is {a} + {b}?"
            answer = str(a + b)
        elif op == '-':
            problem = f"Calculate {a} - {b}."
            answer = str(a - b)
        elif op == '*':
            problem = f"Find the product of {a} and {b}."
            answer = str(a * b)
        else:  # '/' — final branch made exhaustive so problem/answer are always bound
            # Ensure divisible: scale the dividend by the divisor.
            b = max(1, b)
            a = a * b
            problem = f"What is {a} divided by {b}?"
            answer = str(a // b)

        difficulty = self._score_difficulty(steps, complexity, operations)
        return problem, difficulty, answer

    def generate_algebra(self, complexity: int) -> Tuple[str, float, str]:
        """Generate a one-variable linear equation with an integer solution.

        The solution x is drawn first, then the constant term is derived so
        the equation is exactly solvable. Returns (problem, difficulty, answer).
        """
        a = random.randint(1, 5 * complexity)
        x = random.randint(1, 10)
        b = random.randint(1, 10 * complexity)
        op = random.choice(['+', '-'])

        operations = 2
        steps = 2

        if op == '+':
            c = a * x + b
            problem = f"Solve for x: {a}x + {b} = {c}"
        else:
            c = a * x - b
            problem = f"If {a}x - {b} = {c}, what is x?"

        answer = str(x)
        difficulty = self._score_difficulty(steps, complexity, operations)
        return problem, difficulty, answer

    def generate_word_problem(self, complexity: int) -> Tuple[str, float, str]:
        """Generate one of two word-problem templates (apples / train distance).

        The apples template counts as two operations/steps; the train one as
        a single operation. Returns (problem, difficulty, answer).
        """
        t = random.choice([0, 1])
        operations = 2
        steps = 2

        if t == 0:
            a = random.randint(5 * complexity, 15 * complexity)
            b = random.randint(2 * complexity, 10 * complexity)
            # c <= a + b guarantees a non-negative answer.
            c = random.randint(1, a + b)
            problem = f"John has {a} apples. He buys {b} more. Then he gives away {c}. How many apples does John have now?"
            answer = str(a + b - c)
        else:
            a = random.randint(20 * complexity, 60 * complexity)
            b = random.randint(1, 5 * complexity)
            problem = f"A train travels at {a} km/h for {b} hours. How far does it travel?"
            answer = str(a * b)
            operations = 1
            steps = 1

        difficulty = self._score_difficulty(steps, complexity, operations)
        return problem, difficulty, answer

    def generate_task(self, target_difficulty_band: float) -> Dict[str, Any]:
        """
        Generate a task targeting a general difficulty band.
        target_difficulty_band can guide the complexity parameter.
        Returns {"problem", "difficulty", "solution", "type"}.
        """
        complexity = max(1, int(target_difficulty_band / 2))

        # Higher complexity shifts sampling toward algebra/word problems.
        prob_type = random.choices(
            ['arithmetic', 'algebra', 'word_problem'],
            weights=[1, max(0.5, complexity - 1), max(0.5, complexity - 1)]
        )[0]

        if prob_type == 'arithmetic':
            problem, diff, ans = self.generate_arithmetic(complexity)
        elif prob_type == 'algebra':
            problem, diff, ans = self.generate_algebra(complexity)
        else:
            problem, diff, ans = self.generate_word_problem(complexity)

        return {
            "problem": problem,
            "difficulty": diff,
            "solution": ans,
            "type": prob_type
        }
|
env/models.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Data models for the AutoMathReasoner Environment.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from typing import List, Dict, Any
|
| 12 |
+
from pydantic import Field
|
| 13 |
+
from openenv.core.env_server.types import Action, Observation
|
| 14 |
+
|
| 15 |
+
class AutomathreasonerAction(Action):
    """Action for the AutoMathReasoner environment - containing reasoning and final answer."""

    # Free-text chain-of-thought submitted with the answer; scored by the
    # environment's heuristic verifier (quality/process/reflection scores).
    reasoning: str = Field(..., description="The step-by-step mathematical reasoning.")
    # Compared against the generated ground-truth solution string (exact
    # match, numeric tolerance, or expression evaluation).
    final_answer: str = Field(..., description="The final numerical or algebraic answer.")
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class AutomathreasonerObservation(Observation):
    """Observation from the AutoMathReasoner environment."""

    # Text of the currently active generated problem (empty before reset).
    problem_text: str = Field(default="", description="The text of the generated math problem.")
    # The environment's current curriculum difficulty band.
    difficulty_level: float = Field(default=1.0, description="The current difficulty level of the problem.")
    # At most the last 3 attempts on this problem, as dicts produced by the
    # environment's step() method.
    history: List[Dict[str, Any]] = Field(default_factory=list, description="History of the last 3 attempts for this problem.")

    # Required by OpenEnv base class
    reward: float = Field(default=0.0, description="Reward received from the previous action.")
    done: bool = Field(default=False, description="Whether the episode has ended.")
|
env/rewards.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import math
|
| 3 |
+
from typing import Dict, Any, List, Tuple
|
| 4 |
+
|
| 5 |
+
class RewardSystem:
    """Composite reward calculator with anti-reward-hacking safeguards.

    Combines correctness, reasoning quality, process supervision,
    reflection, diversity, efficiency, and exploration terms plus small
    Gaussian noise (see ``compute_reward`` for the exact weighting).
    """

    def __init__(self, max_len: int = 1000):
        # Length budget for actions. NOTE: currently unused by the reward
        # terms (efficiency uses its own token target below); kept for
        # backward compatibility with callers that pass it.
        self.max_len = max_len

    def compute_diversity(self, current_answer: str, history: List[Dict[str, Any]]) -> float:
        """
        D = diversity (difference from past attempts)
        If repeated answer, returns a steep exponential penalty: D = -exp(1.0).
        Otherwise, returns D = 1.0.
        """
        if not history:
            return 1.0

        cur_ans_clean = current_answer.strip().lower()

        for attempt in history:
            # BUGFIX: the environment records attempts under the 'prediction'
            # key, while this check previously read only 'final_answer' — so
            # repeated answers were never detected. Accept either key.
            prev_raw = attempt.get('final_answer', attempt.get('prediction', ''))
            if prev_raw.strip().lower() == cur_ans_clean:
                return -math.exp(1.0)  # approx -2.72, steep penalty

        # If unique, give full diversity bonus
        return 1.0

    def compute_efficiency(self, action_string: str) -> float:
        """
        E = efficiency. We use a Gaussian penalty curve:
        E = exp(- (len_ratio)^2 ) - 1
        This smoothly penalizes overly verbose answers; outputs at or below
        the optimal length score exactly 0.0.
        """
        approx_tokens = len(action_string) / 4.0  # rough chars-per-token heuristic
        optimal_tokens = 50.0  # Assumed ideal length

        # Only lengths beyond the optimum are penalized (ratio clamps at 0).
        ratio = max(0.0, (approx_tokens - optimal_tokens) / optimal_tokens)

        # Smooth gaussian-like decay towards -1.0
        e = math.exp(- (ratio ** 2)) - 1.0
        return e

    def compute_exploration_bonus(self, action_string: str, times_seen: int) -> float:
        """
        [PAPER TRACEABILITY: Exploration via Entropy Bonus]
        G. EXPLORATION VIA ENTROPY BONUS
        Computes output diversity (token variance) and adds bonus.
        X = (entropy_bonus) / sqrt(1 + times_seen_problem)
        The bonus decays as the same problem is attempted repeatedly.
        """
        # Simple structural entropy estimation (unique character distribution variance)
        length = len(action_string)
        if length > 0:
            unique_ratio = len(set(action_string)) / length
            entropy_bonus = math.log1p(unique_ratio)  # Non-linear scaling
        else:
            entropy_bonus = 0.0

        return entropy_bonus / math.sqrt(1.0 + times_seen)

    def detect_trivial_output(self, action_string: str) -> bool:
        """Anti-reward hacking: detect trivial constant outputs"""
        # Degenerate: fewer than 2 characters total.
        if len(action_string) < 2:
            return True
        # Low-entropy spam: a longer output built from under 3 distinct chars.
        unique_chars = len(set(action_string))
        if unique_chars < 3 and len(action_string) > 10:
            return True
        return False

    def compute_reward(self,
                       correctness: float,
                       reasoning_quality: float,
                       process_supervision: float,
                       reflection_score: float,
                       action_str: str,
                       final_answer: str,
                       history: List[Dict[str, Any]],
                       times_seen_problem: int) -> Tuple[float, Dict[str, float]]:
        """
        [PAPER TRACEABILITY: DeepSeekMath-inspired reward composite]
        R = 0.35*C + 0.15*tanh(Q) + 0.1*P + 0.1*R_ref + 0.15*D + 0.05*E + 0.1*X + noise
        (Docstring corrected to match the weights actually applied below;
        it previously quoted a different, stale formula.)

        Returns (total_reward, per-component breakdown dict).
        """
        if self.detect_trivial_output(action_str):
            # Anti-hacking strongly penalized.
            # BUGFIX: component keys now use the same schema as the normal
            # path ("C_correctness", ...) so metadata consumers see a
            # consistent breakdown either way.
            components = {
                "total_reward": -1.0,
                "C_correctness": 0.0,
                "Q_reasoning": 0.0,
                "P_process_supervision": 0.0,
                "R_reflection": 0.0,
                "D_diversity": 0.0,
                "E_efficiency": -1.0,
                "X_exploration": 0.0,
                "noise": 0.0
            }
            return -1.0, components

        c = correctness
        q = reasoning_quality
        d = self.compute_diversity(final_answer, history)

        # If repeated answer, C is zeroed to prevent hacking
        if d < 0:
            c = 0.0

        e = self.compute_efficiency(action_str)
        x = self.compute_exploration_bonus(action_str, times_seen_problem)

        # Small stochastic jitter to discourage deterministic exploitation.
        noise = random.gauss(0, 0.05)

        # Smoothly squish reasoning quality using tanh to bound its impact
        q_smooth = math.tanh(q)

        # Composite reward equation (weights documented in the docstring).
        total_r = (0.35 * c) + (0.15 * q_smooth) + (0.1 * process_supervision) + (0.1 * reflection_score) + (0.15 * d) + (0.05 * e) + (0.1 * x) + noise

        components = {
            "total_reward": total_r,
            "C_correctness": c,
            "Q_reasoning": q_smooth,
            "P_process_supervision": process_supervision,
            "R_reflection": reflection_score,
            "D_diversity": d,
            "E_efficiency": e,
            "X_exploration": x,
            "noise": noise
        }

        return total_r, components
|
env/verifier.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import math
|
| 3 |
+
from typing import Dict, Any, Tuple
|
| 4 |
+
|
| 5 |
+
class VerifierSystem:
    """Multi-strategy verifier for answers plus heuristic reasoning scores."""

    # Cue words suggesting structured reasoning (mock LLM-judge heuristic).
    _STEP_MARKERS = ('step', 'first', 'then', 'because', 'therefore', 'equals', '=', '+', '-')
    # Phrases that signal an explicit self-reflection attempt.
    _REFLECTION_PHRASES = ("what could be wrong", "wait,", "let me check", "alternatively")

    def __init__(self):
        pass

    def check_exact_match(self, prediction: str, ground_truth: str) -> bool:
        """1. Exact match after trimming whitespace and lower-casing."""
        return prediction.strip().lower() == ground_truth.strip().lower()

    def check_numeric_tolerance(self, prediction: str, ground_truth: str, tol: float = 1e-4) -> bool:
        """2. Numeric comparison within a relative/absolute tolerance."""
        try:
            predicted = float(prediction.strip())
            expected = float(ground_truth.strip())
            return math.isclose(predicted, expected, rel_tol=tol, abs_tol=tol)
        except ValueError:
            # Either side was not parseable as a number.
            return False

    def check_python_execution(self, prediction: str, ground_truth: str) -> bool:
        """3. Evaluate the prediction as a Python expression and compare.

        SECURITY NOTE: eval with ``__builtins__`` set to None is NOT a real
        sandbox — expressions can still escape it. Only use with trusted or
        already-sandboxed input.
        """
        restricted_globals = {"__builtins__": None, "math": math}
        try:
            value = eval(prediction.strip(), restricted_globals, {})
            try:
                expected = float(ground_truth.strip())
                return math.isclose(float(value), expected, rel_tol=1e-4, abs_tol=1e-4)
            except ValueError:
                # Non-numeric ground truth: fall back to string comparison.
                return str(value).strip().lower() == ground_truth.strip().lower()
        except Exception:
            # Any evaluation failure counts as "not verified".
            return False

    def mock_llm_judge(self, reasoning: str, prediction: str, ground_truth: str) -> float:
        """4. Heuristic stand-in for an LLM judge.

        Returns reasoning quality Q in [0.0, 1.0]: a length bonus capped at
        0.4 plus a structure bonus (step-marker hits) capped at 0.6.
        """
        text = reasoning.lower()
        word_count = len(reasoning.split())
        length_bonus = min(0.4, word_count * 0.01)
        structure_bonus = min(0.6, sum(1 for marker in self._STEP_MARKERS if marker in text) * 0.1)
        score = 0.0 + length_bonus + structure_bonus
        return round(min(1.0, score), 2)

    def check_process_supervision(self, reasoning: str) -> float:
        """
        [PAPER TRACEABILITY: Process Supervision (Lightweight PRM)]
        Step-aware heuristic score in [-1, 1]: rewards explicit stepwise
        structure, penalizes short answers that assert conclusions.
        """
        text = reasoning.lower()
        score = 0.0

        # Explicit "step N" labels beat looser first/then phrasing.
        if "step 1" in text and "step 2" in text:
            score += 0.5
        elif "first" in text and ("then" in text or "next" in text):
            score += 0.3

        # Logical-jump penalty: very short yet claiming a derivation.
        if len(text.split()) < 10 and ("=" in text or "so" in text):
            score -= 0.5

        return max(-1.0, min(1.0, score))

    def check_reflection(self, reasoning: str, c: float) -> float:
        """
        [PAPER TRACEABILITY: Reflection Module]
        +1.0 when a reflection phrase accompanies a correct answer, -0.5 when
        reflection failed to fix a wrong one, 0.0 when no reflection occurs.
        """
        text = reasoning.lower()
        reflected = any(phrase in text for phrase in self._REFLECTION_PHRASES)
        if not reflected:
            return 0.0
        return 1.0 if c >= 1.0 else -0.5

    def verify(self, reasoning: str, prediction: str, ground_truth: str) -> Tuple[float, float, float, float]:
        """
        Run all verifiers.
        Returns Correctness (C), Reasoning Quality (Q), Process Supervision (P), and Reflection (R).
        """
        # First matching strategy wins; `or` short-circuits so the eval-based
        # check only runs when the cheaper ones fail.
        is_correct = (
            self.check_exact_match(prediction, ground_truth)
            or self.check_numeric_tolerance(prediction, ground_truth)
            or self.check_python_execution(prediction, ground_truth)
        )
        c = 1.0 if is_correct else 0.0

        q = self.mock_llm_judge(reasoning, prediction, ground_truth)
        p = self.check_process_supervision(reasoning)
        r = self.check_reflection(reasoning, c)

        return c, q, p, r
|