Pratap-K committed on
Commit
a8211b4
·
1 Parent(s): c1774db

AutoMathReasoner

Browse files
Files changed (7) hide show
  1. .gitignore +0 -2
  2. env/__init__.py +1 -0
  3. env/environment.py +124 -0
  4. env/generator.py +121 -0
  5. env/models.py +31 -0
  6. env/rewards.py +120 -0
  7. env/verifier.py +117 -0
.gitignore CHANGED
@@ -6,8 +6,6 @@ __pycache__/
6
  # Virtual environments
7
  .venv/
8
  venv/
9
- env/
10
- ENV/
11
  env.bak/
12
  venv.bak/
13
 
 
6
  # Virtual environments
7
  .venv/
8
  venv/
 
 
9
  env.bak/
10
  venv.bak/
11
 
env/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Environment package
env/environment.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from uuid import uuid4
3
+ from collections import deque
4
+ from typing import Dict, Any, List
5
+
6
+ from openenv.core.env_server.interfaces import Environment
7
+ from openenv.core.env_server.types import State
8
+
9
+ try:
10
+ from .models import AutomathreasonerAction, AutomathreasonerObservation
11
+ from .generator import TaskGenerationEngine
12
+ from .verifier import VerifierSystem
13
+ from .rewards import RewardSystem
14
+ except ImportError:
15
+ from env.models import AutomathreasonerAction, AutomathreasonerObservation
16
+ from env.generator import TaskGenerationEngine
17
+ from env.verifier import VerifierSystem
18
+ from env.rewards import RewardSystem
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
class AutomathreasonerEnvironment(Environment):
    """Math-reasoning environment with curriculum-based difficulty scaling.

    Each episode presents one generated math problem; the agent gets up to
    ``max_steps`` attempts at it. Difficulty is adjusted between episodes
    from a rolling window of episode outcomes.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self):
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self.generator = TaskGenerationEngine()
        self.verifier = VerifierSystem()
        self.reward_system = RewardSystem(max_len=2000)

        # Curriculum tracking
        self.difficulty_level = 2.0  # Starting difficulty
        # 1 for a correct episode, 0 otherwise; last 20 episodes only.
        self.rolling_results = deque(maxlen=20)

        # Current problem state
        self.current_problem = ""
        self.current_solution = ""
        self.times_seen_problem = 0
        self.history: List[Dict[str, Any]] = []
        self.max_steps = 3

    def _update_curriculum(self):
        """Update difficulty based on rolling accuracy.

        Needs at least 5 completed episodes before adjusting. Difficulty
        rises by 0.5 above 70% accuracy, falls by 0.5 (floored at 1.0)
        below 60%; the 60-70% band is a stability dead-zone.
        """
        if len(self.rolling_results) >= 5:
            accuracy = sum(self.rolling_results) / len(self.rolling_results)
            if accuracy > 0.7:
                self.difficulty_level += 0.5
            elif accuracy < 0.6:
                self.difficulty_level = max(1.0, self.difficulty_level - 0.5)
            logger.info(f"Curriculum Updated: Accuracy={accuracy:.2f}, New Difficulty={self.difficulty_level}")

    def reset(self) -> AutomathreasonerObservation:
        """Reset environment to a new problem.

        Applies any pending curriculum update, then generates a fresh task
        at the current difficulty band.
        """
        self._update_curriculum()

        self._state = State(episode_id=str(uuid4()), step_count=0)
        task = self.generator.generate_task(target_difficulty_band=self.difficulty_level)

        self.current_problem = task['problem']
        self.current_solution = task['solution']
        # The generator returns its own continuous difficulty score; we expose
        # the target difficulty band instead.
        self.times_seen_problem = 0
        self.history = []

        return AutomathreasonerObservation(
            problem_text=self.current_problem,
            difficulty_level=self.difficulty_level,
            history=[],
            reward=0.0,
            done=False
        )

    def step(self, action: AutomathreasonerAction) -> AutomathreasonerObservation:  # type: ignore[override]
        """Verify one attempt, score it, and advance the episode.

        The observation's metadata reveals the ground truth only once the
        episode is done.
        """
        self._state.step_count += 1

        # Verification: correctness, reasoning quality, process supervision,
        # and reflection scores.
        c, q, p_sup, r_ref = self.verifier.verify(action.reasoning, action.final_answer, self.current_solution)

        # Reward (computed against history BEFORE this attempt is recorded,
        # so the diversity term compares against previous answers only).
        action_str = f"{action.reasoning} \n {action.final_answer}"
        total_r, components = self.reward_system.compute_reward(
            correctness=c,
            reasoning_quality=q,
            process_supervision=p_sup,
            reflection_score=r_ref,
            action_str=action_str,
            final_answer=action.final_answer,
            history=self.history,
            times_seen_problem=self.times_seen_problem
        )

        self.times_seen_problem += 1

        # Update history. Store the answer under BOTH keys: 'prediction' for
        # observation consumers, and 'final_answer' because that is the key
        # RewardSystem.compute_diversity reads — previously only 'prediction'
        # was stored, so the repeated-answer penalty never fired.
        attempt = {
            "prediction": action.final_answer,
            "final_answer": action.final_answer,
            "correctness": c
        }
        self.history.append(attempt)
        # Keep only last 3 attempts for observation
        obs_history = self.history[-3:]

        is_correct = (c == 1.0)
        done = is_correct or self._state.step_count >= self.max_steps

        if done:
            self.rolling_results.append(1 if is_correct else 0)

        return AutomathreasonerObservation(
            problem_text=self.current_problem,
            difficulty_level=self.difficulty_level,
            history=obs_history,
            reward=total_r,
            done=done,
            metadata={
                "reward_components": components,
                "ground_truth": self.current_solution if done else "HIDDEN",  # Only reveal on done or not at all
                "is_correct": is_correct
            }
        )

    @property
    def state(self) -> State:
        """Current episode state (episode id and step count)."""
        return self._state
env/generator.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from typing import Dict, Any, Tuple
3
+
4
class TaskGenerationEngine:
    """Procedurally generates math tasks (arithmetic, algebra, word problems)
    whose parameters scale with a complexity level derived from the target
    difficulty band."""

    def __init__(self):
        # Reference templates for the three problem families.
        # NOTE(review): these lists are not consulted by the generator
        # methods below, which build their prompts from inline f-strings;
        # kept as-is for reference/compatibility.
        self.arithmetic_templates = [
            "What is {a} + {b}?",
            "Calculate {a} - {b}.",
            "Find the product of {a} and {b}.",
            "What is {a} divided by {b}?"
        ]
        self.algebra_templates = [
            "Solve for x: {a}x + {b} = {c}",
            "If {a}y - {b} = {c}, what is y?"
        ]
        self.word_problem_templates = [
            "John has {a} apples. He buys {b} more. Then he gives away {c}. How many apples does John have now?",
            "A train travels at {a} km/h for {b} hours. How far does it travel?"
        ]

    def _score_difficulty(self, steps: int, complexity: int, operations: int) -> float:
        """Continuous difficulty score: D = steps + number_complexity + operations."""
        return float(steps + complexity + operations)

    def generate_arithmetic(self, complexity: int) -> Tuple[str, float, str]:
        """Build a single-operation arithmetic problem scaled by *complexity*.

        Returns (problem_text, difficulty_score, answer_string).
        """
        lhs = random.randint(1 * complexity, 10 * complexity)
        rhs = random.randint(1 * complexity, 10 * complexity)
        operator = random.choice(['+', '-', '*', '/'])

        if operator == '/':
            # Force an integer result: make the dividend a multiple of the divisor.
            rhs = max(1, rhs)
            lhs = lhs * rhs
            problem = f"What is {lhs} divided by {rhs}?"
            answer = str(lhs // rhs)
        elif operator == '+':
            problem = f"What is {lhs} + {rhs}?"
            answer = str(lhs + rhs)
        elif operator == '-':
            problem = f"Calculate {lhs} - {rhs}."
            answer = str(lhs - rhs)
        else:  # '*'
            problem = f"Find the product of {lhs} and {rhs}."
            answer = str(lhs * rhs)

        # One operation, one step.
        return problem, self._score_difficulty(1, complexity, 1), answer

    def generate_algebra(self, complexity: int) -> Tuple[str, float, str]:
        """Build a one-variable linear equation with a known integer root.

        Returns (problem_text, difficulty_score, answer_string).
        """
        coeff = random.randint(1, 5 * complexity)
        root = random.randint(1, 10)
        offset = random.randint(1, 10 * complexity)
        sign = random.choice(['+', '-'])

        # Derive the right-hand side from the chosen root so the answer is exact.
        if sign == '+':
            rhs = coeff * root + offset
            problem = f"Solve for x: {coeff}x + {offset} = {rhs}"
        else:
            rhs = coeff * root - offset
            problem = f"If {coeff}x - {offset} = {rhs}, what is x?"

        # Two operations, two steps.
        return problem, self._score_difficulty(2, complexity, 2), str(root)

    def generate_word_problem(self, complexity: int) -> Tuple[str, float, str]:
        """Build one of two word-problem variants (apples or train distance).

        Returns (problem_text, difficulty_score, answer_string).
        """
        variant = random.choice([0, 1])

        if variant == 0:
            start = random.randint(5 * complexity, 15 * complexity)
            bought = random.randint(2 * complexity, 10 * complexity)
            given = random.randint(1, start + bought)
            problem = (
                f"John has {start} apples. He buys {bought} more. "
                f"Then he gives away {given}. How many apples does John have now?"
            )
            answer = str(start + bought - given)
            difficulty = self._score_difficulty(2, complexity, 2)
        else:
            speed = random.randint(20 * complexity, 60 * complexity)
            hours = random.randint(1, 5 * complexity)
            problem = f"A train travels at {speed} km/h for {hours} hours. How far does it travel?"
            answer = str(speed * hours)
            # Distance = speed * time is a single-step, single-operation task.
            difficulty = self._score_difficulty(1, complexity, 1)

        return problem, difficulty, answer

    def generate_task(self, target_difficulty_band: float) -> Dict[str, Any]:
        """
        Generate a task targeting a general difficulty band.
        target_difficulty_band guides the complexity parameter; at higher
        complexity, algebra and word problems are weighted more heavily.
        """
        complexity = max(1, int(target_difficulty_band / 2))

        kind = random.choices(
            ['arithmetic', 'algebra', 'word_problem'],
            weights=[1, max(0.5, complexity - 1), max(0.5, complexity - 1)]
        )[0]

        builders = {
            'arithmetic': self.generate_arithmetic,
            'algebra': self.generate_algebra,
        }
        problem, diff, ans = builders.get(kind, self.generate_word_problem)(complexity)

        return {
            "problem": problem,
            "difficulty": diff,
            "solution": ans,
            "type": kind
        }
env/models.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Data models for the AutoMathReasoner Environment.
9
+ """
10
+
11
+ from typing import List, Dict, Any
12
+ from pydantic import Field
13
+ from openenv.core.env_server.types import Action, Observation
14
+
15
class AutomathreasonerAction(Action):
    """Action for the AutoMathReasoner environment - containing reasoning and final answer.

    Both fields are required; the environment's verifier scores ``reasoning``
    heuristically and checks ``final_answer`` against the generated solution.
    """

    # Free-form chain-of-thought text; scored by VerifierSystem heuristics.
    reasoning: str = Field(..., description="The step-by-step mathematical reasoning.")
    # Compared (exact / numeric-tolerance / eval) against the ground-truth string.
    final_answer: str = Field(..., description="The final numerical or algebraic answer.")
20
+
21
+
22
class AutomathreasonerObservation(Observation):
    """Observation from the AutoMathReasoner environment."""

    problem_text: str = Field(default="", description="The text of the generated math problem.")
    # This is the curriculum's target difficulty band, not the generator's
    # per-problem continuous difficulty score.
    difficulty_level: float = Field(default=1.0, description="The current difficulty level of the problem.")
    # Each entry is a dict describing one prior attempt on this problem.
    history: List[Dict[str, Any]] = Field(default_factory=list, description="History of the last 3 attempts for this problem.")

    # Required by OpenEnv base class
    reward: float = Field(default=0.0, description="Reward received from the previous action.")
    done: bool = Field(default=False, description="Whether the episode has ended.")
env/rewards.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import math
3
+ from typing import Dict, Any, List, Tuple
4
+
5
class RewardSystem:
    """Composite reward computation for math-reasoning attempts.

    Blends correctness with shaping terms: reasoning quality, answer
    diversity, verbosity efficiency, process supervision, reflection and
    exploration, plus a small Gaussian noise term.
    """

    def __init__(self, max_len: int = 1000):
        # Length budget for actions. NOTE(review): currently unused by the
        # scoring terms (efficiency uses a fixed optimal token count); kept
        # because callers construct RewardSystem(max_len=...).
        self.max_len = max_len

    def compute_diversity(self, current_answer: str, history: List[Dict[str, Any]]) -> float:
        """
        D = diversity (difference from past attempts).
        Returns a steep exponential penalty -exp(1.0) (~ -2.718) if the
        answer repeats any past attempt; otherwise returns 1.0.
        """
        if not history:
            return 1.0

        cur_ans_clean = current_answer.strip().lower()

        for attempt in history:
            # Accept both history schemas: the environment records attempts
            # under 'prediction' while this module originally only read
            # 'final_answer' — checking both fixes the repeated-answer
            # penalty never firing.
            prev_ans = attempt.get('final_answer', attempt.get('prediction', '')).strip().lower()
            if prev_ans == cur_ans_clean:
                return -math.exp(1.0)  # steep penalty, approx -2.718

        # If unique, give full diversity bonus
        return 1.0

    def compute_efficiency(self, action_string: str) -> float:
        """
        E = efficiency. Gaussian penalty curve:
        E = exp(-(len_ratio)^2) - 1
        0.0 at or below the optimal length, smoothly decaying toward -1.0
        for overly verbose answers.
        """
        approx_tokens = len(action_string) / 4.0  # rough chars-per-token heuristic
        optimal_tokens = 50.0  # Assumed ideal length

        # Only length beyond the optimum is penalized (ratio clamped at 0).
        ratio = max(0.0, (approx_tokens - optimal_tokens) / optimal_tokens)

        # Smooth gaussian-like decay towards -1.0
        return math.exp(-(ratio ** 2)) - 1.0

    def compute_exploration_bonus(self, action_string: str, times_seen: int) -> float:
        """
        [PAPER TRACEABILITY: Exploration via Entropy Bonus]
        X = entropy_bonus / sqrt(1 + times_seen_problem)
        Entropy is estimated from the unique-character ratio of the output;
        the bonus decays with repeated attempts on the same problem.
        """
        length = len(action_string)
        if length > 0:
            unique_ratio = len(set(action_string)) / length
            entropy_bonus = math.log1p(unique_ratio)  # Non-linear scaling
        else:
            entropy_bonus = 0.0

        return entropy_bonus / math.sqrt(1.0 + times_seen)

    def detect_trivial_output(self, action_string: str) -> bool:
        """Anti-reward hacking: detect trivial constant outputs.

        True for outputs shorter than 2 chars, or longer than 10 chars
        drawn from fewer than 3 distinct characters.
        """
        if len(action_string) < 2:
            return True
        if len(set(action_string)) < 3 and len(action_string) > 10:
            return True
        return False

    def compute_reward(self,
                       correctness: float,
                       reasoning_quality: float,
                       process_supervision: float,
                       reflection_score: float,
                       action_str: str,
                       final_answer: str,
                       history: List[Dict[str, Any]],
                       times_seen_problem: int) -> Tuple[float, Dict[str, float]]:
        """
        [PAPER TRACEABILITY: DeepSeekMath-inspired reward composite]
        R = 0.35*C + 0.15*tanh(Q) + 0.1*P + 0.1*R_reflect + 0.15*D + 0.05*E + 0.1*X + noise
        (docstring now matches the implemented weights). Trivial outputs
        short-circuit to a flat -1.0. Returns (total_reward, components).
        """
        if self.detect_trivial_output(action_str):
            # Anti-hacking: strongly penalized. Use the SAME component keys
            # as the normal path so consumers see a consistent schema
            # (previously this branch used short keys like "C"/"Q").
            components = {
                "total_reward": -1.0,
                "C_correctness": 0.0,
                "Q_reasoning": 0.0,
                "P_process_supervision": 0.0,
                "R_reflection": 0.0,
                "D_diversity": 0.0,
                "E_efficiency": -1.0,
                "X_exploration": 0.0,
                "noise": 0.0,
            }
            return -1.0, components

        c = correctness
        q = reasoning_quality
        d = self.compute_diversity(final_answer, history)

        # If repeated answer, C is zeroed to prevent hacking via resubmission.
        if d < 0:
            c = 0.0

        e = self.compute_efficiency(action_str)
        x = self.compute_exploration_bonus(action_str, times_seen_problem)

        noise = random.gauss(0, 0.05)

        # Smoothly squish reasoning quality using tanh to bound its impact
        q_smooth = math.tanh(q)

        # Composite reward equation (weights mirror the docstring above).
        total_r = (0.35 * c) + (0.15 * q_smooth) + (0.1 * process_supervision) + (0.1 * reflection_score) + (0.15 * d) + (0.05 * e) + (0.1 * x) + noise

        components = {
            "total_reward": total_r,
            "C_correctness": c,
            "Q_reasoning": q_smooth,
            "P_process_supervision": process_supervision,
            "R_reflection": reflection_score,
            "D_diversity": d,
            "E_efficiency": e,
            "X_exploration": x,
            "noise": noise
        }

        return total_r, components
env/verifier.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import math
3
+ from typing import Dict, Any, Tuple
4
+
5
class VerifierSystem:
    """Multi-strategy answer verifier plus heuristic reasoning scorers."""

    def __init__(self):
        pass

    def check_exact_match(self, prediction: str, ground_truth: str) -> bool:
        """1. Exact match verifier (whitespace- and case-insensitive)."""
        left = prediction.strip().lower()
        right = ground_truth.strip().lower()
        return left == right

    def check_numeric_tolerance(self, prediction: str, ground_truth: str, tol: float = 1e-4) -> bool:
        """2. Numeric tolerance checker."""
        try:
            return math.isclose(
                float(prediction.strip()),
                float(ground_truth.strip()),
                rel_tol=tol,
                abs_tol=tol,
            )
        except ValueError:
            # At least one side is not parseable as a number.
            return False

    def check_python_execution(self, prediction: str, ground_truth: str) -> bool:
        """3. Python execution (eval of expressions like "2+3").

        SECURITY NOTE(review): eval() on agent-supplied text is not truly
        sandboxed even with __builtins__ stripped; acceptable here only
        because predictions come from a co-located agent — do not expose
        this to untrusted external input.
        """
        sandbox = {"__builtins__": None, "math": math}
        try:
            # Does evaluating the prediction reproduce the ground truth?
            evaluated = eval(prediction.strip(), sandbox, {})
            try:
                target = float(ground_truth.strip())
                return math.isclose(float(evaluated), target, rel_tol=1e-4, abs_tol=1e-4)
            except ValueError:
                # Non-numeric ground truth: fall back to string comparison.
                return str(evaluated).strip().lower() == ground_truth.strip().lower()
        except Exception:
            return False

    def mock_llm_judge(self, reasoning: str, prediction: str, ground_truth: str) -> float:
        """4. LLM judge (mock/placeholder) scoring reasoning quality.

        Returns a quality score Q in [0.0, 1.0]: word count contributes up
        to 0.4, presence of step-like marker substrings up to 0.6.
        """
        step_markers = ['step', 'first', 'then', 'because', 'therefore', 'equals', '=', '+', '-']

        word_count = len(reasoning.split())
        length_bonus = min(0.4, word_count * 0.01)

        text = reasoning.lower()
        marker_count = sum(marker in text for marker in step_markers)
        structure_bonus = min(0.6, marker_count * 0.1)

        return round(min(1.0, length_bonus + structure_bonus), 2)

    def check_process_supervision(self, reasoning: str) -> float:
        """
        [PAPER TRACEABILITY: Process Supervision (Lightweight PRM)]
        E. PROCESS SUPERVISION (STEP-AWARE REWARD)
        Rewards structured step-by-step reasoning and penalizes apparent
        logical jumps. Result is clamped to [-1.0, 1.0].
        """
        text = reasoning.lower()
        score = 0.0

        # Explicit numbered steps score highest; loose first/then structure
        # scores less.
        if "step 1" in text and "step 2" in text:
            score += 0.5
        elif "first" in text and ("then" in text or "next" in text):
            score += 0.3

        # Very short reasoning that still claims operations looks like a
        # logical jump.
        if len(text.split()) < 10 and ("=" in text or "so" in text):
            score -= 0.5

        return max(-1.0, min(1.0, score))

    def check_reflection(self, reasoning: str, c: float) -> float:
        """
        [PAPER TRACEABILITY: Reflection Module]
        H. REFLECTION MODULE
        Rewards self-verification phrases when the answer is correct (+1.0)
        and penalizes them when it is wrong (-0.5); 0.0 if no reflection.
        """
        reflection_phrases = ["what could be wrong", "wait,", "let me check", "alternatively"]
        text = reasoning.lower()

        if not any(phrase in text for phrase in reflection_phrases):
            return 0.0
        # Reflection attempted: reward success, penalize contradiction.
        return 1.0 if c >= 1.0 else -0.5

    def verify(self, reasoning: str, prediction: str, ground_truth: str) -> Tuple[float, float, float, float]:
        """
        Run all verifiers.
        Returns Correctness (C), Reasoning Quality (Q), Process Supervision (P), and Reflection (R).
        """
        # Short-circuits in the same order as the original if/elif cascade.
        answer_checks = (
            self.check_exact_match,
            self.check_numeric_tolerance,
            self.check_python_execution,
        )
        c = 1.0 if any(check(prediction, ground_truth) for check in answer_checks) else 0.0

        q = self.mock_llm_judge(reasoning, prediction, ground_truth)
        p = self.check_process_supervision(reasoning)
        r = self.check_reflection(reasoning, c)

        return c, q, p, r