Pratap-K committed on
Commit
8093eea
·
1 Parent(s): a553bf0

Modify train method

Browse files
Files changed (2) hide show
  1. env/rewards.py +7 -2
  2. train/train_grpo.py +51 -3
env/rewards.py CHANGED
@@ -102,9 +102,14 @@ class RewardSystem:
102
  # Smoothly squish reasoning quality using tanh to bound its impact
103
  q_smooth = math.tanh(q)
104
 
105
- # New Composite Reward Equation
106
- total_r = (0.35 * c) + (0.15 * q_smooth) + (0.1 * process_supervision) + (0.1 * reflection_score) + (0.15 * d) + (0.05 * e) + (0.1 * x) + noise
 
 
107
 
 
 
 
108
  components = {
109
  "total_reward": total_r,
110
  "C_correctness": c,
 
102
  # Smoothly squish reasoning quality using tanh to bound its impact
103
  q_smooth = math.tanh(q)
104
 
105
+ # Normalize variables mapping entirely into the [0, 1] domain
106
+ p_norm = (process_supervision + 1.0) / 2.0 # Scales [-1, 1] to [0, 1]
107
+ r_norm = (reflection_score + 0.5) / 1.5 # Scales [-0.5, 1.0] to [0, 1]
108
+ q_norm = min(1.0, max(0.0, q_smooth))
109
 
110
+ # New Simplified Composite Reward Equation (Strictly bounded [0, 1])
111
+ # Base coefficients sum exactly to 1.0. Noise is removed to satisfy bounds.
112
+ total_r = (0.4 * c) + (0.3 * q_norm) + (0.2 * p_norm) + (0.1 * r_norm)
113
  components = {
114
  "total_reward": total_r,
115
  "C_correctness": c,
train/train_grpo.py CHANGED
@@ -208,11 +208,59 @@ def main():
208
  print("Starting LADDER Training (Curriculum: Recursive Variant Trees)...")
209
  trainer.train()
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  # Showcase TTRL
212
  run_ttrl(model, tokenizer, "If 4(x+2) - 10 = 14, what is x?", env)
213
 
214
  if __name__ == "__main__":
215
  main()
216
-
217
- if __name__ == "__main__":
218
- main()
 
208
  print("Starting LADDER Training (Curriculum: Recursive Variant Trees)...")
209
  trainer.train()
210
 
211
+ # Generate Training Charts
212
+ try:
213
+ import matplotlib.pyplot as plt
214
+ import os
215
+
216
+ os.makedirs("outputs_math/plots", exist_ok=True)
217
+ history = trainer.state.log_history
218
+
219
+ # Plot Loss
220
+ losses = [x["loss"] for x in history if "loss" in x]
221
+ steps = [x["step"] for x in history if "loss" in x]
222
+ if losses:
223
+ plt.figure(figsize=(10, 6))
224
+ plt.plot(steps, losses, marker="o", color="blue", linewidth=2)
225
+ plt.title("GRPO Training Loss Over Steps")
226
+ plt.xlabel("Steps")
227
+ plt.ylabel("Loss")
228
+ plt.grid(True, linestyle='--', alpha=0.7)
229
+ plt.savefig("outputs_math/plots/training_loss.png")
230
+ plt.close()
231
+
232
+ # Plot Rewards
233
+ rewards = [x["reward"] for x in history if "reward" in x]
234
+ r_steps = [x["step"] for x in history if "reward" in x]
235
+ if rewards:
236
+ plt.figure(figsize=(10, 6))
237
+ plt.plot(r_steps, rewards, marker="x", color="green", linewidth=2)
238
+ plt.title("Average Completion Reward Over Steps")
239
+ plt.xlabel("Steps")
240
+ plt.ylabel("Rewards")
241
+ plt.grid(True, linestyle='--', alpha=0.7)
242
+ plt.savefig("outputs_math/plots/reward.png")
243
+ plt.close()
244
+
245
+ # Plot KL Divergence
246
+ kl = [x["kl"] for x in history if "kl" in x]
247
+ kl_steps = [x["step"] for x in history if "kl" in x]
248
+ if kl:
249
+ plt.figure(figsize=(10, 6))
250
+ plt.plot(kl_steps, kl, marker="^", color="red", linewidth=2)
251
+ plt.title("KL Divergence (Policy vs Reference)")
252
+ plt.xlabel("Steps")
253
+ plt.ylabel("KL Divergence")
254
+ plt.grid(True, linestyle='--', alpha=0.7)
255
+ plt.savefig("outputs_math/plots/kl_divergence.png")
256
+ plt.close()
257
+
258
+ print(f"✅ Generated training metric plots in 'outputs_math/plots' directory.")
259
+ except Exception as e:
260
+ print(f"Could not generate plots: {e}")
261
+
262
  # Showcase TTRL
263
  run_ttrl(model, tokenizer, "If 4(x+2) - 10 = 14, what is x?", env)
264
 
265
  if __name__ == "__main__":
266
  main()