# training/plot_rewards.py
# Run after grpo_train.py to generate reward graphs.
# Usage: python training/plot_rewards.py --input salespath_training_outputs/reward_history.txt
import argparse
import os
import sys

import matplotlib

matplotlib.use("Agg")  # headless safe
import matplotlib.pyplot as plt
import numpy as np


def load_rewards(path: str) -> list[float]:
    """Read reward values from a tab-separated history file.

    Each non-empty line is split on tabs and the LAST field is parsed as a
    float, so lines may carry leading metadata columns (e.g. step index).

    Args:
        path: Path to reward_history.txt.

    Returns:
        List of reward floats in file order.

    Raises:
        ValueError: If a line's last tab-field is not a valid float.
    """
    rewards: list[float] = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines
            parts = line.split("\t")
            rewards.append(float(parts[-1]))
    return rewards


def rolling_mean(data: list[float], window: int = 20) -> list[float]:
    """Trailing rolling mean with a ramp-up at the start.

    For index i the mean is taken over data[max(0, i-window+1) : i+1], so
    early entries average over fewer than `window` points instead of being
    dropped (output has the same length as the input).

    Args:
        data: Input sequence.
        window: Maximum number of trailing samples to average.

    Returns:
        Smoothed values, same length as `data`.
    """
    result: list[float] = []
    for i in range(len(data)):
        start = max(0, i - window + 1)
        result.append(float(np.mean(data[start : i + 1])))
    return result


def plot(rewards: list[float], output_path: str) -> None:
    """Render the reward curve + distribution figure and save it as a PNG.

    Top panel: raw per-episode reward (faint) with a 20-episode rolling
    mean overlay. Bottom panel: histogram of all rewards with the mean
    marked. Summary statistics are printed to stdout.

    Args:
        rewards: Per-episode total rewards (must be non-empty).
        output_path: Destination PNG path.
    """
    steps = list(range(len(rewards)))
    smooth = rolling_mean(rewards, window=20)

    fig, axes = plt.subplots(2, 1, figsize=(12, 8))
    fig.suptitle("SalesPath — Training Reward Curve", fontsize=14, fontweight="bold")

    # Top: raw + smoothed reward
    ax = axes[0]
    ax.plot(steps, rewards, alpha=0.3, color="#5b9bd5", linewidth=0.8, label="Episode reward")
    ax.plot(steps, smooth, color="#e07b3c", linewidth=2.0, label="Rolling mean (20)")
    ax.axhline(0, color="gray", linestyle="--", linewidth=0.8)
    ax.set_ylabel("Total Reward")
    ax.set_xlabel("Episode")
    ax.legend(loc="upper left")
    # NOTE: rewards are assumed to lie in [-1, 1]; values outside this band
    # will be clipped from view (intentional fixed scale for comparability).
    ax.set_ylim(-1.1, 1.1)
    ax.grid(True, alpha=0.3)

    # Bottom: histogram of rewards
    ax2 = axes[1]
    ax2.hist(rewards, bins=30, color="#5b9bd5", edgecolor="white", alpha=0.8)
    ax2.axvline(np.mean(rewards), color="#e07b3c", linewidth=2, label=f"Mean: {np.mean(rewards):.3f}")
    ax2.set_xlabel("Reward")
    ax2.set_ylabel("Count")
    ax2.set_title("Reward Distribution")
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(output_path, dpi=150, bbox_inches="tight")
    plt.close(fig)  # release the figure so repeated calls don't accumulate memory

    print(f"Saved reward graph → {output_path}")
    print(f"  Episodes: {len(rewards)}")
    print(f"  Mean reward: {np.mean(rewards):.4f}")
    print(f"  Max reward: {np.max(rewards):.4f}")
    # Fixed: this print statement's f-string was broken across two lines
    # (unterminated string literal → SyntaxError).
    print(f"  Min reward: {np.min(rewards):.4f}")
    print(f"  Std reward: {np.std(rewards):.4f}")


def main() -> None:
    """CLI entry point: parse args, load reward history, write the plot."""
    parser = argparse.ArgumentParser(description="Plot SalesPath reward history.")
    parser.add_argument(
        "--input",
        default="salespath_training_outputs/reward_history.txt",
        help="Path to reward_history.txt",
    )
    parser.add_argument(
        "--output",
        default="salespath_training_outputs/reward_graph.png",
        help="Output PNG path",
    )
    args = parser.parse_args()

    if not os.path.exists(args.input):
        print(f"ERROR: {args.input} not found. Run grpo_train.py first.")
        sys.exit(1)

    rewards = load_rewards(args.input)
    if not rewards:
        print("ERROR: No rewards found in file.")
        sys.exit(1)

    # Guard the empty-dirname case: a bare filename output (dirname == "")
    # would make os.makedirs raise FileNotFoundError.
    out_dir = os.path.dirname(args.output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    plot(rewards, args.output)


if __name__ == "__main__":
    main()