# -*- coding: utf-8 -*-
"""
Created on Wed Mar  1 00:43:49 2023

@author: leona
"""

import numpy as np
import torch
from sklearn.cluster import KMeans
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F 
from torch.distributions import MultivariateNormal
from torch.distributions import Categorical

################################## set device ##################################
# Target device for the `.to(device)` calls in this module. It is pinned to the
# CPU; the commented-out lines below are the CUDA selection logic, kept for
# reference.
device = torch.device('cpu')
# if(torch.cuda.is_available()):
#     device = torch.device('cuda:0')
#     torch.cuda.empty_cache()
#     print("Device set to : " + str(torch.cuda.get_device_name(device)))
# else:
#     print("Device set to : cpu")


class NegReLU(nn.Module):
    """Activation module that returns the negated ReLU of its input."""

    def forward(self, x):
        """Return ``-relu(x)`` element-wise (zero for x <= 0, ``-x`` otherwise)."""
        rectified = torch.relu(x)
        return torch.neg(rectified)

import torch
import torch.nn as nn

class NoisyLinear(nn.Module):
    """Linear layer whose weight and bias are perturbed by Gaussian noise.

    The effective parameters used in ``forward`` are
    ``mu + sigma * clamp(epsilon, -2, 2) * noise_scale`` where ``mu`` and
    ``sigma`` are learnable and ``epsilon`` is resampled from a standard
    normal distribution on every forward call.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        std_init: constant every ``sigma`` entry is initialised to.
    """

    def __init__(self, in_features, out_features, std_init=0.4):
        super(NoisyLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.std_init = std_init

        # Learnable mean / scale of the weight and bias. They are allocated
        # uninitialised and filled by reset_parameters() right below; no
        # arithmetic is performed on the uninitialised storage.
        self.weight_mu = nn.Parameter(torch.empty(out_features, in_features))
        self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features))
        self.bias_mu = nn.Parameter(torch.empty(out_features))
        self.bias_sigma = nn.Parameter(torch.empty(out_features))

        # Initialize parameters
        self.reset_parameters()

        # Noise buffers (no gradients). Zero-filled so that state_dict() never
        # exposes uninitialised memory before the first forward pass.
        self.register_buffer('weight_epsilon', torch.zeros(out_features, in_features))
        self.register_buffer('bias_epsilon', torch.zeros(out_features))

    def reset_parameters(self):
        """(Re)initialise ``mu`` like ``nn.Linear`` and set ``sigma`` to ``std_init``."""
        # Same scheme nn.Linear uses: Kaiming-uniform with a = sqrt(5) for the
        # weight and U(-1/sqrt(fan_in), 1/sqrt(fan_in)) for the bias.
        nn.init.kaiming_uniform_(self.weight_mu, a=np.sqrt(5))
        # For a (out_features, in_features) weight the fan-in is in_features.
        fan_in = self.in_features
        bound = 1 / np.sqrt(fan_in) if fan_in > 0 else 0
        nn.init.uniform_(self.bias_mu, -bound, bound)

        # Initialize sigma parameters to the constant std_init
        nn.init.constant_(self.weight_sigma, self.std_init)
        nn.init.constant_(self.bias_sigma, self.std_init)

    def forward(self, input, noise_scale=1.0):
        """Apply the noisy affine map to ``input``.

        Args:
            input: tensor whose last dimension is ``in_features``.
            noise_scale: multiplier applied to the clamped noise; ``0`` yields
                the plain ``mu`` parameters.

        Returns:
            ``F.linear(input, weight, bias)`` with the perturbed parameters.
        """
        # Fresh noise on every call; the buffers carry no gradient.
        self.weight_epsilon.normal_()
        self.bias_epsilon.normal_()

        # Clamp out-of-place so the raw samples stay in the buffers and extreme
        # draws cannot blow up the effective weights.
        weight_noise = self.weight_epsilon.clamp(-2, 2) * noise_scale
        bias_noise = self.bias_epsilon.clamp(-2, 2) * noise_scale

        # Effective parameters: mean plus scaled noise.
        weight = self.weight_mu + self.weight_sigma * weight_noise
        bias = self.bias_mu + self.bias_sigma * bias_noise

        return F.linear(input, weight, bias)


################################## PDPPO Policy ##################################
class RolloutBuffer:
    """Parallel per-step lists holding the transitions collected by the agent."""

    # Names of the per-step lists, in the order they are created.
    _FIELDS = (
        'actions',
        'states',
        'post_states',
        'logprobs',
        'rewards',
        'post_rewards',
        'state_values',
        'state_values_post',
        'is_terminals',
    )

    def __init__(self):
        """Create every field as an empty list."""
        for field in self._FIELDS:
            setattr(self, field, [])

    def clear(self,lag):
        """Drop the first ``lag`` entries of every field (each list is rebound to ``lst[lag:]``)."""
        for field in self._FIELDS:
            remaining = getattr(self, field)[lag:]
            setattr(self, field, remaining)

class ActorCritic(nn.Module):
    """Actor network together with two value networks.

    ``critic`` scores states and ``critic_post`` scores post-decision states
    (see ``evaluate``).

    * Continuous mode: ``actor`` is an MLP producing the mean of a Gaussian
      with fixed diagonal variance ``action_var``.
    * Discrete mode: ``action_dim`` must expose ``nvec`` (one entry per
      sub-action giving its number of choices). ``fc1``/``fc2`` form the trunk
      and ``actor`` emits ``sum(nvec)`` logits, i.e. one consecutive group of
      logits per sub-action.

    Args:
        state_dim: size of the state vector.
        action_dim: int (continuous) or an object with ``nvec`` (discrete).
        has_continuous_action_space: selects the mode described above.
        action_std_init: initial action standard deviation (continuous only).
        noise_decay_rate: factor applied to ``noise_scale`` by ``decay_noise``.
    """

    def __init__(self, state_dim, action_dim, has_continuous_action_space, action_std_init, noise_decay_rate=0.9976):
        super(ActorCritic, self).__init__()
        self.noise_scale = 1.0  # Start with full noise
        self.noise_decay_rate = noise_decay_rate
        self.has_continuous_action_space = has_continuous_action_space
        self.action_dim = action_dim

        # actor
        if has_continuous_action_space:
            self.action_var = torch.full((action_dim,), action_std_init * action_std_init).to(device)
            self.actor = nn.Sequential(
                            nn.Linear(state_dim, 128),
                            nn.Tanh(),
                            nn.Linear(128, 128),
                            nn.Tanh(),
                            nn.Linear(128, action_dim),
                            nn.Tanh()
                        )
        else:
            self.fc1 = nn.Linear(state_dim, 128)
            self.fc2 = nn.Linear(128, 128)
            # One group of logits per sub-action; plain int so nn.Linear never
            # receives a numpy scalar as a size.
            self.actor = nn.Linear(128, int(sum(self._branch_sizes())))

        # critic
        self.critic = nn.Sequential(
                        nn.Linear(state_dim, 128),
                        nn.Tanh(),
                        nn.Linear(128, 128),
                        nn.Tanh(),
                        nn.Linear(128, 1),
                    )

        self.critic_post = nn.Sequential(
                        nn.Linear(state_dim, 128),
                        nn.Tanh(),
                        nn.Linear(128, 128),
                        nn.Tanh(),
                        nn.Linear(128, 1),
                    )

    def decay_noise(self):
        """Multiply ``noise_scale`` by ``noise_decay_rate``."""
        self.noise_scale *= self.noise_decay_rate

    def _initialize_actor(self, m):
        """``Module.apply`` hook: Kaiming-uniform weights and zero bias for ``nn.Linear``."""
        if isinstance(m, nn.Linear):
            init.kaiming_uniform_(m.weight, nonlinearity='relu')
            if m.bias is not None:
                init.zeros_(m.bias)

    def _initialize_critic(self, m):
        """``Module.apply`` hook: Xavier-uniform weights and zero bias for ``nn.Linear``."""
        if isinstance(m, nn.Linear):
            init.xavier_uniform_(m.weight)
            if m.bias is not None:
                init.zeros_(m.bias)

    def forward(self, state):
        """Not supported; use ``act`` or ``evaluate``."""
        raise NotImplementedError

    def set_action_std(self, new_action_std):
        """Reset the fixed action variance to ``new_action_std ** 2`` (continuous only)."""
        if self.has_continuous_action_space:
            self.action_var = torch.full((self.action_dim,), new_action_std * new_action_std).to(device)
        else:
            print("--------------------------------------------------------------------------------------------")
            print("WARNING : Calling ActorCritic::set_action_std() on discrete action space policy")
            print("--------------------------------------------------------------------------------------------")

    def _branch_sizes(self):
        """Return the number of choices of every discrete sub-action as plain ints."""
        return [int(n) for n in self.action_dim.nvec]

    def _branch_logits(self, state):
        """Run the discrete actor and return logits shaped ``(..., n_branches, max_choices)``.

        The head emits ``sum(nvec)`` logits. They are split into one chunk per
        sub-action and every chunk is right-padded with ``-inf`` up to the
        widest one, so the padded slots get exactly zero probability after the
        softmax. When all sub-actions have the same number of choices no
        padding happens and the result equals a plain reshape of the logits.
        """
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        logits = self.actor(hidden)

        sizes = self._branch_sizes()
        widest = max(sizes)
        padded = [
            F.pad(chunk, (0, widest - chunk.shape[-1]), value=float('-inf'))
            for chunk in torch.split(logits, sizes, dim=-1)
        ]
        return torch.stack(padded, dim=-2)

    def act(self, state):
        """Sample an action for a single state.

        Returns:
            ``(action, log_prob)``, both detached from the graph. In discrete
            mode both have one entry per sub-action.
        """
        if self.has_continuous_action_space:
            action_mean = self.actor(state)
            cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
            dist = MultivariateNormal(action_mean, cov_mat)
        else:
            sizes = self._branch_sizes()
            logits_shaped = self._branch_logits(state).reshape(len(sizes), max(sizes))
            action_probs = nn.functional.softmax(logits_shaped, dim=-1)
            dist = Categorical(action_probs)

        action = dist.sample()
        action_logprob = dist.log_prob(action)

        return action.detach(), action_logprob.detach()

    def evaluate(self, state, post_state, action):
        """Score a batch of stored transitions under the current parameters.

        Args:
            state: batch of states fed to the actor and to ``critic``.
            post_state: batch of post-decision states fed to ``critic_post``.
            action: batch of actions whose log-probability is wanted.

        Returns:
            ``(action_logprobs, state_values, state_values_post, dist_entropy)``.
        """
        if self.has_continuous_action_space:
            action_mean = self.actor(state)

            action_var = self.action_var.expand_as(action_mean)
            cov_mat = torch.diag_embed(action_var).to(device)
            dist = MultivariateNormal(action_mean, cov_mat)

            # For Single Action Environments.
            if self.action_dim == 1:
                action = action.reshape(-1, self.action_dim)
        else:
            sizes = self._branch_sizes()
            logits_shaped = self._branch_logits(state).reshape(-1, len(sizes), max(sizes))
            action_probs = nn.functional.softmax(logits_shaped, dim=-1)
            dist = Categorical(action_probs)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_values = self.critic(state)
        state_values_post = self.critic_post(post_state)

        return action_logprobs, state_values, state_values_post, dist_entropy


class PDPPO:
    """PPO-style agent with a second critic evaluated on post-decision states.

    The agent keeps a trainable ``policy`` and a frozen copy ``policy_old``
    used for acting. Transitions are accumulated in ``self.buffer``; the
    caller is expected to append ``rewards`` and ``is_terminals`` to the
    buffer and to trim it (``update`` does not clear it).

    Args:
        state_dim, action_dim, has_continuous_action_space, action_std_init:
            forwarded to ``ActorCritic``.
        lr_actor: learning rate of the actor optimizer.
        lr_critic: learning rate of both critic optimizers.
        gamma: discount factor.
        K_epochs: number of optimisation passes per ``update``.
        eps_clip: clipping range of the probability ratio.
        env: environment object; ``get_post_state`` / ``select_action`` read
            ``n_machines``, ``n_items``, ``machine_setup``, ``setup_costs``,
            ``setup_loss`` and ``machine_production_matrix`` from it.
        tau: stored on the instance; not used by the methods of this class.
    """

    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, env, has_continuous_action_space, tau, action_std_init=0.6):

        self.has_continuous_action_space = has_continuous_action_space

        if has_continuous_action_space:
            self.action_std = action_std_init

        self.tau = tau
        self.env = env
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
        self.policy.actor.apply(self.policy._initialize_actor)
        self.policy.critic.apply(self.policy._initialize_critic)
        self.policy.critic_post.apply(self.policy._initialize_critic)

        # The discrete actor is fc1 -> fc2 -> actor; all three must be handed to
        # the optimizer, otherwise the trunk stays at its random initialisation.
        self.optimizer_actor = torch.optim.Adam(self._actor_parameters(self.policy), lr=lr_actor)
        self.optimizer_critic = torch.optim.Adam(self.policy.critic.parameters(), lr=lr_critic)
        self.optimizer_critic_post = torch.optim.Adam(self.policy.critic_post.parameters(), lr=lr_critic)

        self.policy_old = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    @staticmethod
    def _actor_parameters(policy):
        """Return every parameter the actor's output depends on.

        Includes the ``fc1`` / ``fc2`` trunk layers when ``policy`` has them
        (discrete mode) in addition to ``policy.actor``.
        """
        params = []
        for trunk_name in ('fc1', 'fc2'):
            layer = getattr(policy, trunk_name, None)
            if layer is not None:
                params.extend(layer.parameters())
        params.extend(policy.actor.parameters())
        return params

    @staticmethod
    def _discounted_returns(rewards, is_terminals, gamma):
        """Return the discounted reward-to-go for every step, oldest first.

        The running sum is reset to zero at every step flagged terminal.
        ``rewards`` and ``is_terminals`` are paired from their ends, so if the
        lengths differ only the most recent common steps are used.
        """
        returns = []
        running = 0
        for reward, is_terminal in zip(reversed(rewards), reversed(is_terminals)):
            if is_terminal:
                running = 0
            running = reward + (gamma * running)
            returns.append(running)
        # Built newest-first in O(n); flip to chronological order.
        returns.reverse()
        return returns

    def set_action_std(self, new_action_std):
        """Set the action standard deviation on both policies (continuous only)."""
        if self.has_continuous_action_space:
            self.action_std = new_action_std
            self.policy.set_action_std(new_action_std)
            self.policy_old.set_action_std(new_action_std)
        else:
            print("--------------------------------------------------------------------------------------------")
            print("WARNING : Calling PDPPO::set_action_std() on discrete action space policy")
            print("--------------------------------------------------------------------------------------------")

    def decay_action_std(self, action_std_decay_rate, min_action_std):
        """Subtract ``action_std_decay_rate`` from the action std, floored at ``min_action_std``."""
        print("--------------------------------------------------------------------------------------------")
        if self.has_continuous_action_space:
            self.action_std = self.action_std - action_std_decay_rate
            self.action_std = round(self.action_std, 4)
            if (self.action_std <= min_action_std):
                self.action_std = min_action_std
                print("setting actor output action_std to min_action_std : ", self.action_std)
            else:
                print("setting actor output action_std to : ", self.action_std)
            self.set_action_std(self.action_std)

        else:
            print("WARNING : Calling PDPPO::decay_action_std() on discrete action space policy")
        print("--------------------------------------------------------------------------------------------")

    def get_post_state(self, action, machine_setup, inventory_level):
        """Apply ``action`` to the setups and inventory, before any demand.

        ``action[m]`` is ``0`` for an idle machine, otherwise the 1-based item
        index it produces. ``machine_setup`` and ``inventory_level`` are
        modified in place, so callers pass copies.

        Returns:
            ``(machine_setup, inventory_level, setup_costs)`` where
            ``setup_costs[m]`` is non-zero only if machine ``m`` changed setup.
        """
        n_machines = self.env.n_machines
        setup_loss = np.zeros(n_machines, dtype=int)
        setup_costs = np.zeros(n_machines)
        for m in range(n_machines):
            # Plain int so the same value can index lists, arrays and tensors.
            item = int(action[m])
            if item == 0:
                # idle machine: no setup kept, no production
                machine_setup[m] = 0
                continue
            # 1. IF NEEDED CHANGE SETUP: cost and lost production come from the
            #    env tables at the row of the machine and column of the new item.
            if machine_setup[m] != item:
                setup_costs[m] = self.env.setup_costs[m][item - 1]
                setup_loss[m] = self.env.setup_loss[m][item - 1]
            machine_setup[m] = item
            # 2. PRODUCTION
            production = self.env.machine_production_matrix[m][item - 1] - setup_loss[m]
            inventory_level[item - 1] += production
        return machine_setup, inventory_level, setup_costs

    def select_action(self, state,tau):
        """Sample an action with ``policy_old`` and record the step in the buffer.

        Args:
            state: current observation (anything ``torch.FloatTensor`` accepts).
            tau: accepted for interface compatibility; not used.

        Returns:
            Continuous mode: the flattened action as a numpy array.
            Discrete mode: ``(action, post_reward)`` as numpy arrays, where
            ``post_reward`` is minus the total setup cost of the action.

        Note:
            ``post_states``, ``post_rewards`` and ``state_values_post`` are only
            recorded in discrete mode, and ``update`` requires them.
        """
        with torch.no_grad():
            state = torch.FloatTensor(state).to(device)
            # ActorCritic.act takes only the state and returns (action, log_prob).
            action, action_logprob = self.policy_old.act(state)

        if self.has_continuous_action_space:
            with torch.no_grad():
                state_val = self.policy_old.critic(state)

            self.buffer.states.append(state)
            self.buffer.actions.append(action)
            self.buffer.logprobs.append(action_logprob)
            self.buffer.state_values.append(state_val)

            return action.detach().cpu().numpy().flatten()

        # Post-decision state: the first n_items entries of the state (the
        # inventory) are replaced by the inventory after production.
        n_items = self.env.n_items
        machine_setup, inventory_level, setup_cost = self.get_post_state(
            action=action.clone(),
            machine_setup=self.env.machine_setup.copy(),
            inventory_level=state[0:n_items].clone(),
        )
        post_state = state.clone()
        post_state[0:n_items] = inventory_level.clone()

        self.buffer.states.append(state)
        self.buffer.post_states.append(post_state)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(action_logprob)
        post_rewards = torch.FloatTensor([-sum(setup_cost)])
        self.buffer.post_rewards.append(post_rewards)

        with torch.no_grad():
            state_val = self.policy_old.critic(state)
            state_val_post = self.policy_old.critic_post(post_state)

        self.buffer.state_values.append(state_val)
        self.buffer.state_values_post.append(state_val_post)

        return action.cpu().numpy(), post_rewards.numpy()

    def update(self):
        """Run ``K_epochs`` optimisation passes over the buffered transitions.

        Uses the discounted returns of ``buffer.rewards`` and
        ``buffer.post_rewards`` as critic targets, the element-wise maximum of
        the two resulting advantages in the clipped surrogate, and finally
        copies the trained weights into ``policy_old``. The buffer is left
        untouched.
        """
        rewards = torch.tensor(
            self._discounted_returns(self.buffer.rewards, self.buffer.is_terminals, self.gamma),
            dtype=torch.float32,
        ).to(device)
        post_rewards = torch.tensor(
            self._discounted_returns(self.buffer.post_rewards, self.buffer.is_terminals, self.gamma),
            dtype=torch.float32,
        ).to(device)

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
        old_post_states = torch.squeeze(torch.stack(self.buffer.post_states, dim=0)).detach().to(device)
        old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
        old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)
        old_state_values = torch.squeeze(torch.stack(self.buffer.state_values, dim=0)).detach().to(device)
        old_state_values_post = torch.squeeze(torch.stack(self.buffer.state_values_post, dim=0)).detach().to(device)

        # Advantages w.r.t. the state critic and the post-state critic; the
        # larger of the two drives the policy update.
        advantages_current = rewards - old_state_values
        advantages_post = post_rewards - old_state_values_post
        advantages = torch.max(advantages_current, advantages_post).unsqueeze(1)

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluating old actions and values
            logprobs, state_values, post_state_values, dist_entropy = self.policy.evaluate(old_states, old_post_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs)

            # Finding Surrogate Loss
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            critic_loss = self.MseLoss(state_values.squeeze(), rewards)
            critic_loss_post = self.MseLoss(post_state_values.squeeze(), post_rewards)

            # A detached critic-loss term used to be added here; being a
            # constant it contributed no gradient, so it is omitted.
            actor_loss = (-torch.min(surr1, surr2) - 0.001 * dist_entropy).mean()

            # Update the actor
            self.optimizer_actor.zero_grad()
            actor_loss.backward()
            self.optimizer_actor.step()

            # Update the critic
            self.optimizer_critic.zero_grad()
            critic_loss.backward()
            self.optimizer_critic.step()

            # Update the critic_post
            self.optimizer_critic_post.zero_grad()
            critic_loss_post.backward()
            self.optimizer_critic_post.step()

        self.policy_old.load_state_dict(self.policy.state_dict())

    def save(self, checkpoint_path):
        """Write the ``policy_old`` state dict to ``checkpoint_path``."""
        torch.save(self.policy_old.state_dict(), checkpoint_path)

    def load(self, checkpoint_path):
        """Load a checkpoint written by ``save`` into both policies."""
        # Deserialise once; load_state_dict copies the values into each module.
        state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
        self.policy_old.load_state_dict(state_dict)
        self.policy.load_state_dict(state_dict)