leokana committed on Feb 23, 2024

Commit

4b36c77

1 Parent(s): febb285

include modifications to test dual critic ppo

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

code/Lot-sizing/agents/PDPPO.py +0 -2
code/Lot-sizing/agents/PDPPOAgent.py +1 -1
code/Lot-sizing/agents/PDPPOAgent_one_critic.py +14 -14
code/Lot-sizing/agents/PDPPO_v0.py +0 -328
code/Lot-sizing/agents/{PDPPO one critic.py → PDPPOonecritic.py} +1 -1
code/Lot-sizing/agents/PPO.py +1 -3
code/Lot-sizing/agents/PPOAgent.py +1 -3
code/Lot-sizing/agents/PPOAgent_two_critics.py +385 -0
code/Lot-sizing/agents/{PDPPO_one_critic.py → PPOtwocritics.py} +37 -67
code/Lot-sizing/agents/__init__.py +8 -27
code/Lot-sizing/agents/__pycache__/PDPPO.cpython-38.pyc +0 -0
code/Lot-sizing/agents/__pycache__/PDPPOAgent.cpython-38.pyc +0 -0
code/Lot-sizing/agents/__pycache__/PDPPOAgent_one_critic.cpython-38.pyc +0 -0
code/Lot-sizing/agents/__pycache__/PDPPO_one_critic.cpython-38.pyc +0 -0
code/Lot-sizing/agents/__pycache__/PDPPOonecritic.cpython-38.pyc +0 -0
code/Lot-sizing/agents/__pycache__/PPO.cpython-38.pyc +0 -0
code/Lot-sizing/agents/__pycache__/PPOAgent.cpython-38.pyc +0 -0
code/Lot-sizing/agents/__pycache__/PPOAgent_two_critics.cpython-38.pyc +0 -0
code/Lot-sizing/agents/__pycache__/PPOtwocritics.cpython-38.pyc +0 -0
code/Lot-sizing/agents/__pycache__/__init__.cpython-38.pyc +0 -0
code/Lot-sizing/agents/__pycache__/perfectInfoAgent.cpython-38.pyc +0 -0
code/Lot-sizing/agents/__pycache__/stableBaselineAgents.cpython-38.pyc +0 -0
code/Lot-sizing/agents/perfectInfoAgent.py +18 -0
code/Lot-sizing/agents/stableBaselineAgents.py +320 -0
code/Lot-sizing/envs/__pycache__/__init__.cpython-38.pyc +0 -0
code/Lot-sizing/envs/__pycache__/simplePlant.cpython-38.pyc +0 -0
code/Lot-sizing/envs/__pycache__/singleSequenceDependentMachinePlant.cpython-38.pyc +0 -0
code/Lot-sizing/experiments.py +77 -25
code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_0_0.pth +3 -0
code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_log_0.csv +126 -0
code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_log_2.csv +126 -0
code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_log_3.csv +126 -0
code/Lot-sizing/logs/15items_5machines_i100_PPO/PPO_15items_5machines_i100_0_0.pth +3 -0
code/Lot-sizing/logs/15items_5machines_i100_PPO/PPO_15items_5machines_i100_log_0.csv +251 -0
code/Lot-sizing/logs/15items_5machines_i100_PPO/PPO_15items_5machines_i100_log_2.csv +216 -0
code/Lot-sizing/logs/best_A2C_15items_5machines_i100_0/best_model.zip +3 -0
code/Lot-sizing/logs/evaluations.npz +0 -0
code/Lot-sizing/models/__pycache__/__init__.cpython-38.pyc +0 -0
code/Lot-sizing/models/__pycache__/multistageOptimization.cpython-38.pyc +0 -0
code/Lot-sizing/models/__pycache__/optimizationProblemInstance.cpython-38.pyc +0 -0
code/Lot-sizing/models/__pycache__/perfectInfoOptimization.cpython-38.pyc +0 -0
code/Lot-sizing/results/PDPPO_15items_5machines_i100_actions_test.npy +3 -0
code/Lot-sizing/results/PDPPO_15items_5machines_i100_costs_test.npy +3 -0
code/Lot-sizing/results/PDPPO_15items_5machines_i100_demands_test.npy +3 -0
code/Lot-sizing/results/PDPPO_15items_5machines_i100_holding_costs_test.npy +3 -0
code/Lot-sizing/results/PDPPO_15items_5machines_i100_lost_sales_test.npy +3 -0
code/Lot-sizing/results/PDPPO_15items_5machines_i100_observations_test.npy +3 -0
code/Lot-sizing/results/PDPPO_15items_5machines_i100_setup_costs_test.npy +3 -0
code/Lot-sizing/results/PPO_15items_5machines_i100_actions_test.npy +3 -0
code/Lot-sizing/results/PPO_15items_5machines_i100_costs_test.npy +3 -0

code/Lot-sizing/agents/PDPPO.py CHANGED Viewed

@@ -309,8 +309,6 @@ class PDPPO:
             # final loss of clipped objective PDPPO
             loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(torch.min(state_values,state_values_post.squeeze()), rewards) - 0.012 * dist_entropy
-            loss_numpy = loss.detach().cpu().numpy()
             # take gradient step
             self.optimizer.zero_grad()
             loss.mean().backward()

             # final loss of clipped objective PDPPO
             loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(torch.min(state_values,state_values_post.squeeze()), rewards) - 0.012 * dist_entropy
             # take gradient step
             self.optimizer.zero_grad()
             loss.mean().backward()

code/Lot-sizing/agents/PDPPOAgent.py CHANGED Viewed

@@ -14,7 +14,7 @@ BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
 AGENTS_DIR = os.path.join(BASE_DIR,'agents')
 sys.path.append(AGENTS_DIR)
 from agents.PDPPO import PDPPO
-from envs import *
 import copy

 AGENTS_DIR = os.path.join(BASE_DIR,'agents')
 sys.path.append(AGENTS_DIR)
 from agents.PDPPO import PDPPO
+from envs import SimplePlant
 import copy

code/Lot-sizing/agents/PDPPOAgent_one_critic.py CHANGED Viewed

@@ -13,8 +13,8 @@ import matplotlib.patches as mpatches # Provides a way of adding a colored patch
 BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
 AGENTS_DIR = os.path.join(BASE_DIR,'agents')
 sys.path.append(AGENTS_DIR)
-from agents.PDPPO import PDPPO
-from envs import *
 import copy
@@ -101,7 +101,7 @@ class SimplePlantSB(SimplePlant):
         return obs
-class PDPPOAgent():
     def __init__(self, env: SimplePlant, settings: dict):
         self.env = SimplePlantSB(env.settings, env.stoch_model)
         self.last_inventory = env.inventory_level
@@ -142,7 +142,7 @@ class PDPPOAgent():
         ## Note : print/log frequencies should be > than self.max_ep_len
-        ################ PDPPO hyperparameters ################
         self.update_timestep = self.max_ep_len * 4      # update policy every n timesteps
         self.K_epochs = 60               # update policy for K epochs in one PDPPO update
@@ -169,7 +169,7 @@ class PDPPOAgent():
         else:
             self.action_dim = self.env.action_space
-        self.pdppo_agent = PDPPO(self.state_dim, self.action_dim, self.lr_actor, self.lr_critic, self.gamma, self.K_epochs, self.eps_clip, copy.copy(self.env), self.has_continuous_action_space,self.tau, self.action_std)
     ################################### Training ###################################
@@ -187,7 +187,7 @@ class PDPPOAgent():
         if not os.path.exists(log_dir):
               os.makedirs(log_dir)
-        log_dir = log_dir + '/' + self.experiment_name + '_PDPPO/'
         if not os.path.exists(log_dir):
               os.makedirs(log_dir)
@@ -197,7 +197,7 @@ class PDPPOAgent():
         run_num = len(current_num_files)
         #### create new log file for each run
-        log_f_name = log_dir + '/PDPPO_' + self.experiment_name + "_log_" + str(run_num) + ".csv"
         print("current logging run number for " + self.experiment_name + " : ", run_num)
         print("logging at : " + log_f_name)
@@ -215,7 +215,7 @@ class PDPPOAgent():
               os.makedirs(directory)
-        checkpoint_path = directory + "PDPPO_{}_{}_{}.pth".format(self.experiment_name, self.random_seed, self.run_num_pretrained)
         print("save checkpoint path : " + checkpoint_path)
         #####################################################
@@ -241,9 +241,9 @@ class PDPPOAgent():
         else:
             print("Initializing a discrete action space policy")
         print("--------------------------------------------------------------------------------------------")
-        print("PDPPO update frequency : " + str(self.update_timestep) + " timesteps")
-        print("PDPPO K epochs : ", self.K_epochs)
-        print("PDPPO epsilon clip : ", self.eps_clip)
         print("discount factor (self.gamma) : ", self.gamma)
         print("--------------------------------------------------------------------------------------------")
         print("optimizer learning rate actor : ", self.lr_actor)
@@ -259,7 +259,7 @@ class PDPPOAgent():
         ################# training procedure ################
         # initialize a PDPPO agent
-        self.PDPPO_agent = PDPPO(self.state_dim, self.action_dim, self.lr_actor, self.lr_critic, self.gamma, self.K_epochs, self.eps_clip, copy.copy(self.env), self.has_continuous_action_space, self.action_std)
         # track total training time
         start_time = datetime.now().replace(microsecond=0)
@@ -388,7 +388,7 @@ class PDPPOAgent():
     def load_agent(self,path):
         #directory = "PDPPO_preTrained" + '/' + env_name + '/'
         directory = self.LOG_DIR
-        directory = directory + '/' + self.experiment_name + '_PDPPO' + '/'
-        checkpoint_path = directory + "PDPPO_{}_{}_{}.pth".format(self.experiment_name, self.random_seed, self.run_num_pretrained)
         print("loading network from : " + checkpoint_path)
         self.pdppo_agent.load(checkpoint_path)

 BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
 AGENTS_DIR = os.path.join(BASE_DIR,'agents')
 sys.path.append(AGENTS_DIR)
+from agents.PDPPOonecritic import PDPPOonecritic
+from envs import SimplePlant
 import copy
         return obs
+class PDPPOAgent_one_critic():
     def __init__(self, env: SimplePlant, settings: dict):
         self.env = SimplePlantSB(env.settings, env.stoch_model)
         self.last_inventory = env.inventory_level
         ## Note : print/log frequencies should be > than self.max_ep_len
+        ################ PDPPO_one_critic hyperparameters ################
         self.update_timestep = self.max_ep_len * 4      # update policy every n timesteps
         self.K_epochs = 60               # update policy for K epochs in one PDPPO update
         else:
             self.action_dim = self.env.action_space
+        self.pdppo_agent = PDPPO_one_critic(self.state_dim, self.action_dim, self.lr_actor, self.lr_critic, self.gamma, self.K_epochs, self.eps_clip, copy.copy(self.env), self.has_continuous_action_space,self.tau, self.action_std)
     ################################### Training ###################################
         if not os.path.exists(log_dir):
               os.makedirs(log_dir)
+        log_dir = log_dir + '/' + self.experiment_name + '_PDPPO_one_critic/'
         if not os.path.exists(log_dir):
               os.makedirs(log_dir)
         run_num = len(current_num_files)
         #### create new log file for each run
+        log_f_name = log_dir + '/PDPPO_one_critic_' + self.experiment_name + "_log_" + str(run_num) + ".csv"
         print("current logging run number for " + self.experiment_name + " : ", run_num)
         print("logging at : " + log_f_name)
               os.makedirs(directory)
+        checkpoint_path = directory + "PDPPO_one_critic_{}_{}_{}.pth".format(self.experiment_name, self.random_seed, self.run_num_pretrained)
         print("save checkpoint path : " + checkpoint_path)
         #####################################################
         else:
             print("Initializing a discrete action space policy")
         print("--------------------------------------------------------------------------------------------")
+        print("PDPPO_one_critic update frequency : " + str(self.update_timestep) + " timesteps")
+        print("PDPPO_one_critic K epochs : ", self.K_epochs)
+        print("PDPPO_one_critic epsilon clip : ", self.eps_clip)
         print("discount factor (self.gamma) : ", self.gamma)
         print("--------------------------------------------------------------------------------------------")
         print("optimizer learning rate actor : ", self.lr_actor)
         ################# training procedure ################
         # initialize a PDPPO agent
+        self.PDPPO_agent = PDPPO_one_critic(self.state_dim, self.action_dim, self.lr_actor, self.lr_critic, self.gamma, self.K_epochs, self.eps_clip, copy.copy(self.env), self.has_continuous_action_space, self.action_std)
         # track total training time
         start_time = datetime.now().replace(microsecond=0)
     def load_agent(self,path):
         #directory = "PDPPO_preTrained" + '/' + env_name + '/'
         directory = self.LOG_DIR
+        directory = directory + '/' + self.experiment_name + '_PDPPO_one_critic' + '/'
+        checkpoint_path = directory + "PDPPO_one_critic_{}_{}_{}.pth".format(self.experiment_name, self.random_seed, self.run_num_pretrained)
         print("loading network from : " + checkpoint_path)
         self.pdppo_agent.load(checkpoint_path)

code/Lot-sizing/agents/PDPPO_v0.py DELETED Viewed

@@ -1,328 +0,0 @@
-import os
-import copy
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.optim as optim
-import torch.nn.functional as F
-from torch.distributions import Categorical
-from envs import *
-import gym
-class SimplePlantSB(SimplePlant):
-    def __init__(self, settings, stoch_model):
-        super().__init__(settings, stoch_model)
-        try:self.dict_obs = settings['dict_obs']
-        except:self.dict_obs = False
-        self.last_inventory = copy.copy(self.inventory_level)
-        self.action_space = gym.spaces.MultiDiscrete(
-            [self.n_items+1] * self.n_machines
-        )
-        if self.dict_obs:
-            self.observation_space = gym.spaces.Dict({
-                'inventory_level': gym.spaces.Box(low = np.zeros(self.n_items),high = np.ones(self.n_items)*(settings['max_inventory_level'][0]+1)*self.n_items),
-                'machine_setup': gym.spaces.MultiDiscrete([self.n_items+1] * self.n_machines)
-            })
-        else:
-            self.observation_space = gym.spaces.Box(
-                low=np.zeros(self.n_items+self.n_machines),# high for the inventory level
-                high=np.concatenate(
-                    [
-                        np.array(self.max_inventory_level),
-                        np.ones(self.n_machines) * (self.n_items+1), #high for the machine setups
-                    ]),
-                dtype=np.int32
-            )
-    def step(self, action):
-        """
-        Step method: Execute one time step within the environment
-        Parameters
-        ----------
-        action : action given by the agent
-        Returns
-        -------
-        obs : Observation of the state give the method _next_observation
-        reward : Cost given by the _reward method
-        done : returns True or False given by the _done method
-        dict : possible information for control to environment monitoring
-        """
-        self.last_inventory = copy.copy(self.inventory_level)
-        self.total_cost = self._take_action(action, self.machine_setup, self.inventory_level, self.demand)
-        # self.total_cost['setup_costs'] = 0
-        # self.total_cost['holding_costs'] = 0
-        reward = -sum([ele for key, ele in self.total_cost.items()])
-        #reward = -self.total_cost['lost_sales']
-        #reward = np.abs(action)
-        self.current_step += 1
-        done = self.current_step == self.T
-        obs = self._next_observation()
-        return obs, reward, done, self.total_cost
-    def _next_observation(self):
-        """
-        Returns the next demand
-        """
-        obs = SimplePlant._next_observation(self)
-        #obs['last_inventory_level'] = copy.copy(self.last_inventory)
-        if isinstance(obs, dict):
-            if not self.dict_obs:
-                obs = np.concatenate(
-                    (
-                        obs['inventory_level'], # n_items size
-                        obs['machine_setup'], # n_machine size
-                        #obs['last_inventory_level']# n_items size
-                    )
-                )
-        else:
-            if self.dict_obs:
-                raise('Change dict_obst to False')
-        return obs
-# Define the policy network
-class Policy(nn.Module):
-    def __init__(self, input_size, output_shape):
-        super(Policy, self).__init__()
-        self.fc1 = nn.Linear(input_size, 128)
-        self.fc_list = nn.ModuleList([nn.Linear(128, output_shape[0]) for list(output_shape)[1] in range(0,output_shape[1])])
-    def forward(self, x):
-        x = F.relu(self.fc1(x)).requires_grad_()
-        outputs = [F.softmax(fc(x), dim=1)for fc in self.fc_list]
-        return outputs
-# Define the value network for deterministic components
-class Value(nn.Module):
-    def __init__(self,input_size,output_size):
-        super(Value, self).__init__()
-        self.fc1 = nn.Linear(input_size, 128)
-        self.fc2 = nn.Linear(128, output_size)
-    def forward(self, x):
-        x = F.relu(self.fc1(x)).requires_grad_()
-        x = self.fc2(x)
-        return x
-# Define the value network for stochastic components
-class ValueStochastic(nn.Module):
-    def __init__(self,input_size,output_size):
-        super(ValueStochastic, self).__init__()
-        self.fc1 = nn.Linear(input_size, 128)
-        self.fc2 = nn.Linear(128, output_size)
-    def forward(self, x):
-        x = F.relu(self.fc1(x)).requires_grad_()
-        x = F.softmax(self.fc2(x), dim=1)
-        return x
-# Define the PPO agent
-class PDPPO:
-    def __init__(self, env: SimplePlant, settings: dict):
-        self.env = SimplePlantSB(env.settings, env.stoch_model)
-        self.last_inventory = env.inventory_level
-        self.experiment_name = settings['experiment_name']
-        try:self.dict_obs = settings['dict_obs']
-        except:self.dict_obs = False
-        self.POSSIBLE_STATES = self.env.n_items + 1
-        self.env.cost_to_reward = True
-        self.epsilon = 0
-        BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-        # Use the logs file in the root path of the main.
-        self.LOG_DIR = os.path.join(BASE_DIR,'logs')
-        if self.dict_obs == False:
-            input_size = self.env.observation_space.shape[0]
-        output_size_policy = (self.env.n_items+1, self.env.action_space.shape[0]) # we add 1 for the idle state
-        output_size_value = self.env.action_space.shape[0]
-        self.policy = Policy(input_size,output_size_policy)
-        self.value = Value(input_size,output_size_value)
-        self.value_post = ValueStochastic(input_size,output_size_value)
-        self.optimizer_policy = optim.Adam(self.policy.parameters(), lr=1e-3)
-        self.optimizer_value = optim.Adam(self.value.parameters(), lr=1e-3)
-        self.optimizer_value_post = optim.Adam(self.value_post.parameters(), lr=1e-3)
-        self.eps_clip = 0.2
-        self.gamma = 0.99
-        self.lmbda = 0.95
-    def get_post_state(self, action, machine_setup, inventory_level):
-        setup_loss = np.zeros(self.env.n_machines, dtype=int)
-        setup_costs = np.zeros(self.env.n_machines)
-        # if we are just changing the setup, we use the setup cost matrix with the corresponding position given by the actual setup and the new setup
-        for m in range(self.env.n_machines):
-            if action[m] != 0: # if the machine is not iddle
-                # 1. IF NEEDED CHANGE SETUP
-                if machine_setup[m] != action[m] and action[m] != 0:
-                    setup_costs[m] = self.env.setup_costs[m][action[m] - 1]
-                    setup_loss[m] = self.env.setup_loss[m][action[m] - 1]
-                machine_setup[m] = action[m]
-                # 2. PRODUCTION
-                production = self.env.machine_production_matrix[m][action[m] - 1] - setup_loss[m]
-                inventory_level[action[m] - 1] += production
-            else:
-                machine_setup[m] = 0
-        # return the new machine_setup_inventory_level and the setup_cost
-        return machine_setup, inventory_level, setup_costs
-    def get_action(self, state):
-        state = torch.from_numpy(state).float().unsqueeze(0)
-        probs = self.policy(state)
-        probs_concat = torch.stack(probs, dim=1)
-        m = Categorical(probs_concat)
-        action = m.sample()
-        value = self.value(state)
-        machine_setup, inventory_level, setup_cost = self.get_post_state(action.numpy()[0], state[0][self.env.n_items:self.env.n_items+self.env.n_machines].numpy(), state[0][0:self.env.n_items].numpy())
-        value_post = self.value_post(state)
-        return action, m.log_prob(action), probs_concat, value, value_post
-    def update(self, rewards, rewards_pre_state, rewards_post_state, states, post_states, actions, probs, next_states):
-        # Update deterministic value function
-        for epoch in range(10):
-            for i in range(len(actions)):
-                state = torch.from_numpy(states[i]).float().unsqueeze(0)
-                value = self.value(state)
-                next_state = torch.from_numpy(next_states[i]).float().unsqueeze(0)
-                next_value = self.value(next_state)
-                target = rewards_pre_state[i] + self.gamma * next_value
-                advantage = target - value
-                loss = advantage.pow(2).mean()
-                self.optimizer_value.zero_grad()
-                loss.backward()
-                self.optimizer_value.step()
-        # Update stochastic value function
-        for epoch in range(10):
-            for i in range(len(actions)):
-                state = torch.from_numpy(states[i]).float().unsqueeze(0)
-                value = self.value_post(state)
-                post_state = torch.from_numpy(post_states[i]).float().unsqueeze(0)
-                value_post = self.value_post(post_state)
-                target = rewards_post_state[i] + self.gamma * value_post
-                advantage = target - value
-                loss = advantage.pow(2).mean()
-                self.optimizer_value_post.zero_grad()
-                loss.backward()
-                self.optimizer_value_post.step()
-        # Update policy network
-        states = torch.from_numpy(np.vstack(states)).float()
-        actions = torch.cat(actions).unsqueeze(1)
-        old_probs = torch.cat(probs)
-        old_probs = torch.gather(old_probs.clone(),2, actions)
-        policy_epochs = 10
-        for epoch in range(policy_epochs):
-            probs = self.policy(states)
-            probs = torch.stack(probs, dim=1).clone()
-            m = Categorical(probs)
-            action = m.sample()
-            probs = torch.gather(probs, 2, actions)
-            kl_div = (old_probs * (torch.log(old_probs) - torch.log(probs))).sum()
-            for state,post_state, action, old_prob, prob, next_state, reward_pre_state, reward_post_state in zip(states,post_states, actions, old_probs, probs, next_states,rewards_pre_state,rewards_post_state):
-                state = state.unsqueeze(0)
-                next_state = torch.from_numpy(next_state).unsqueeze(0).float()
-                post_state = torch.from_numpy(post_state).unsqueeze(0).float()
-                action = action.unsqueeze(0)
-                old_prob = old_prob.unsqueeze(0)
-                prob = prob.unsqueeze(0)
-                value = self.value(state)
-                value_post = self.value_post(post_state)
-                advantage = reward_pre_state + self.gamma * self.value(next_state) - self.value(state)
-                advantage_post = reward_post_state + self.gamma * self.value_post(post_state) - self.value_post(state)
-                ratio = (prob / old_prob)
-                surr1 = ratio * advantage
-                surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantage
-                policy_loss = -torch.min(surr1, surr2) - 0.01 * m.entropy()
-                ratio_post = ratio
-                surr1_post = ratio_post * advantage_post
-                surr2_post = torch.clamp(ratio_post, 1 - self.eps_clip, 1 + self.eps_clip) * advantage_post
-                policy_loss_post = -torch.min(surr1_post, surr2_post) - 0.01 * m.entropy()
-                self.optimizer_policy.zero_grad()
-                (policy_loss.pow(2).mean() + policy_loss_post.pow(2).mean() + 0.5 * value.pow(2).mean() + 0.5 * value_post.pow(2).mean()).backward(retain_graph=True)
-                self.optimizer_policy.step()
-    def learn(self, n_episodes=1000, save_interval=100):
-        # Train the agent
-        for episode in range(n_episodes):
-            state = self.env.reset()
-            rewards = []
-            rewards_pre_state = []
-            rewards_post_state = []
-            states = []
-            next_states = []
-            actions = []
-            probs = []
-            post_states = []
-            # next_post_states = []
-            done = False
-            while not done:
-                action, log_prob, prob, value, value_post = self.get_action(state)
-                next_state, reward, done, info = self.env.step(action[0].detach().numpy())
-                machine_setup, inventory_level, setup_cost = self.get_post_state(action[0].detach().numpy(), state[self.env.n_items:self.env.n_items+self.env.n_machines], state[0:self.env.n_items])
-                post_state = state.copy()
-                post_state[self.env.n_items:self.env.n_items+self.env.n_machines] = machine_setup
-                post_state[0:self.env.n_items] = inventory_level
-                post_states.append(post_state)
-                post_state = torch.from_numpy(post_state).float().unsqueeze(0)
-                rewards.append(reward)
-                reward_pre_state = -(self.env.total_cost['holding_costs'] + self.env.total_cost['lost_sales'])
-                reward_post_state = -setup_cost.sum()
-                rewards_pre_state.append(reward_pre_state)
-                rewards_post_state.append(reward_post_state)
-                states.append(state)
-                next_states.append(next_state)
-                actions.append(action)
-                probs.append(prob)
-                state = next_state
-                if done:
-                    self.update(rewards, rewards_pre_state, rewards_post_state, states, post_states, actions, probs, next_states)
-                    print('Episode:', episode, 'Reward:', sum(rewards))
-                    if episode % save_interval == 0:
-                        self.save(f'policy_{episode}.pt')
-            self.save(self.LOG_DIR)
-    def save(self, filepath):
-        torch.save({
-            'policy_state_dict': self.policy.state_dict(),
-            'value_state_dict': self.value.state_dict(),
-            'value_post_state_dict': self.value_post.state_dict(),
-            'optimizer_policy_state_dict': self.optimizer_policy.state_dict(),
-            'optimizer_value_state_dict': self.optimizer_value.state_dict(),
-            'optimizer_value_post_state_dict': self.optimizer_value_post.state_dict()
-        }, filepath)
-    def load(self, filepath):
-        checkpoint = torch.load(filepath)
-        self.policy.load_state_dict(checkpoint['policy_state_dict'])
-        self.value.load_state_dict(checkpoint['value_state_dict'])
-        self.value_post.load_state_dict(checkpoint['value_post_state_dict'])
-        self.optimizer_policy.load_state_dict(checkpoint['optimizer_policy_state_dict'])
-        self.optimizer_value.load_state_dict(checkpoint['optimizer_value_state_dict'])
-        self.optimizer_value_post.load_state_dict(checkpoint['optimizer_value_post_state_dict'])

code/Lot-sizing/agents/{PDPPO one critic.py → PDPPOonecritic.py} RENAMED Viewed

@@ -144,7 +144,7 @@ class ActorCritic(nn.Module):
         return action_logprobs, state_values, dist_entropy
-class PDPPO:
     def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, env, has_continuous_action_space, tau, action_std_init=0.6):
         self.has_continuous_action_space = has_continuous_action_space

         return action_logprobs, state_values, dist_entropy
+class PDPPOonecritic:
     def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, env, has_continuous_action_space, tau, action_std_init=0.6):
         self.has_continuous_action_space = has_continuous_action_space

code/Lot-sizing/agents/PPO.py CHANGED Viewed

@@ -214,7 +214,7 @@ class PPO:
             self.buffer.logprobs.append(action_logprob)
             self.buffer.state_values.append(state_val)
-            return action.numpy()
     def update(self):
         # Monte Carlo estimate of returns
@@ -258,8 +258,6 @@ class PPO:
             # final loss of clipped objective PPO
             loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.012 * dist_entropy
-            loss_numpy = loss.detach().numpy()
             # take gradient step
             self.optimizer.zero_grad()
             loss.mean().backward()

             self.buffer.logprobs.append(action_logprob)
             self.buffer.state_values.append(state_val)
+            return action.cpu().numpy()
     def update(self):
         # Monte Carlo estimate of returns
             # final loss of clipped objective PPO
             loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.012 * dist_entropy
             # take gradient step
             self.optimizer.zero_grad()
             loss.mean().backward()

code/Lot-sizing/agents/PPOAgent.py CHANGED Viewed

@@ -14,7 +14,7 @@ BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
 AGENTS_DIR = os.path.join(BASE_DIR,'agents')
 sys.path.append(AGENTS_DIR)
 from agents.PPO import PPO
-from envs import *
 class SimplePlantSB(SimplePlant):
@@ -155,8 +155,6 @@ class PPOAgent():
         print("training environment name : " + self.experiment_name + '_PPO')
         # state space dimension
         self.state_dim = self.env.observation_space.shape[0]

 AGENTS_DIR = os.path.join(BASE_DIR,'agents')
 sys.path.append(AGENTS_DIR)
 from agents.PPO import PPO
+from envs import  SimplePlant
 class SimplePlantSB(SimplePlant):
         print("training environment name : " + self.experiment_name + '_PPO')
         # state space dimension
         self.state_dim = self.env.observation_space.shape[0]

code/Lot-sizing/agents/PPOAgent_two_critics.py ADDED Viewed

	@@ -0,0 +1,385 @@

+import os # Provides a way of interacting with the file system
+import sys
+import glob # Helps find all the pathnames matching a specified pattern according to the rules used by the Unix shell
+import time # Provides various time-related functions
+from datetime import datetime # Module that supplies classes for working with dates and times
+import numpy as np # A library for the Python programming language, adding support for large, multi-dimensional arrays and matrices
+import gym # Provides a collection of test problems — environments — that you can use to work out your reinforcement learning algorithms
+import torch # A machine learning framework that provides tensor computation (like NumPy) with strong acceleration on GPUs
+import copy # Provides a module for shallow and deep copying operations
+import matplotlib.pyplot as plt # A plotting library for the Python programming language and its numerical mathematics extension NumPy
+import matplotlib.patches as mpatches # Provides a way of adding a colored patch to the plot, for example to create a legend
+BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
+AGENTS_DIR = os.path.join(BASE_DIR,'agents')
+sys.path.append(AGENTS_DIR)
+from agents.PPOtwocritics import PPOtwocritics
+from envs import  SimplePlant
+class SimplePlantSB(SimplePlant):
+    """SimplePlant variant that declares gym action and observation spaces.
+    The action space is MultiDiscrete with ``n_items + 1`` choices for each of
+    the ``n_machines`` machines. Observations are either the dict returned by
+    ``SimplePlant._next_observation`` (when ``dict_obs`` is true) or a flat
+    array made of the inventory levels followed by the machine setups.
+    """
+    def __init__(self, settings, stoch_model):
+        """
+        Build the plant and declare its gym spaces
+        Parameters
+        ----------
+        settings : settings forwarded to SimplePlant; the optional 'dict_obs'
+            entry selects dict observations (defaults to False when missing)
+        stoch_model : stochastic model forwarded unchanged to SimplePlant
+        """
+        super().__init__(settings, stoch_model)
+        # 'dict_obs' is optional: only a missing key (or a settings object that
+        # cannot be subscripted) falls back to flat observations.
+        try:
+            self.dict_obs = settings['dict_obs']
+        except (KeyError, TypeError):
+            self.dict_obs = False
+        self.last_inventory = copy.copy(self.inventory_level)
+        self.action_space = gym.spaces.MultiDiscrete(
+            [self.n_items+1] * self.n_machines
+        )
+        if self.dict_obs:
+            self.observation_space = gym.spaces.Dict({
+                'inventory_level': gym.spaces.Box(low = np.zeros(self.n_items),high = np.ones(self.n_items)*(settings['max_inventory_level'][0]+1)*self.n_items),
+                'machine_setup': gym.spaces.MultiDiscrete([self.n_items+1] * self.n_machines)
+                #'last_inventory_level':gym.spaces.Box(low = np.zeros(self.n_items),high = np.ones(self.n_items)*(settings['max_inventory_level'][0]+1)*self.n_items)
+            })
+        else:
+            self.observation_space = gym.spaces.Box(
+                low=np.zeros(self.n_items+self.n_machines), # low for inventory levels and machine setups
+                high=np.concatenate(
+                    [
+                        np.array(self.max_inventory_level), # high for the inventory level
+                        np.ones(self.n_machines) * (self.n_items+1), # high for the machine setups
+                    ]),
+                dtype=np.int32
+            )
+    def step(self, action):
+        """
+        Step method: Execute one time step within the environment
+        Parameters
+        ----------
+        action : action given by the agent
+        Returns
+        -------
+        obs : observation produced by the _next_observation method
+        reward : negative sum of all entries of the cost dict returned by _take_action
+        done : True when current_step has reached the horizon T
+        dict : the cost dict itself, returned as the info element
+        """
+        self.last_inventory = copy.copy(self.inventory_level)
+        self.total_cost = self._take_action(action, self.machine_setup, self.inventory_level, self.demand)
+        # self.total_cost['setup_costs'] = 0
+        # self.total_cost['holding_costs'] = 0
+        # The reward is the negated total of every cost component.
+        reward = -sum(self.total_cost.values())
+        #reward = -self.total_cost['lost_sales']
+        #reward = np.abs(action)
+        self.current_step += 1
+        done = self.current_step == self.T
+        obs = self._next_observation()
+        return obs, reward, done, self.total_cost
+    def _next_observation(self):
+        """
+        Returns the next observation in the format selected by dict_obs
+        Raises
+        ------
+        ValueError : if dict_obs is set but the parent returned a non-dict observation
+        """
+        obs = SimplePlant._next_observation(self)
+        #obs['last_inventory_level'] = copy.copy(self.last_inventory)
+        if isinstance(obs, dict):
+            if not self.dict_obs:
+                # Flatten: inventory levels first, machine setups second.
+                obs = np.concatenate(
+                    (
+                        obs['inventory_level'], # n_items size
+                        obs['machine_setup'], # n_machine size
+                        #obs['last_inventory_level']# n_items size
+                    )
+                )
+        elif self.dict_obs:
+            # Raising a plain string is itself a TypeError in Python 3.
+            raise ValueError('Change dict_obs to False')
+        return obs
+class PPOAgent_two_critics():
+    """Trains and queries a PPOtwocritics policy on a SimplePlantSB environment.
+    The environment is rebuilt from ``env.settings`` and ``env.stoch_model``.
+    Hyperparameters are stored as attributes; logs and checkpoints are written
+    under a ``logs`` folder located in the parent of this module's directory.
+    """
+    def __init__(self, env: SimplePlant, settings: dict):
+        """
+        Parameters
+        ----------
+        env : plant whose settings and stoch_model are used to build the training environment
+        settings : must contain 'model_name', 'experiment_name' and 'parallelization';
+            'dict_obs' is optional and defaults to False
+        """
+        self.env = SimplePlantSB(env.settings, env.stoch_model)
+        self.last_inventory = env.inventory_level
+        self.model_name = settings['model_name']
+        self.experiment_name = settings['experiment_name']
+        self.parallelization = settings['parallelization']
+        try:
+            self.dict_obs = settings['dict_obs']
+        except KeyError:
+            self.dict_obs = False
+        self.POSSIBLE_STATES = self.env.n_items + 1
+        self.env.cost_to_reward = True
+        self.epsilon = 0
+        BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        # Use the logs file in the root path of the main.
+        self.LOG_DIR = os.path.join(BASE_DIR,'logs')
+        print("============================================================================================")
+        ####### initialize environment hyperparameters ######
+        self.has_continuous_action_space = False  # continuous action space; else discrete
+        self.max_ep_len = 1000                   # max timesteps in one episode
+        self.print_freq = self.max_ep_len * 10        # print avg reward in the interval (in num timesteps)
+        self.log_freq = self.max_ep_len * 2           # log avg reward in the interval (in num timesteps)
+        self.save_model_freq = int(4999)          # save model frequency (in num timesteps)
+        self.action_std = 0.6                    # starting std for action distribution (Multivariate Normal)
+        self.action_std_decay_rate = 0.05        # linearly decay self.action_std (self.action_std = self.action_std - self.action_std_decay_rate)
+        self.min_action_std = 0.1                # minimum self.action_std (stop decay after self.action_std <= self.min_action_std)
+        self.action_std_decay_freq = int(2.5e5)  # self.action_std decay frequency (in num timesteps)
+        #####################################################
+        ## Note : print/log frequencies should be > than self.max_ep_len
+        ################ PPO_two_critics hyperparameters ################
+        self.update_timestep = self.max_ep_len * 4      # update policy every n timesteps
+        self.K_epochs = 60               # update policy for K epochs in one PPO_two_critics update
+        self.eps_clip = 0.2          # clip parameter for PPO_two_critics
+        self.gamma = 0.99            # discount factor
+        self.lr_actor = 0.00055       # learning rate for actor network
+        self.lr_critic = 0.001       # learning rate for critic network
+        self.random_seed = 0         # set random seed if required (0 = no random seed)
+        #####################################################
+        self.run_num_pretrained = 0      #### change this to prevent overwriting weights in same self.experiment_name folder
+        print("training environment name : " + self.experiment_name + '_PPO_two_critics')
+        # state space dimension
+        self.state_dim = self.env.observation_space.shape[0]
+        # action space dimension
+        if self.has_continuous_action_space:
+            self.action_dim = self.env.action_space.shape[0]
+        else:
+            # the discrete policy receives the whole MultiDiscrete space
+            self.action_dim = self.env.action_space
+        self.ppo_agent = PPOtwocritics(self.state_dim, self.action_dim, self.lr_actor, self.lr_critic, self.gamma, self.K_epochs, self.eps_clip, self.has_continuous_action_space, self.action_std)
+    def _run_directory(self):
+        """Return the folder (with trailing '/') holding this experiment's logs and checkpoints."""
+        return self.LOG_DIR + '/' + self.experiment_name + '_PPO_two_critics' + '/'
+    def _checkpoint_path(self):
+        """Return the checkpoint file path shared by learn() and load_agent()."""
+        return self._run_directory() + "PPO_two_critics_{}_{}_{}.pth".format(self.experiment_name, self.random_seed, self.run_num_pretrained)
+    def _print_hyperparameters(self):
+        """Print the training configuration banner shown before training starts."""
+        print("--------------------------------------------------------------------------------------------")
+        print("max training timesteps : ", self.max_training_timesteps)
+        print("max timesteps per episode : ", self.max_ep_len)
+        print("model saving frequency : " + str(self.save_model_freq) + " timesteps")
+        print("log frequency : " + str(self.log_freq) + " timesteps")
+        print("printing average reward over episodes in last : " + str(self.print_freq) + " timesteps")
+        print("--------------------------------------------------------------------------------------------")
+        print("state space dimension : ", self.state_dim)
+        print("action space dimension : ", self.action_dim)
+        print("--------------------------------------------------------------------------------------------")
+        if self.has_continuous_action_space:
+            print("Initializing a continuous action space policy")
+            print("--------------------------------------------------------------------------------------------")
+            print("starting std of action distribution : ", self.action_std)
+            print("decay rate of std of action distribution : ", self.action_std_decay_rate)
+            # was `min_self.action_std`, an undefined name
+            print("minimum std of action distribution : ", self.min_action_std)
+            print("decay frequency of std of action distribution : " + str(self.action_std_decay_freq) + " timesteps")
+        else:
+            print("Initializing a discrete action space policy")
+        print("--------------------------------------------------------------------------------------------")
+        print("PPO_two_critics update frequency : " + str(self.update_timestep) + " timesteps")
+        print("PPO_two_critics K epochs : ", self.K_epochs)
+        print("PPO_two_critics epsilon clip : ", self.eps_clip)
+        print("discount factor (self.gamma) : ", self.gamma)
+        print("--------------------------------------------------------------------------------------------")
+        print("optimizer learning rate actor : ", self.lr_actor)
+        print("optimizer learning rate critic : ", self.lr_critic)
+        if self.random_seed:
+            # NOTE(review): only the message is printed; no seed is applied in this class.
+            print("--------------------------------------------------------------------------------------------")
+            print("setting random seed to ", self.random_seed)
+        #####################################################
+        print("============================================================================================")
+    ################################### Training ###################################
+    def learn(self, n_episodes=100000):
+        """
+        Run the training loop, writing a CSV log and periodic checkpoints
+        Parameters
+        ----------
+        n_episodes : despite its name this is a budget of environment timesteps;
+            new episodes are started while the global step counter is <= this value
+        """
+        ###################### logging ######################
+        self.max_training_timesteps = n_episodes   # break training loop if timesteps > self.max_training_timesteps
+        env = self.env
+        #### log files for multiple runs are NOT overwritten
+        log_dir = self._run_directory()
+        os.makedirs(log_dir, exist_ok=True)
+        #### run number = count of files already in the folder (checkpoints included)
+        run_num = len(next(os.walk(log_dir))[2])
+        #### create new log file for each run
+        log_f_name = log_dir + '/PPO_two_critics_' + self.experiment_name + "_log_" + str(run_num) + ".csv"
+        print("current logging run number for " + self.experiment_name + " : ", run_num)
+        print("logging at : " + log_f_name)
+        ################### checkpointing ###################
+        checkpoint_path = self._checkpoint_path()
+        print("save checkpoint path : " + checkpoint_path)
+        ############# print all hyperparameters #############
+        self._print_hyperparameters()
+        ################# training procedure ################
+        # initialize a fresh PPO agent for this training run
+        self.ppo_agent = PPOtwocritics(self.state_dim, self.action_dim, self.lr_actor, self.lr_critic, self.gamma, self.K_epochs, self.eps_clip, self.has_continuous_action_space, self.action_std)
+        # track total training time
+        start_time = datetime.now().replace(microsecond=0)
+        print("Started training at (GMT) : ", start_time)
+        print("============================================================================================")
+        # printing and logging variables
+        print_running_reward = 0
+        print_running_episodes = 0
+        log_running_reward = 0
+        log_running_episodes = 0
+        time_step = 0
+        i_episode = 0
+        # the context manager closes the log file even if training raises
+        with open(log_f_name, "w+") as log_f:
+            log_f.write('episode,timestep,reward\n')
+            # training loop
+            while time_step <= self.max_training_timesteps:
+                state = env.reset()
+                current_ep_reward = 0
+                for t in range(1, self.max_ep_len+1):
+                    # select action with policy
+                    action = self.ppo_agent.select_action(state)
+                    state, reward, done, _ = env.step(action)
+                    # saving reward and is_terminals
+                    self.ppo_agent.buffer.rewards.append(reward)
+                    self.ppo_agent.buffer.is_terminals.append(done)
+                    time_step += 1
+                    current_ep_reward += reward
+                    # update PPO_two_critics agent
+                    if time_step % self.update_timestep == 0:
+                        self.ppo_agent.update()
+                    # if continuous action space; then decay action std of output action distribution
+                    if self.has_continuous_action_space and time_step % self.action_std_decay_freq == 0:
+                        # was `decay_self.action_std(..., self.action_std)`: wrong method name and wrong floor
+                        self.ppo_agent.decay_action_std(self.action_std_decay_rate, self.min_action_std)
+                    # log in logging file
+                    if time_step % self.log_freq == 0:
+                        # log average reward till last episode (guard: no episode finished yet)
+                        log_avg_reward = log_running_reward / max(log_running_episodes, 1)
+                        log_avg_reward = round(log_avg_reward, 4)
+                        log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
+                        log_f.flush()
+                        log_running_reward = 0
+                        log_running_episodes = 0
+                    # printing average reward
+                    if time_step % self.print_freq == 0:
+                        # print average reward till last episode (guard: no episode finished yet)
+                        print_avg_reward = print_running_reward / max(print_running_episodes, 1)
+                        print_avg_reward = round(print_avg_reward, 2)
+                        print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))
+                        print_running_reward = 0
+                        print_running_episodes = 0
+                    # save model weights
+                    if time_step % self.save_model_freq == 0:
+                        print("--------------------------------------------------------------------------------------------")
+                        print("saving model at : " + checkpoint_path)
+                        self.ppo_agent.save(checkpoint_path)
+                        print("model saved")
+                        print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
+                        print("--------------------------------------------------------------------------------------------")
+                    # break; if the episode is over
+                    if done:
+                        break
+                print_running_reward += current_ep_reward
+                print_running_episodes += 1
+                log_running_reward += current_ep_reward
+                log_running_episodes += 1
+                i_episode += 1
+        #env.close()
+        # print total training time
+        print("============================================================================================")
+        end_time = datetime.now().replace(microsecond=0)
+        print("Started training at (GMT) : ", start_time)
+        print("Finished training at (GMT) : ", end_time)
+        print("Total training time  : ", end_time - start_time)
+        print("============================================================================================")
+    def get_action(self,state):
+        """
+        Returns the action chosen by the policy for the given state
+        Parameters
+        ----------
+        state : flat observation, or a dict with 'inventory_level' and
+            'machine_setup' that is flattened when dict_obs is False
+        Raises
+        ------
+        ValueError : if dict_obs is set but the state is not a dict
+        """
+        if isinstance(state, dict):
+            if not self.dict_obs:
+                state = np.concatenate(
+                    (
+                        state['inventory_level'], # n_items size
+                        state['machine_setup'], # n_machine size
+                    )
+                )
+        elif self.dict_obs:
+            # Raising a plain string is itself a TypeError in Python 3.
+            raise ValueError('Change dict_obs to False')
+        return self.ppo_agent.select_action(state)
+    def load_agent(self,path):
+        """
+        Loads the policy weights from the checkpoint written by learn
+        Parameters
+        ----------
+        path : currently ignored; the location is derived from LOG_DIR,
+            experiment_name, random_seed and run_num_pretrained
+        """
+        #directory = "PPO_two_critics_preTrained" + '/' + env_name + '/'
+        checkpoint_path = self._checkpoint_path()
+        print("loading network from : " + checkpoint_path)
+        self.ppo_agent.load(checkpoint_path)

code/Lot-sizing/agents/{PDPPO_one_critic.py → PPOtwocritics.py} RENAMED Viewed

@@ -5,10 +5,8 @@ Created on Wed Mar  1 00:43:49 2023
 @author: leona
 """
-import numpy as np
 import torch
 import torch.nn as nn
-import torch.nn.init as init
 from torch.distributions import MultivariateNormal
 from torch.distributions import Categorical
@@ -25,26 +23,22 @@ else:
 print("============================================================================================")
-################################## PDPPO Policy ##################################
 class RolloutBuffer:
     def __init__(self):
         self.actions = []
         self.states = []
-        self.post_states = []
         self.logprobs = []
         self.rewards = []
         self.state_values = []
-        self.state_values_post = []
         self.is_terminals = []
     def clear(self):
         del self.actions[:]
         del self.states[:]
-        del self.post_states[:]
         del self.logprobs[:]
         del self.rewards[:]
         del self.state_values[:]
-        del self.state_values_post[:]
         del self.is_terminals[:]
@@ -74,7 +68,6 @@ class ActorCritic(nn.Module):
             self.fc2 = nn.Linear(128, 128)
             self.actor = nn.Linear(128, self.action_dim.nvec.sum())
         # critic
         self.critic = nn.Sequential(
                         nn.Linear(state_dim, 128),
@@ -84,6 +77,13 @@ class ActorCritic(nn.Module):
                         nn.Linear(128, 1)
                     )
     def forward(self, state):
         raise NotImplementedError
@@ -100,24 +100,28 @@ class ActorCritic(nn.Module):
-    def act(self, state,tau):
         if self.has_continuous_action_space:
             action_mean = self.actor(state)
             cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
             dist = MultivariateNormal(action_mean, cov_mat)
         else:
             x = nn.functional.relu(self.fc2(nn.functional.relu(self.fc1(state))))
             logits = self.actor(x)
             action_probs = nn.functional.softmax(logits, dim=-1)
             dist = Categorical(action_probs.view(len(self.action_dim.nvec),-1))
         action = dist.sample()
         action_logprob = dist.log_prob(action)
-        return action.detach(), action_logprob.detach()
-    def evaluate(self, state,post_state, action,tau):
         if self.has_continuous_action_space:
             action_mean = self.actor(state)
@@ -130,30 +134,29 @@ class ActorCritic(nn.Module):
             if self.action_dim == 1:
                 action = action.reshape(-1, self.action_dim)
         else:
             x = nn.functional.relu(self.fc2(nn.functional.relu(self.fc1(state))))
             logits = self.actor(x)
             action_probs = nn.functional.softmax(logits, dim=-1)
             dist = Categorical(action_probs.view(state.shape[0],len(self.action_dim.nvec),-1))
             # action_probs = self.actor(state)
             # dist = Categorical(action_probs)
         action_logprobs = dist.log_prob(action)
         dist_entropy = dist.entropy()
-        state_values = self.critic(post_state)
-        return action_logprobs, state_values, dist_entropy
-class PDPPO:
-    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, env, has_continuous_action_space, tau, action_std_init=0.6):
         self.has_continuous_action_space = has_continuous_action_space
         if has_continuous_action_space:
             self.action_std = action_std_init
-        self.tau = tau
-        self.env = env
         self.gamma = gamma
         self.eps_clip = eps_clip
         self.K_epochs = K_epochs
@@ -164,7 +167,7 @@ class PDPPO:
         self.optimizer = torch.optim.Adam([
                         {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                         {'params': self.policy.critic.parameters(), 'lr': lr_critic}
-                    ], weight_decay=0.001)
         self.policy_old = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
         self.policy_old.load_state_dict(self.policy.state_dict())
@@ -178,7 +181,7 @@ class PDPPO:
             self.policy_old.set_action_std(new_action_std)
         else:
             print("--------------------------------------------------------------------------------------------")
-            print("WARNING : Calling PDPPO::set_action_std() on discrete action space policy")
             print("--------------------------------------------------------------------------------------------")
     def decay_action_std(self, action_std_decay_rate, min_action_std):
@@ -194,64 +197,33 @@ class PDPPO:
             self.set_action_std(self.action_std)
         else:
-            print("WARNING : Calling PDPPO::decay_action_std() on discrete action space policy")
         print("--------------------------------------------------------------------------------------------")
-    def get_post_state(self, action, machine_setup, inventory_level):
-        setup_loss = np.zeros(self.env.n_machines, dtype=int)
-        setup_costs = np.zeros(self.env.n_machines)
-        # if we are just changing the setup, we use the setup cost matrix with the corresponding position given by the actual setup and the new setup
-        for m in range(self.env.n_machines):
-            if action[m] != 0: # if the machine is not iddle
-                # 1. IF NEEDED CHANGE SETUP
-                if machine_setup[m] != action[m] and action[m] != 0:
-                    setup_costs[m] = self.env.setup_costs[m][action[m] - 1]
-                    setup_loss[m] = self.env.setup_loss[m][action[m] - 1]
-                machine_setup[m] = action[m]
-                # 2. PRODUCTION
-                production = self.env.machine_production_matrix[m][action[m] - 1] - setup_loss[m]
-                inventory_level[action[m] - 1] += production
-            else:
-                machine_setup[m] = 0
-        # return the new machine_setup_inventory_level and the setup_cost
-        return machine_setup, inventory_level, setup_costs
-    def select_action(self, state,tau):
         if self.has_continuous_action_space:
             with torch.no_grad():
                 state = torch.FloatTensor(state).to(device)
-                action, action_logprob, state_val = self.policy_old.act(state,tau)
             self.buffer.states.append(state)
             self.buffer.actions.append(action)
             self.buffer.logprobs.append(action_logprob)
             self.buffer.state_values.append(state_val)
             return action.detach().cpu().numpy().flatten()
         else:
             with torch.no_grad():
                 state = torch.FloatTensor(state).to(device)
-                action, action_logprob = self.policy_old.act(state,tau)
-            machine_setup, inventory_level, setup_cost = self.get_post_state(action, state[self.env.n_items:self.env.n_items+self.env.n_machines].clone(), state[0:self.env.n_items].clone())
-            post_state = state.clone()
-            post_state[self.env.n_items:self.env.n_items+self.env.n_machines] = machine_setup.clone()
-            post_state[0:self.env.n_items] = inventory_level.clone()
-            post_state = torch.FloatTensor(post_state).to(device)
             self.buffer.states.append(state)
-            self.buffer.post_states.append(post_state)
             self.buffer.actions.append(action)
             self.buffer.logprobs.append(action_logprob)
-            with torch.no_grad():
-                #post_state = torch.cat([post_state.clone(),state.clone()])
-                state_val = self.policy_old.critic(post_state)
             self.buffer.state_values.append(state_val)
             return action.numpy()
@@ -271,19 +243,19 @@ class PDPPO:
         # convert list to tensor
         old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
-        old_post_states = torch.squeeze(torch.stack(self.buffer.post_states, dim=0)).detach().to(device)
         old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
         old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)
         old_state_values = torch.squeeze(torch.stack(self.buffer.state_values, dim=0)).detach().to(device)
         # calculate advantages
-        advantages = rewards.detach() - old_state_values.detach()
         # Optimize policy for K epochs
         for _ in range(self.K_epochs):
             # Evaluating old actions and values
-            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states,old_post_states, old_actions,self.tau)
             # match state_values tensor dimensions with rewards tensor
             state_values = torch.squeeze(state_values)
@@ -295,19 +267,17 @@ class PDPPO:
             surr1 = ratios * advantages.unsqueeze(1)
             surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages.unsqueeze(1)
-            # final loss of clipped objective PDPPO
-            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.012 * dist_entropy
             loss_numpy = loss.detach().numpy()
             # take gradient step
             self.optimizer.zero_grad()
             loss.mean().backward()
-            torch.nn.utils.clip_grad_norm_(self.policy.parameters(), max_norm=1)
             self.optimizer.step()
         # Copy new weights into old policy
         self.policy_old.load_state_dict(self.policy.state_dict())
         # clear buffer

 @author: leona
 """
 import torch
 import torch.nn as nn
 from torch.distributions import MultivariateNormal
 from torch.distributions import Categorical
 print("============================================================================================")
+################################## PPO_two_critics Policy ##################################
 class RolloutBuffer:
+    """Lists holding one rollout's transitions, filled between policy updates."""
     def __init__(self):
         self.actions = []
         self.states = []
         self.logprobs = []
         self.rewards = []
         self.state_values = []
+        # Second-critic value estimates: select_action appends to this list
+        # and the update step stacks it next to state_values.
+        self.state_values_2 = []
         self.is_terminals = []
     def clear(self):
+        """Empty every list in place so existing references stay valid."""
         del self.actions[:]
         del self.states[:]
         del self.logprobs[:]
         del self.rewards[:]
         del self.state_values[:]
+        del self.state_values_2[:]
         del self.is_terminals[:]
             self.fc2 = nn.Linear(128, 128)
             self.actor = nn.Linear(128, self.action_dim.nvec.sum())
         # critic
         self.critic = nn.Sequential(
                         nn.Linear(state_dim, 128),
                         nn.Linear(128, 1)
                     )
+        self.critic_2 = nn.Sequential(
+                        nn.Linear(state_dim, 128),
+                        nn.Tanh(),
+                        nn.Linear(128, 128),
+                        nn.Tanh(),
+                        nn.Linear(128, 1)
+                    )
     def forward(self, state):
         raise NotImplementedError
+    def act(self, state):
+        """Sample an action for ``state`` and evaluate the first critic.
+        Returns a 3-tuple: the sampled action (moved to CPU and detached), its
+        log-probability (detached) and ``self.critic(state)`` (detached).
+        ``self.critic_2`` is not evaluated here.
+        """
         if self.has_continuous_action_space:
             action_mean = self.actor(state)
             cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
             dist = MultivariateNormal(action_mean, cov_mat)
         else:
+            #x = nn.functional.relu(self.fc(state))
             x = nn.functional.relu(self.fc2(nn.functional.relu(self.fc1(state))))
             logits = self.actor(x)
+            # Softmax runs over the whole logit vector; the view then gives one
+            # row per entry of action_dim.nvec, so the rows must be equal-sized.
+            # NOTE(review): Categorical is expected to renormalise each row of
+            # probs -- confirm against the torch.distributions documentation.
             action_probs = nn.functional.softmax(logits, dim=-1)
             dist = Categorical(action_probs.view(len(self.action_dim.nvec),-1))
+            # action_probs = self.actor(state)
+            # dist = Categorical(action_probs)
         action = dist.sample()
         action_logprob = dist.log_prob(action)
+        state_val = self.critic(state)
+        return action.cpu().detach(), action_logprob.detach(), state_val.detach()
+    def evaluate(self, state, action):
         if self.has_continuous_action_space:
             action_mean = self.actor(state)
             if self.action_dim == 1:
                 action = action.reshape(-1, self.action_dim)
         else:
+            #x = nn.functional.relu(self.fc(state))
             x = nn.functional.relu(self.fc2(nn.functional.relu(self.fc1(state))))
             logits = self.actor(x)
             action_probs = nn.functional.softmax(logits, dim=-1)
             dist = Categorical(action_probs.view(state.shape[0],len(self.action_dim.nvec),-1))
             # action_probs = self.actor(state)
             # dist = Categorical(action_probs)
         action_logprobs = dist.log_prob(action)
         dist_entropy = dist.entropy()
+        state_values = self.critic(state)
+        state_values_2 = self.critic_2(state)
+        return action_logprobs, state_values, state_values_2, dist_entropy
+class PPOtwocritics:
+    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std_init=0.6):
         self.has_continuous_action_space = has_continuous_action_space
         if has_continuous_action_space:
             self.action_std = action_std_init
         self.gamma = gamma
         self.eps_clip = eps_clip
         self.K_epochs = K_epochs
         self.optimizer = torch.optim.Adam([
                         {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                         {'params': self.policy.critic.parameters(), 'lr': lr_critic}
+                    ])
         self.policy_old = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
         self.policy_old.load_state_dict(self.policy.state_dict())
             self.policy_old.set_action_std(new_action_std)
         else:
             print("--------------------------------------------------------------------------------------------")
+            print("WARNING : Calling PPO_two_critics::set_action_std() on discrete action space policy")
             print("--------------------------------------------------------------------------------------------")
     def decay_action_std(self, action_std_decay_rate, min_action_std):
             self.set_action_std(self.action_std)
         else:
+            print("WARNING : Calling PPO_two_critics::decay_action_std() on discrete action space policy")
         print("--------------------------------------------------------------------------------------------")
+    def select_action(self, state):
+        """Sample an action from the old policy and record the transition.
+        The state tensor, action, log-probability and both critics' value
+        estimates are appended to ``self.buffer``.
+        Returns the action as a NumPy array (flattened for continuous action
+        spaces).
+        """
+        with torch.no_grad():
+            state = torch.FloatTensor(state).to(device)
+            # act() returns exactly three values (action, log-prob, first
+            # critic value); unpacking four raised ValueError on every call.
+            action, action_logprob, state_val = self.policy_old.act(state)
+            # The second critic's estimate is computed here for the buffer.
+            state_val_2 = self.policy_old.critic_2(state).detach()
+        self.buffer.states.append(state)
+        self.buffer.actions.append(action)
+        self.buffer.logprobs.append(action_logprob)
+        self.buffer.state_values.append(state_val)
+        self.buffer.state_values_2.append(state_val_2)
+        action_np = action.detach().cpu().numpy()
+        if self.has_continuous_action_space:
+            return action_np.flatten()
+        return action_np
         # convert list to tensor
         old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
         old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
         old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)
         old_state_values = torch.squeeze(torch.stack(self.buffer.state_values, dim=0)).detach().to(device)
+        old_state_values_2 = torch.squeeze(torch.stack(self.buffer.state_values_2, dim=0)).detach().to(device)
         # calculate advantages
+        advantages = rewards.detach() - torch.min(old_state_values.detach(), old_state_values_2.detach()).detach()
         # Optimize policy for K epochs
         for _ in range(self.K_epochs):
             # Evaluating old actions and values
+            logprobs, state_values, state_values_2, dist_entropy = self.policy.evaluate(old_states, old_actions, self.tau)
             # match state_values tensor dimensions with rewards tensor
             state_values = torch.squeeze(state_values)
             surr1 = ratios * advantages.unsqueeze(1)
             surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages.unsqueeze(1)
+            # final loss of clipped objective PPO_two_critics
+            loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(torch.min(state_values,state_values_2.squeeze()), rewards) - 0.012 * dist_entropy
             loss_numpy = loss.detach().numpy()
             # take gradient step
             self.optimizer.zero_grad()
             loss.mean().backward()
             self.optimizer.step()
         # Copy new weights into old policy
         self.policy_old.load_state_dict(self.policy.state_dict())
         # clear buffer

code/Lot-sizing/agents/__init__.py CHANGED Viewed

@@ -1,36 +1,17 @@
-from .dummyAgent import DummyAgent
-from .qLearningAgent import QLearningAgent
-from .stochasticProgrammingAgent import StochasticProgrammingAgent
-from .valueIteration import ValueIteration
-from .approximateValueIterationMC import ValueIterationMC
 from .stableBaselineAgents import StableBaselineAgent
-from .regressionTreeApproximation import RegressionTreeApproximation
-from .PSOAgent import PSOagent
-from .adpAgentHD import AdpAgentHD
-from .adpAgentHD1 import AdpAgentHD1
-from .adpAgentHD3 import AdpAgentHD3
-from .multiAgentRL import MultiAgentRL
 from .perfectInfoAgent import PerfectInfoAgent
-from .ensembleAgent import EnsembleAgent
-from .PPOAgent import PPOAgent
-from .PDPPOAgent_one_critic import PDPPOAgent
 __all__ = [
     "DummyAgent",
-    "QLearningAgent",
-    "StochasticProgrammingAgent",
-    "ValueIteration",
-    "ValueIterationMC",
-    "RegressionTreeApproximation",
     "StableBaselineAgent",
-    "PSOagent",
-    "AdpAgentHD",
-    "AdpAgentHD1",
-    "AdpAgentHD3",
-    "MultiAgentRL",
-    "PerfectInfoAgent",
-    "EnsembleAgent",
     "PPOAgent",
     "PDPPOAgent_one_critic"
 ]

+from .PPOAgent import PPOAgent
+from .PDPPOAgent import PDPPOAgent
+from .PPOAgent_two_critics import PPOAgent_two_critics
+from .PDPPOAgent_one_critic import PDPPOAgent_one_critic
+from .stableBaselineAgents import StableBaselineAgent
 from .stableBaselineAgents import StableBaselineAgent
 from .perfectInfoAgent import PerfectInfoAgent
 __all__ = [
     "DummyAgent",
+    "PerfectInfoAgent",
     "StableBaselineAgent",
     "PPOAgent",
+    "PPOAgent_two_critics",
+    "PDPPOAgent",
     "PDPPOAgent_one_critic"
 ]

code/Lot-sizing/agents/__pycache__/PDPPO.cpython-38.pyc ADDED Viewed

Binary file (8.99 kB). View file

code/Lot-sizing/agents/__pycache__/PDPPOAgent.cpython-38.pyc ADDED Viewed

Binary file (8.68 kB). View file

code/Lot-sizing/agents/__pycache__/PDPPOAgent_one_critic.cpython-38.pyc ADDED Viewed

Binary file (8.86 kB). View file

code/Lot-sizing/agents/__pycache__/PDPPO_one_critic.cpython-38.pyc ADDED Viewed

Binary file (8.73 kB). View file

code/Lot-sizing/agents/__pycache__/PDPPOonecritic.cpython-38.pyc ADDED Viewed

Binary file (8.82 kB). View file

code/Lot-sizing/agents/__pycache__/PPO.cpython-38.pyc ADDED Viewed

Binary file (7.5 kB). View file

code/Lot-sizing/agents/__pycache__/PPOAgent.cpython-38.pyc ADDED Viewed

Binary file (8.48 kB). View file

code/Lot-sizing/agents/__pycache__/PPOAgent_two_critics.cpython-38.pyc ADDED Viewed

Binary file (8.65 kB). View file

code/Lot-sizing/agents/__pycache__/PPOtwocritics.cpython-38.pyc ADDED Viewed

Binary file (7.93 kB). View file

code/Lot-sizing/agents/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (548 Bytes). View file

code/Lot-sizing/agents/__pycache__/perfectInfoAgent.cpython-38.pyc ADDED Viewed

Binary file (1.04 kB). View file

code/Lot-sizing/agents/__pycache__/stableBaselineAgents.cpython-38.pyc ADDED Viewed

Binary file (11 kB). View file

code/Lot-sizing/agents/perfectInfoAgent.py ADDED Viewed

	@@ -0,0 +1,18 @@

+# -*- coding: utf-8 -*-
+from models import *
+from envs import *
+class PerfectInfoAgent():
+    def __init__(self, env, settings):
+        super(PerfectInfoAgent, self).__init__()
+        self.env = env
+        self.solver = PerfectInfoOptimization(env)
+        _, self.sol, _ = self.solver.solve()
+        self.sol = self.sol.astype(int)
+    def learn(self, epochs = 1000):
+        pass
+    def get_action(self, obs):
+        return list(self.sol[:,self.env.current_step])

code/Lot-sizing/agents/stableBaselineAgents.py ADDED Viewed

	@@ -0,0 +1,320 @@

+# -*- coding: utf-8 -*-
+import os
+import time
+import gym
+import torch
+import numpy as np
+import copy
+from envs import SimplePlant
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+from stable_baselines3 import PPO,A2C,DQN,SAC,DDPG
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.vec_env import SubprocVecEnv
+from stable_baselines3.common.callbacks import EvalCallback
+class SimplePlantSB(SimplePlant):
+    def __init__(self, settings, stoch_model):
+        super().__init__(settings, stoch_model)
+        try:self.dict_obs = settings['dict_obs']
+        except:self.dict_obs = False
+        self.last_inventory = copy.copy(self.inventory_level)
+        self.action_space = gym.spaces.MultiDiscrete(
+            [self.n_items+1] * self.n_machines
+        )
+        if self.dict_obs:
+            self.observation_space = gym.spaces.Dict({
+                'inventory_level': gym.spaces.Box(low = np.zeros(self.n_items),high = np.ones(self.n_items)*(settings['max_inventory_level'][0]+1)*self.n_items),
+                'machine_setup': gym.spaces.MultiDiscrete([self.n_items+1] * self.n_machines),
+                'last_inventory_level':gym.spaces.Box(low = np.zeros(self.n_items),high = np.ones(self.n_items)*(settings['max_inventory_level'][0]+1)*self.n_items)
+            })
+        else:
+            self.observation_space = gym.spaces.Box(
+                low=np.zeros(2*self.n_items+self.n_machines),# high for the inventory level
+                high=np.concatenate(
+                    [
+                        np.array(self.max_inventory_level),
+                        np.ones(self.n_machines) * (self.n_items+1), #high for the machine setups
+                        np.array(self.max_inventory_level) # high for the inventory level
+                    ]),
+                dtype=np.int32
+            )
+    def step(self, action):
+        """
+        Step method: Execute one time step within the environment
+        Parameters
+        ----------
+        action : action given by the agent
+        Returns
+        -------
+        obs : Observation of the state give the method _next_observation
+        reward : Cost given by the _reward method
+        done : returns True or False given by the _done method
+        dict : possible information for control to environment monitoring
+        """
+        self.last_inventory = copy.copy(self.inventory_level)
+        self.total_cost = self._take_action(action, self.machine_setup, self.inventory_level, self.demand)
+        # self.total_cost['setup_costs'] = 0
+        # self.total_cost['holding_costs'] = 0
+        reward = -sum([ele for key, ele in self.total_cost.items()])
+        #reward = -self.total_cost['lost_sales']
+        #reward = np.abs(action)
+        self.current_step += 1
+        done = self.current_step == self.T
+        obs = self._next_observation()
+        return obs, reward, done, self.total_cost
+    def _next_observation(self):
+        """
+        Returns the next demand
+        """
+        obs = SimplePlant._next_observation(self)
+        obs['last_inventory_level'] = copy.copy(self.last_inventory)
+        if isinstance(obs, dict):
+            if not self.dict_obs:
+                obs = np.concatenate(
+                    (
+                        obs['inventory_level'], # n_items size
+                        obs['machine_setup'], # n_machine size
+                        obs['last_inventory_level']# n_items size
+                    )
+                )
+        else:
+            if self.dict_obs:
+                raise('Change dict_obst to False')
+        return obs
+class StableBaselineAgent():
+    """
+    Stable baseline Agent Agent from StableBaselines3
+    We adapt the env to stablebaseline requirements:
+    A different _next_observation is required, with the observation space.
+    """
+    def __init__(self, env: SimplePlant, settings: dict):
+        super(StableBaselineAgent, self).__init__()
+        if settings['multiagent']:
+            self.env = env
+        else:
+            self.env = SimplePlantSB(env.settings, env.stoch_model)
+        self.last_inventory = env.inventory_level
+        self.model_name = settings['model_name']
+        self.experiment_name = settings['experiment_name']
+        self.parallelization = settings['parallelization']
+        self.run = settings['run']
+        try:self.dict_obs = settings['dict_obs']
+        except:self.dict_obs = False
+        self.POSSIBLE_STATES = self.env.n_items + 1
+        self.env.cost_to_reward = True
+        self.epsilon = 0
+        BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        # Use the logs file in the root path of the main.
+        self.LOG_DIR = os.path.join(BASE_DIR,'logs')
+        if self.parallelization:
+            # For cpu parallelization in StableBaseline learning
+            def make_env(seed):
+                def _init():
+                    env = self.env
+                    env = Monitor(
+                        env,
+                        os.path.join(f'{self.LOG_DIR}','monitor',f'{self.model_name}_{self.experiment_name}_{seed}_{self.run}'),
+                        allow_early_resets=True
+                    )
+                    return env
+                return _init
+            num_cpu = 5
+            env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
+        else:
+            env = Monitor(
+                self.env,
+                os.path.join(f'{self.LOG_DIR}','monitor',f'{self.model_name}_{self.experiment_name}_{self.run}')
+            )
+        self.eval_callback = EvalCallback(
+            env,
+            best_model_save_path=os.path.join(f'{self.LOG_DIR}',f'best_{self.model_name}_{self.experiment_name}_{self.run}'),
+            log_path=f'{self.LOG_DIR}/',
+            eval_freq=100,
+            deterministic=True,
+            verbose=0,
+            render=False
+        )
+        if self.dict_obs:
+            policy = 'MultiInputPolicy'
+        else:
+            policy = 'MlpPolicy'
+        if self.model_name == 'PPO':
+            self.model = PPO(
+                policy,
+                env,verbose = 0, batch_size = 256, n_steps = 256, gamma = 0.96, gae_lambda = 0.9, n_epochs = 20, ent_coef = 0.0, max_grad_norm = 0.5, vf_coef = 0.5, learning_rate = 5e-3, use_sde = False, clip_range = 0.4, policy_kwargs = dict(log_std_init=-2,ortho_init=False,activation_fn=torch.nn.ReLU,net_arch=[dict(pi=[300, 300], vf=[300, 300])])
+            )
+        elif self.model_name == 'A2C':
+            self.model = A2C(
+                policy,
+                env,verbose = 0, learning_rate=0.002, n_steps=100, gamma = 0.95, vf_coef = 0.7,policy_kwargs= dict(net_arch=[300, 300]), seed = None
+            )
+        elif self.model_name == 'DQN':
+            self.model = DQN(
+                policy,
+                env, verbose = 0, learning_rate= 2.3e-3, buffer_size=100000, learning_starts=1000, batch_size=32, tau=1.0, gamma=0.99,target_update_interval=10,train_freq= 256,gradient_steps= 128, exploration_fraction=0.16, exploration_initial_eps=0.04, policy_kwargs= dict(net_arch=[300, 300]), seed = None
+            )
+        elif self.model_name == 'SAC':
+            self.model = SAC(
+                policy,
+                env, verbose = 0,  learning_rate=0.0003, buffer_size=1000000, learning_starts=1000, batch_size=256, tau=0.005, gamma=0.99, train_freq=1, gradient_steps=1,seed = None,action_noise=None, replay_buffer_class=None, replay_buffer_kwargs=None, optimize_memory_usage=False, ent_coef='auto', target_update_interval=1, target_entropy='auto', use_sde=False, sde_sample_freq=-1, use_sde_at_warmup=False, tensorboard_log=None, create_eval_env=False, policy_kwargs=dict(activation_fn=torch.nn.ReLU,net_arch=[dict(pi=[300, 300], vf=[300, 300])])
+            )
+        elif self.model_name == 'DDPG':
+            self.model = DDPG(
+                policy,
+                env, verbose = 0,  learning_rate=0.0003, buffer_size=1000000, learning_starts=1000, batch_size=256
+            )
+    def get_action(self, obs):
+        obs['last_inventory_level'] = copy.copy(self.last_inventory)
+        if isinstance(obs, dict):
+            if self.dict_obs:
+                act = self.model.predict(obs,deterministic=True)[0]
+            else:
+                list_obs = []
+                for item in obs:
+                    list_obs.append(obs[item])
+                obs_ = np.array(np.concatenate(list_obs))
+                act = self.model.predict(obs_,deterministic=True)[0]
+        else:
+            if self.dict_obs:
+                raise('Change the policy to dictionary observations')
+            else:
+                act = self.model.predict(obs,deterministic=True)[0]
+        self.last_inventory = copy.copy(obs['inventory_level'])
+        return act
+    def learn(self, epochs=1000):
+        print(f"{self.model_name} learning...")
+        start_time = time.time()
+        # We define the EvalCallback wrapper to save the best model
+        # Here the model learns using the provided environment in the Stable baseline Agent definition
+        # We mutiply the number of epochs by the number of time periods to give the number of training steps
+        self.model.learn(
+            epochs*self.env.T,
+            callback=self.eval_callback,
+            # tb_log_name='PPO'
+        )
+        self.env.close()
+        time_duration = time.time() - start_time
+        print(f"Finished Learning {time_duration:.2f} s")
+    def load_agent(self, path):
+        if self.model_name == 'PPO':
+            self.model = PPO.load(path)
+        elif self.model_name == 'A2C':
+            self.model = A2C.load(path)
+        elif self.model_name == 'DQN':
+            self.model = DQN.load(path)
+        elif self.model_name == 'SAC':
+            self.model = SAC.load(path)
+        elif self.model_name == 'DDPG':
+            self.model = SAC.load(path)
+    def plot_policy(self, seed=1):
+        # ONLY WORKING FOR 2 ITEMS 1 MACHINE
+        cmap = plt.cm.get_cmap('viridis', 3)
+        policy_map = np.zeros((self.env.max_inventory_level[0]+1,self.env.max_inventory_level[1]+1,self.env.n_items+1))
+        for i in range(self.env.max_inventory_level[0]+1):
+            for j in range(self.env.max_inventory_level[1]+1):
+                for k in range(self.env.n_items+1):
+                    obs = np.expand_dims(np.array([i,j,k]), axis = 0)
+                    try: action = self.model.predict(obs,deterministic=True)[0][0][0]
+                    except: action = self.model.predict(obs,deterministic=True)[0][0]
+                    #print(f'action: {action} | obs: {obs}')
+                    policy_map[i,j,k] = action
+        self.policy = policy_map
+        fig, axs = plt.subplots(1, self.POSSIBLE_STATES)
+        fig.suptitle('Found Policy')
+        for i, ax in enumerate(axs):
+            ax.set_title(f'Setup {i}')
+            im = ax.pcolormesh(
+                self.policy[:,:,i], cmap = cmap, edgecolors='k', linewidth=2
+            )
+            im.set_clim(0, self.POSSIBLE_STATES - 1)
+            ax.set_xlabel('I2')
+            if i == 0:
+                ax.set_ylabel('I1')
+        # COLOR BAR:
+        bound = [0,1,2]
+        # Creating 8 Patch instances
+        fig.subplots_adjust(bottom=0.2)
+        ax.legend(
+            [mpatches.Patch(color=cmap(b)) for b in bound],
+            ['{}'.format(i) for i in range(3)],
+            loc='upper center', bbox_to_anchor=(-0.8,-0.13),
+            fancybox=True, shadow=True, ncol=3
+        )
+        fig.savefig(os.path.join(f'results', f'policy_function_{self.model_name}_{self.experiment_name}_{seed}.pdf'), bbox_inches='tight')
+        plt.close()
+    def plot_value_function(self, seed):
+        # ONLY WORKING FOR 2 ITEMS 1 MACHINE
+        value_map = np.zeros((self.env.max_inventory_level[0]+1,self.env.max_inventory_level[1]+1,self.env.n_items+1))
+        for i in range(self.env.max_inventory_level[0]+1):
+            for j in range(self.env.max_inventory_level[1]+1):
+                for k in range(self.env.n_items+1):
+                    value_list = []
+                    for action in range(self.env.n_items+1):
+                        obs = np.expand_dims(np.array([j,i,k]), axis = 0)
+                        action = np.array([[action]])
+                        if torch.cuda.is_available():
+                            obs = torch.from_numpy(obs).to(torch.float).to(device="cuda")
+                            action = torch.from_numpy(action).to(torch.float).to(device="cuda")
+                        else:
+                            obs = torch.from_numpy(obs).to(torch.float)
+                            action = torch.from_numpy(action).to(torch.float)
+                        try:
+                            value,prob,dist_entropy = self.model.policy.evaluate_actions(obs,action)
+                            value_list.append(value.item())
+                        except:
+                            value = self.model.policy.q_net(obs)[0][int(action.item())]
+                            value_list.append(value.item())
+                    value_map[j,i,k] = np.array(value_list).mean()
+        self.value_function = value_map
+        # Plotting:
+        fig, axs = plt.subplots(nrows=1, ncols=self.POSSIBLE_STATES)
+        fig.suptitle('Value Function')
+        for i, ax in enumerate(axs):
+            ax.set_title(f'Setup {i}')
+            im = ax.imshow(
+                -self.value_function[:,:,i],
+                aspect='auto', cmap='viridis'
+            )
+            if i == 0:
+                ax.set_ylabel('I1')
+            ax.set_xlabel('I2')
+            ax.invert_yaxis()
+        fig.subplots_adjust(right=0.85)
+        cbar_ax = fig.add_axes([0.88, 0.15, 0.04, 0.7])
+        fig.colorbar(im, cax=cbar_ax)
+        fig.savefig(os.path.join('results',f'value_function_{self.model_name}_{self.experiment_name}_{self.run}_{seed}.pdf'))
+        plt.close()

code/Lot-sizing/envs/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (337 Bytes). View file

code/Lot-sizing/envs/__pycache__/simplePlant.cpython-38.pyc ADDED Viewed

Binary file (6.66 kB). View file

code/Lot-sizing/envs/__pycache__/singleSequenceDependentMachinePlant.cpython-38.pyc ADDED Viewed

Binary file (2.66 kB). View file

code/Lot-sizing/experiments.py CHANGED Viewed

@@ -12,15 +12,16 @@ sys.path.append(AGENTS_DIR)
 from agents.PPO import PPO
 from agents.PDPPO import PDPPO
 from agents.PDPPOAgent import PDPPOAgent
 from agents.PPOAgent import PPOAgent
 import numpy as np
-from envs import *
-from agents import *
-from agents import StochasticProgrammingAgent, AdpAgentHD3
-from agents import StableBaselineAgent, MultiAgentRL, EnsembleAgent, PerfectInfoAgent,PSOagent,AdpAgentHD, PPOAgent
-from test_functions import *
 from scenarioManager.stochasticDemandModel import StochasticDemandModel
@@ -29,13 +30,11 @@ from scenarioManager.stochasticDemandModel import StochasticDemandModel
 if __name__ == '__main__':
     experiments = ['15items_5machines_i100','20items_10machines','25items_10machines']
     for experiment_name in experiments:
-        for i in range(0,5):
             # Setting the seeds
             np.random.seed(1)
             random.seed(10)
             # Environment setup load:
-            # experiment_name = '15items_5machines_i100' # we set the experiment using the available files in cfg
-            # experiment_name = '25items_10machines' # we set the experiment using the available files in cfg
             file_path = os.path.abspath(f"./cfg_env/setting_{experiment_name}.json")
             fp = open(file_path, 'r')
             settings = json.load(fp)
@@ -57,37 +56,37 @@ if __name__ == '__main__':
             # Parameters for the ADPHS:
             setting_sol_method['regressor_name'] = 'plain_matrix_I2xM1'
             setting_sol_method['discount_rate'] = 0.9
             agents = []
             # Parameters for the RL:
-            training_epochs_RL = 30000
-            training_epochs_multiagent = 2000
-            setting_sol_method['parallelization'] = False
             env = SimplePlant(settings, stoch_model)
             # Number of test executions (number of complete environment iterations)
             nreps = 100
             ###########################################################################
-            # #PPO
             ###########################################################################
-            # base_model_name = 'PPO'
-            # ppo_agent = PPOAgent(
-            #     env,
-            #     setting_sol_method
-            # )
-            # ppo_agent.learn(n_episodes=training_epochs_RL*settings['time_horizon'] ) # Each ep with 200 steps
-            # #load best agent before appending in the test list
-            # BEST_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath('__file__')),'logs',f'best_{base_model_name}_{experiment_name}','best_model')
-            # ppo_agent.load_agent(BEST_MODEL_DIR) # For training purposes
-            # agents.append(("PPO", ppo_agent))
             ###########################################################################
-            # Post-decision PPO
             ###########################################################################
             base_model_name = 'PDPPO'
@@ -101,7 +100,60 @@ if __name__ == '__main__':
             BEST_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath('__file__')),'logs',f'best_{base_model_name}_{experiment_name}','best_model')
             pdppo_agent.load_agent(BEST_MODEL_DIR) # For training purposes
             agents.append(("PDPPO", pdppo_agent))
             ###########################################################################
             #TESTING

 from agents.PPO import PPO
 from agents.PDPPO import PDPPO
+from agents.PDPPOAgent_one_critic import PDPPOAgent_one_critic
+from agents.PPOAgent_two_critics import PPOAgent_two_critics
 from agents.PDPPOAgent import PDPPOAgent
 from agents.PPOAgent import PPOAgent
+from agents.stableBaselineAgents import StableBaselineAgent
+from test_functions import test_agents
 import numpy as np
+from envs import SimplePlant
 from scenarioManager.stochasticDemandModel import StochasticDemandModel
 if __name__ == '__main__':
     experiments = ['15items_5machines_i100','20items_10machines','25items_10machines']
     for experiment_name in experiments:
+        for i in range(0,10):
             # Setting the seeds
             np.random.seed(1)
             random.seed(10)
             # Environment setup load:
             file_path = os.path.abspath(f"./cfg_env/setting_{experiment_name}.json")
             fp = open(file_path, 'r')
             settings = json.load(fp)
             # Parameters for the ADPHS:
             setting_sol_method['regressor_name'] = 'plain_matrix_I2xM1'
             setting_sol_method['discount_rate'] = 0.9
+            setting_sol_method['multiagent'] = False
+            setting_sol_method['parallelization'] = True
+            setting_sol_method['run'] = i
             agents = []
             # Parameters for the RL:
+            training_epochs_RL = 5000 # 30000
             env = SimplePlant(settings, stoch_model)
             # Number of test executions (number of complete environment iterations)
             nreps = 100
             ###########################################################################
+            # PPO
             ###########################################################################
+            base_model_name = 'PPO'
+            ppo_agent = PPOAgent(
+                env,
+                setting_sol_method
+            )
+            ppo_agent.learn(n_episodes=training_epochs_RL*settings['time_horizon'] ) # Each ep with 200 steps
+            #load best agent before appending in the test list
+            BEST_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath('__file__')),'logs',f'best_{base_model_name}_{experiment_name}','best_model')
+            ppo_agent.load_agent(BEST_MODEL_DIR) # For training purposes
+            agents.append(("PPO", ppo_agent))
             ###########################################################################
+            # Post-decision PPO - Dual critic
             ###########################################################################
             base_model_name = 'PDPPO'
             BEST_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath('__file__')),'logs',f'best_{base_model_name}_{experiment_name}','best_model')
             pdppo_agent.load_agent(BEST_MODEL_DIR) # For training purposes
             agents.append(("PDPPO", pdppo_agent))
+            ###########################################################################
+            # Post-decision PPO - Dual critic
+            ###########################################################################
+            base_model_name = 'PDPPO_one_critic'
+            pdppo_agent_one_critic = PDPPOAgent(
+                env,
+                setting_sol_method
+            )
+            pdppo_agent_one_critic.learn(n_episodes=training_epochs_RL*settings['time_horizon'] ) # Each ep with 200 steps
+            #load best agent before appending in the test list
+            BEST_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath('__file__')),'logs',f'best_{base_model_name}_{experiment_name}','best_model')
+            pdppo_agent_one_critic.load_agent(BEST_MODEL_DIR) # For training purposes
+            agents.append(("PDPPO", pdppo_agent_one_critic))
+            ###########################################################################
+            # Post-decision PPO - Dual critic
+            ###########################################################################
+            base_model_name = 'PPO_two_critics'
+            ppo_agent_two_critics = PDPPOAgent(
+                env,
+                setting_sol_method
+            )
+            ppo_agent_two_critics.learn(n_episodes=training_epochs_RL*settings['time_horizon'] ) # Each ep with 200 steps
+            #load best agent before appending in the test list
+            BEST_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath('__file__')),'logs',f'best_{base_model_name}_{experiment_name}','best_model')
+            ppo_agent_two_critics.load_agent(BEST_MODEL_DIR) # For training purposes
+            agents.append(("PDPPO", ppo_agent_two_critics))
+            ###########################################################################
+            # RL A2C
+            ###########################################################################
+            # base_model_name = 'A2C'
+            # env = SimplePlant(settings, stoch_model)
+            # setting_sol_method['model_name'] = base_model_name
+            # rl_agent = StableBaselineAgent(
+            #     env,
+            #     setting_sol_method
+            # )
+            # rl_agent.learn(epochs=training_epochs_RL) # Each ep with 200 steps
+            # #load best agent before appending in the test list
+            # BEST_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath('__file__')),'logs',f'best_{base_model_name}_{experiment_name}','best_model')
+            # rl_agent.load_agent(BEST_MODEL_DIR)
+            # agents.append(("A2C", rl_agent))
             ###########################################################################
             #TESTING

code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_0_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:48a42ab9cb3c0ecd6aee7da1dbd709f9ce11e1f7855dd8b4ca263ed02aa4f106
+size 279139

code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_log_0.csv ADDED Viewed

	@@ -0,0 +1,126 @@

+episode,timestep,reward
+39,4000,-2141.4023
+79,8000,-2186.1688
+119,12000,-2176.6288
+159,16000,-2191.1093
+199,20000,-2189.4693
+239,24000,-2187.8145
+279,28000,-2196.4592
+319,32000,-2186.1425
+359,36000,-2170.4952
+399,40000,-2184.8005
+439,44000,-2156.964
+479,48000,-2159.1175
+519,52000,-2177.2202
+559,56000,-2184.189
+599,60000,-2173.9785
+639,64000,-2185.696
+679,68000,-2187.422
+719,72000,-2188.3395
+759,76000,-2200.6052
+799,80000,-2206.662
+839,84000,-2198.9285
+879,88000,-2212.6385
+919,92000,-2214.2012
+959,96000,-2210.518
+999,100000,-2212.269
+1039,104000,-2202.792
+1079,108000,-2230.6483
+1119,112000,-2230.9285
+1159,116000,-2234.553
+1199,120000,-2231.0472
+1239,124000,-2205.4692
+1279,128000,-2224.4608
+1319,132000,-2222.5775
+1359,136000,-2220.6603
+1399,140000,-2217.5998
+1439,144000,-2206.9042
+1479,148000,-2219.398
+1519,152000,-2219.977
+1559,156000,-2208.6932
+1599,160000,-2199.11
+1639,164000,-2216.3345
+1679,168000,-2195.7275
+1719,172000,-2208.9112
+1759,176000,-2196.7148
+1799,180000,-2179.6018
+1839,184000,-2185.97
+1879,188000,-2190.0938
+1919,192000,-2182.9055
+1959,196000,-2212.575
+1999,200000,-2217.4317
+2039,204000,-2207.0677
+2079,208000,-2201.1058
+2119,212000,-2210.03
+2159,216000,-2196.9463
+2199,220000,-2186.4732
+2239,224000,-2196.7148
+2279,228000,-2180.514
+2319,232000,-2182.2113
+2359,236000,-2177.5078
+2399,240000,-2177.2448
+2439,244000,-2176.5475
+2479,248000,-2176.4643
+2519,252000,-2166.5628
+2559,256000,-2181.9908
+2599,260000,-2201.7215
+2639,264000,-2189.474
+2679,268000,-2170.9755
+2719,272000,-2169.075
+2759,276000,-2183.726
+2799,280000,-2165.4742
+2839,284000,-2187.9715
+2879,288000,-2179.0172
+2919,292000,-2161.0182
+2959,296000,-2168.9047
+2999,300000,-2165.532
+3039,304000,-2168.5285
+3079,308000,-2159.3415
+3119,312000,-2168.1608
+3159,316000,-2177.5103
+3199,320000,-2185.0758
+3239,324000,-2176.5248
+3279,328000,-2173.1685
+3319,332000,-2190.4757
+3359,336000,-2219.6503
+3399,340000,-2207.4892
+3439,344000,-2199.8988
+3479,348000,-2211.5325
+3519,352000,-2201.5668
+3559,356000,-2202.0843
+3599,360000,-2196.885
+3639,364000,-2199.742
+3679,368000,-2219.324
+3719,372000,-2224.5802
+3759,376000,-2213.3832
+3799,380000,-2191.889
+3839,384000,-2220.2653
+3879,388000,-2206.6353
+3919,392000,-2193.6993
+3959,396000,-2173.2148
+3999,400000,-2168.8942
+4039,404000,-2182.1583
+4079,408000,-2170.8605
+4119,412000,-2179.9363
+4159,416000,-2177.2738
+4199,420000,-2186.921
+4239,424000,-2176.2058
+4279,428000,-2178.0973
+4319,432000,-2179.0505
+4359,436000,-2183.782
+4399,440000,-2189.763
+4439,444000,-2191.9625
+4479,448000,-2190.0078
+4519,452000,-2208.7985
+4559,456000,-2196.431
+4599,460000,-2204.601
+4639,464000,-2198.331
+4679,468000,-2197.519
+4719,472000,-2195.665
+4759,476000,-2179.7755
+4799,480000,-2201.7112
+4839,484000,-2187.7942
+4879,488000,-2177.4918
+4919,492000,-2188.3555
+4959,496000,-2178.8962
+4999,500000,-2193.1755

code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_log_2.csv ADDED Viewed

	@@ -0,0 +1,126 @@

+episode,timestep,reward
+39,4000,-2335.3474
+79,8000,-2244.5998
+119,12000,-2238.709
+159,16000,-2290.7835
+199,20000,-2266.6182
+239,24000,-2236.9325
+279,28000,-2260.8245
+319,32000,-2270.0922
+359,36000,-2267.02
+399,40000,-2260.1368
+439,44000,-2246.2922
+479,48000,-2251.6645
+519,52000,-2270.2068
+559,56000,-2271.094
+599,60000,-2270.1955
+639,64000,-2251.9755
+679,68000,-2283.5295
+719,72000,-2265.69
+759,76000,-2307.0708
+799,80000,-2293.882
+839,84000,-2290.8482
+879,88000,-2277.416
+919,92000,-2250.59
+959,96000,-2234.1988
+999,100000,-2249.0185
+1039,104000,-2247.6215
+1079,108000,-2216.7135
+1119,112000,-2213.6995
+1159,116000,-2224.0747
+1199,120000,-2228.468
+1239,124000,-2255.0583
+1279,128000,-2226.098
+1319,132000,-2217.657
+1359,136000,-2243.1698
+1399,140000,-2232.1338
+1439,144000,-2250.4618
+1479,148000,-2235.0085
+1519,152000,-2249.4723
+1559,156000,-2216.2995
+1599,160000,-2233.6805
+1639,164000,-2247.4035
+1679,168000,-2229.968
+1719,172000,-2218.4828
+1759,176000,-2223.798
+1799,180000,-2213.273
+1839,184000,-2219.179
+1879,188000,-2205.1017
+1919,192000,-2207.6708
+1959,196000,-2200.982
+1999,200000,-2218.6955
+2039,204000,-2200.056
+2079,208000,-2218.9955
+2119,212000,-2214.628
+2159,216000,-2230.9135
+2199,220000,-2212.2112
+2239,224000,-2228.0432
+2279,228000,-2228.0378
+2319,232000,-2218.216
+2359,236000,-2237.9682
+2399,240000,-2218.8503
+2439,244000,-2201.6265
+2479,248000,-2216.5263
+2519,252000,-2209.0173
+2559,256000,-2210.7017
+2599,260000,-2192.9838
+2639,264000,-2206.9902
+2679,268000,-2196.276
+2719,272000,-2187.5165
+2759,276000,-2201.5815
+2799,280000,-2197.6468
+2839,284000,-2181.081
+2879,288000,-2191.911
+2919,292000,-2210.5108
+2959,296000,-2191.4668
+2999,300000,-2207.3622
+3039,304000,-2188.681
+3079,308000,-2205.789
+3119,312000,-2189.5567
+3159,316000,-2171.3155
+3199,320000,-2170.6315
+3239,324000,-2170.7322
+3279,328000,-2174.4193
+3319,332000,-2175.8538
+3359,336000,-2154.2035
+3399,340000,-2185.9618
+3439,344000,-2178.553
+3479,348000,-2170.287
+3519,352000,-2159.5517
+3559,356000,-2172.067
+3599,360000,-2159.2972
+3639,364000,-2177.8195
+3679,368000,-2156.6698
+3719,372000,-2168.946
+3759,376000,-2182.2233
+3799,380000,-2170.65
+3839,384000,-2158.5868
+3879,388000,-2162.828
+3919,392000,-2148.9192
+3959,396000,-2152.153
+3999,400000,-2169.9372
+4039,404000,-2169.7798
+4079,408000,-2162.5945
+4119,412000,-2148.3235
+4159,416000,-2157.2015
+4199,420000,-2171.1243
+4239,424000,-2154.7868
+4279,428000,-2164.997
+4319,432000,-2162.2733
+4359,436000,-2167.9713
+4399,440000,-2163.9672
+4439,444000,-2152.2753
+4479,448000,-2149.6665
+4519,452000,-2160.5565
+4559,456000,-2157.0198
+4599,460000,-2158.6238
+4639,464000,-2153.1465
+4679,468000,-2161.9365
+4719,472000,-2147.464
+4759,476000,-2157.8608
+4799,480000,-2163.0485
+4839,484000,-2170.2235
+4879,488000,-2165.6525
+4919,492000,-2161.917
+4959,496000,-2157.1193
+4999,500000,-2146.3092

code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_log_3.csv ADDED Viewed

	@@ -0,0 +1,126 @@

+episode,timestep,reward
+39,4000,-2240.3108
+79,8000,-2288.825
+119,12000,-2320.0678
+159,16000,-2266.3545
+199,20000,-2254.704
+239,24000,-2284.7492
+279,28000,-2275.9468
+319,32000,-2268.6098
+359,36000,-2288.8185
+399,40000,-2272.3042
+439,44000,-2261.6483
+479,48000,-2266.3038
+519,52000,-2270.6518
+559,56000,-2244.104
+599,60000,-2245.5072
+639,64000,-2238.031
+679,68000,-2220.696
+719,72000,-2220.8338
+759,76000,-2215.0553
+799,80000,-2206.708
+839,84000,-2227.824
+879,88000,-2214.947
+919,92000,-2217.5067
+959,96000,-2224.1628
+999,100000,-2221.2443
+1039,104000,-2244.4968
+1079,108000,-2239.8208
+1119,112000,-2232.748
+1159,116000,-2223.8978
+1199,120000,-2218.0928
+1239,124000,-2203.4095
+1279,128000,-2204.5672
+1319,132000,-2238.5695
+1359,136000,-2203.7973
+1399,140000,-2217.6258
+1439,144000,-2213.7642
+1479,148000,-2207.4387
+1519,152000,-2215.8908
+1559,156000,-2191.1362
+1599,160000,-2224.398
+1639,164000,-2201.3452
+1679,168000,-2189.2553
+1719,172000,-2209.868
+1759,176000,-2169.1652
+1799,180000,-2191.6032
+1839,184000,-2192.4662
+1879,188000,-2173.139
+1919,192000,-2180.778
+1959,196000,-2193.01
+1999,200000,-2196.909
+2039,204000,-2203.634
+2079,208000,-2203.3062
+2119,212000,-2205.7118
+2159,216000,-2221.6275
+2199,220000,-2207.0085
+2239,224000,-2205.649
+2279,228000,-2229.8532
+2319,232000,-2198.7525
+2359,236000,-2180.7215
+2399,240000,-2173.0688
+2439,244000,-2191.3938
+2479,248000,-2194.5465
+2519,252000,-2200.4895
+2559,256000,-2215.1643
+2599,260000,-2196.0888
+2639,264000,-2205.88
+2679,268000,-2186.5843
+2719,272000,-2189.5945
+2759,276000,-2177.4112
+2799,280000,-2185.7688
+2839,284000,-2180.7005
+2879,288000,-2194.3678
+2919,292000,-2183.5812
+2959,296000,-2188.0495
+2999,300000,-2185.9692
+3039,304000,-2178.563
+3079,308000,-2184.8002
+3119,312000,-2210.264
+3159,316000,-2190.2037
+3199,320000,-2198.2853
+3239,324000,-2206.658
+3279,328000,-2197.803
+3319,332000,-2206.5752
+3359,336000,-2210.574
+3399,340000,-2207.2495
+3439,344000,-2222.5217
+3479,348000,-2208.8218
+3519,352000,-2214.9137
+3559,356000,-2223.4288
+3599,360000,-2226.1332
+3639,364000,-2227.895
+3679,368000,-2213.1972
+3719,372000,-2217.1715
+3759,376000,-2229.5115
+3799,380000,-2232.2263
+3839,384000,-2250.712
+3879,388000,-2237.0413
+3919,392000,-2237.8288
+3959,396000,-2242.2087
+3999,400000,-2242.8518
+4039,404000,-2242.582
+4079,408000,-2247.5048
+4119,412000,-2219.5345
+4159,416000,-2219.813
+4199,420000,-2206.089
+4239,424000,-2229.2065
+4279,428000,-2232.5973
+4319,432000,-2220.915
+4359,436000,-2213.3003
+4399,440000,-2225.92
+4439,444000,-2229.2655
+4479,448000,-2223.2977
+4519,452000,-2222.3368
+4559,456000,-2217.945
+4599,460000,-2209.8247
+4639,464000,-2203.7908
+4679,468000,-2222.4963
+4719,472000,-2213.5595
+4759,476000,-2207.0573
+4799,480000,-2224.0718
+4839,484000,-2192.7728
+4879,488000,-2211.8895
+4919,492000,-2209.2267
+4959,496000,-2208.4648
+4999,500000,-2238.9572

code/Lot-sizing/logs/15items_5machines_i100_PPO/PPO_15items_5machines_i100_0_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34225abaea5568390f9cbf0a8d2382fa6d67e11addda79738a756604f1550ac6
+size 199811

code/Lot-sizing/logs/15items_5machines_i100_PPO/PPO_15items_5machines_i100_log_0.csv ADDED Viewed

	@@ -0,0 +1,251 @@

+episode,timestep,reward
+19,2000,-2135.3463
+39,4000,-2149.332
+59,6000,-2116.58
+79,8000,-2147.7215
+99,10000,-2145.4525
+119,12000,-2129.431
+139,14000,-2166.079
+159,16000,-2163.6345
+179,18000,-2139.9555
+199,20000,-2163.143
+219,22000,-2153.983
+239,24000,-2155.218
+259,26000,-2188.591
+279,28000,-2189.6865
+299,30000,-2163.1825
+319,32000,-2164.4245
+339,34000,-2179.1755
+359,36000,-2196.876
+379,38000,-2174.789
+399,40000,-2177.7325
+419,42000,-2171.5065
+439,44000,-2180.978
+459,46000,-2177.033
+479,48000,-2166.894
+499,50000,-2174.373
+519,52000,-2160.9975
+539,54000,-2139.3125
+559,56000,-2153.796
+579,58000,-2160.5355
+599,60000,-2147.8125
+619,62000,-2160.4915
+639,64000,-2156.354
+659,66000,-2120.872
+679,68000,-2144.24
+699,70000,-2132.259
+719,72000,-2161.746
+739,74000,-2157.3845
+759,76000,-2152.7245
+779,78000,-2172.0235
+799,80000,-2142.163
+819,82000,-2139.4385
+839,84000,-2144.8855
+859,86000,-2168.6705
+879,88000,-2151.953
+899,90000,-2163.172
+919,92000,-2146.0855
+939,94000,-2164.2995
+959,96000,-2136.362
+979,98000,-2144.5915
+999,100000,-2140.8605
+1019,102000,-2145.3225
+1039,104000,-2158.671
+1059,106000,-2143.01
+1079,108000,-2139.721
+1099,110000,-2116.822
+1119,112000,-2114.3315
+1139,114000,-2124.026
+1159,116000,-2142.8155
+1179,118000,-2147.8685
+1199,120000,-2143.8945
+1219,122000,-2146.832
+1239,124000,-2124.1695
+1259,126000,-2090.169
+1279,128000,-2146.8415
+1299,130000,-2121.292
+1319,132000,-2121.0195
+1339,134000,-2123.3185
+1359,136000,-2140.0235
+1379,138000,-2099.58
+1399,140000,-2110.5595
+1419,142000,-2113.717
+1439,144000,-2115.8905
+1459,146000,-2095.1055
+1479,148000,-2106.1685
+1499,150000,-2109.4955
+1519,152000,-2111.4375
+1539,154000,-2106.307
+1559,156000,-2130.6555
+1579,158000,-2136.0985
+1599,160000,-2121.9925
+1619,162000,-2109.5255
+1639,164000,-2128.574
+1659,166000,-2124.366
+1679,168000,-2139.8685
+1699,170000,-2116.211
+1719,172000,-2126.852
+1739,174000,-2117.076
+1759,176000,-2135.6755
+1779,178000,-2117.5595
+1799,180000,-2131.8435
+1819,182000,-2141.5565
+1839,184000,-2150.929
+1859,186000,-2139.8145
+1879,188000,-2129.5425
+1899,190000,-2126.8315
+1919,192000,-2133.958
+1939,194000,-2141.4045
+1959,196000,-2133.663
+1979,198000,-2141.9005
+1999,200000,-2148.833
+2019,202000,-2131.9035
+2039,204000,-2174.289
+2059,206000,-2160.0245
+2079,208000,-2165.5785
+2099,210000,-2147.701
+2119,212000,-2154.0045
+2139,214000,-2124.077
+2159,216000,-2166.3835
+2179,218000,-2168.514
+2199,220000,-2152.2125
+2219,222000,-2162.136
+2239,224000,-2154.616
+2259,226000,-2148.246
+2279,228000,-2146.5235
+2299,230000,-2143.4965
+2319,232000,-2133.6155
+2339,234000,-2144.0695
+2359,236000,-2139.9
+2379,238000,-2137.7725
+2399,240000,-2151.268
+2419,242000,-2175.9375
+2439,244000,-2157.845
+2459,246000,-2153.9685
+2479,248000,-2175.319
+2499,250000,-2140.522
+2519,252000,-2154.707
+2539,254000,-2133.141
+2559,256000,-2122.6885
+2579,258000,-2136.63
+2599,260000,-2141.906
+2619,262000,-2136.6815
+2639,264000,-2109.2965
+2659,266000,-2122.899
+2679,268000,-2149.3255
+2699,270000,-2118.7445
+2719,272000,-2131.16
+2739,274000,-2119.327
+2759,276000,-2127.0115
+2779,278000,-2165.839
+2799,280000,-2163.743
+2819,282000,-2135.451
+2839,284000,-2144.345
+2859,286000,-2129.195
+2879,288000,-2143.4665
+2899,290000,-2130.941
+2919,292000,-2147.0725
+2939,294000,-2125.8355
+2959,296000,-2126.066
+2979,298000,-2146.799
+2999,300000,-2147.949
+3019,302000,-2100.444
+3039,304000,-2116.093
+3059,306000,-2122.09
+3079,308000,-2136.446
+3099,310000,-2106.498
+3119,312000,-2101.977
+3139,314000,-2102.1295
+3159,316000,-2092.621
+3179,318000,-2112.0175
+3199,320000,-2102.532
+3219,322000,-2100.1165
+3239,324000,-2108.5405
+3259,326000,-2117.316
+3279,328000,-2113.263
+3299,330000,-2095.814
+3319,332000,-2097.3245
+3339,334000,-2091.1245
+3359,336000,-2112.114
+3379,338000,-2107.756
+3399,340000,-2105.6305
+3419,342000,-2106.4435
+3439,344000,-2093.697
+3459,346000,-2101.936
+3479,348000,-2087.019
+3499,350000,-2094.8375
+3519,352000,-2091.358
+3539,354000,-2114.3615
+3559,356000,-2131.719
+3579,358000,-2116.838
+3599,360000,-2128.923
+3619,362000,-2104.5615
+3639,364000,-2109.625
+3659,366000,-2106.293
+3679,368000,-2124.0315
+3699,370000,-2116.146
+3719,372000,-2121.4415
+3739,374000,-2084.2695
+3759,376000,-2104.179
+3779,378000,-2111.046
+3799,380000,-2108.5605
+3819,382000,-2092.0465
+3839,384000,-2107.194
+3859,386000,-2095.3865
+3879,388000,-2082.453
+3899,390000,-2119.981
+3919,392000,-2104.4325
+3939,394000,-2100.127
+3959,396000,-2103.365
+3979,398000,-2108.799
+3999,400000,-2087.373
+4019,402000,-2089.962
+4039,404000,-2113.7635
+4059,406000,-2127.984
+4079,408000,-2087.538
+4099,410000,-2071.391
+4119,412000,-2103.1025
+4139,414000,-2092.2085
+4159,416000,-2088.2855
+4179,418000,-2094.342
+4199,420000,-2089.6075
+4219,422000,-2088.1145
+4239,424000,-2101.0985
+4259,426000,-2107.1365
+4279,428000,-2093.734
+4299,430000,-2090.7895
+4319,432000,-2079.56
+4339,434000,-2083.1335
+4359,436000,-2087.81
+4379,438000,-2096.6135
+4399,440000,-2089.9545
+4419,442000,-2074.709
+4439,444000,-2080.6065
+4459,446000,-2078.952
+4479,448000,-2059.433
+4499,450000,-2049.38
+4519,452000,-2065.312
+4539,454000,-2057.3825
+4559,456000,-2085.955
+4579,458000,-2092.071
+4599,460000,-2073.4495
+4619,462000,-2082.937
+4639,464000,-2077.3055
+4659,466000,-2078.2065
+4679,468000,-2062.653
+4699,470000,-2054.374
+4719,472000,-2074.2705
+4739,474000,-2066.9925
+4759,476000,-2049.2215
+4779,478000,-2071.545
+4799,480000,-2057.4975
+4819,482000,-2045.2775
+4839,484000,-2059.3195
+4859,486000,-2054.074
+4879,488000,-2069.4245
+4899,490000,-2069.116
+4919,492000,-2038.679
+4939,494000,-2068.0445
+4959,496000,-2039.354
+4979,498000,-2032.349
+4999,500000,-2026.1585

code/Lot-sizing/logs/15items_5machines_i100_PPO/PPO_15items_5machines_i100_log_2.csv ADDED Viewed

	@@ -0,0 +1,216 @@

+episode,timestep,reward
+19,2000,-2380.8889
+39,4000,-2373.828
+59,6000,-2335.17
+79,8000,-2358.4425
+99,10000,-2358.743
+119,12000,-2359.581
+139,14000,-2374.202
+159,16000,-2350.303
+179,18000,-2351.3965
+199,20000,-2369.7695
+219,22000,-2333.9435
+239,24000,-2349.8265
+259,26000,-2369.537
+279,28000,-2370.489
+299,30000,-2325.0625
+319,32000,-2364.1575
+339,34000,-2315.2425
+359,36000,-2347.791
+379,38000,-2333.213
+399,40000,-2345.041
+419,42000,-2324.087
+439,44000,-2327.946
+459,46000,-2290.026
+479,48000,-2297.604
+499,50000,-2318.3805
+519,52000,-2304.4445
+539,54000,-2278.6925
+559,56000,-2279.398
+579,58000,-2275.2595
+599,60000,-2292.0225
+619,62000,-2292.0715
+639,64000,-2304.709
+659,66000,-2277.029
+679,68000,-2258.1065
+699,70000,-2263.7255
+719,72000,-2265.5755
+739,74000,-2246.222
+759,76000,-2234.866
+779,78000,-2254.3855
+799,80000,-2239.978
+819,82000,-2205.3095
+839,84000,-2230.3575
+859,86000,-2225.6055
+879,88000,-2229.913
+899,90000,-2244.0605
+919,92000,-2229.112
+939,94000,-2233.0065
+959,96000,-2221.533
+979,98000,-2199.2975
+999,100000,-2218.0545
+1019,102000,-2219.6245
+1039,104000,-2212.3515
+1059,106000,-2228.366
+1079,108000,-2213.773
+1099,110000,-2214.438
+1119,112000,-2218.1
+1139,114000,-2195.482
+1159,116000,-2225.5825
+1179,118000,-2213.362
+1199,120000,-2211.5005
+1219,122000,-2208.066
+1239,124000,-2191.501
+1259,126000,-2235.9985
+1279,128000,-2211.5905
+1299,130000,-2202.716
+1319,132000,-2212.6015
+1339,134000,-2216.1535
+1359,136000,-2215.3695
+1379,138000,-2210.9315
+1399,140000,-2219.104
+1419,142000,-2223.478
+1439,144000,-2222.4635
+1459,146000,-2221.686
+1479,148000,-2211.6465
+1499,150000,-2208.096
+1519,152000,-2209.976
+1539,154000,-2199.5775
+1559,156000,-2213.538
+1579,158000,-2196.544
+1599,160000,-2191.9365
+1619,162000,-2202.8655
+1639,164000,-2195.785
+1659,166000,-2197.826
+1679,168000,-2198.4345
+1699,170000,-2192.2155
+1719,172000,-2183.3555
+1739,174000,-2215.12
+1759,176000,-2183.842
+1779,178000,-2185.168
+1799,180000,-2173.7945
+1819,182000,-2172.845
+1839,184000,-2176.132
+1859,186000,-2188.4535
+1879,188000,-2156.692
+1899,190000,-2169.1765
+1919,192000,-2150.046
+1939,194000,-2169.566
+1959,196000,-2159.7815
+1979,198000,-2167.8865
+1999,200000,-2188.2145
+2019,202000,-2145.591
+2039,204000,-2156.559
+2059,206000,-2164.4925
+2079,208000,-2162.0795
+2099,210000,-2157.1775
+2119,212000,-2145.232
+2139,214000,-2147.627
+2159,216000,-2154.1195
+2179,218000,-2155.8565
+2199,220000,-2134.7075
+2219,222000,-2127.8285
+2239,224000,-2168.0365
+2259,226000,-2142.8975
+2279,228000,-2140.589
+2299,230000,-2149.4825
+2319,232000,-2140.3645
+2339,234000,-2143.029
+2359,236000,-2126.4945
+2379,238000,-2135.033
+2399,240000,-2124.0925
+2419,242000,-2118.4725
+2439,244000,-2134.5425
+2459,246000,-2102.6725
+2479,248000,-2117.8615
+2499,250000,-2123.2265
+2519,252000,-2099.3605
+2539,254000,-2113.7965
+2559,256000,-2126.2285
+2579,258000,-2124.4015
+2599,260000,-2099.2445
+2619,262000,-2131.927
+2639,264000,-2128.738
+2659,266000,-2120.8685
+2679,268000,-2121.595
+2699,270000,-2140.079
+2719,272000,-2115.872
+2739,274000,-2105.3305
+2759,276000,-2133.0435
+2779,278000,-2117.732
+2799,280000,-2143.8175
+2819,282000,-2076.888
+2839,284000,-2106.048
+2859,286000,-2105.761
+2879,288000,-2102.9825
+2899,290000,-2118.6665
+2919,292000,-2122.7975
+2939,294000,-2121.764
+2959,296000,-2128.1515
+2979,298000,-2113.3235
+2999,300000,-2126.751
+3019,302000,-2111.186
+3039,304000,-2112.5405
+3059,306000,-2110.0095
+3079,308000,-2118.0815
+3099,310000,-2100.6005
+3119,312000,-2106.429
+3139,314000,-2092.304
+3159,316000,-2105.092
+3179,318000,-2085.4645
+3199,320000,-2107.1535
+3219,322000,-2107.04
+3239,324000,-2092.5935
+3259,326000,-2096.9715
+3279,328000,-2103.3905
+3299,330000,-2105.1935
+3319,332000,-2108.05
+3339,334000,-2100.4505
+3359,336000,-2087.976
+3379,338000,-2093.996
+3399,340000,-2103.395
+3419,342000,-2075.1395
+3439,344000,-2100.193
+3459,346000,-2097.6485
+3479,348000,-2103.601
+3499,350000,-2109.8605
+3519,352000,-2087.653
+3539,354000,-2126.2165
+3559,356000,-2117.9495
+3579,358000,-2112.5835
+3599,360000,-2117.1415
+3619,362000,-2108.5045
+3639,364000,-2103.0745
+3659,366000,-2111.068
+3679,368000,-2126.239
+3699,370000,-2104.904
+3719,372000,-2085.1685
+3739,374000,-2093.3945
+3759,376000,-2101.3165
+3779,378000,-2103.0655
+3799,380000,-2101.006
+3819,382000,-2103.158
+3839,384000,-2102.5225
+3859,386000,-2107.4555
+3879,388000,-2095.627
+3899,390000,-2114.5905
+3919,392000,-2112.6065
+3939,394000,-2103.3065
+3959,396000,-2111.277
+3979,398000,-2106.088
+3999,400000,-2106.7605
+4019,402000,-2084.668
+4039,404000,-2104.5425
+4059,406000,-2105.0865
+4079,408000,-2086.7805
+4099,410000,-2116.368
+4119,412000,-2100.076
+4139,414000,-2115.2785
+4159,416000,-2111.847
+4179,418000,-2075.2525
+4199,420000,-2089.003
+4219,422000,-2101.154
+4239,424000,-2099.7625
+4259,426000,-2118.5795
+4279,428000,-2108.951
+4299,430000,-2099.8935

code/Lot-sizing/logs/best_A2C_15items_5machines_i100_0/best_model.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43b750cb2d2aac2c7897b0e0f6d1495b4e6a4cd6a5dc5938b3359734024810bc
+size 1022438

code/Lot-sizing/logs/evaluations.npz ADDED Viewed

Binary file (40.2 kB). View file

code/Lot-sizing/models/__pycache__/__init__.cpython-38.pyc ADDED Viewed

Binary file (419 Bytes). View file

code/Lot-sizing/models/__pycache__/multistageOptimization.cpython-38.pyc ADDED Viewed

Binary file (6.56 kB). View file

code/Lot-sizing/models/__pycache__/optimizationProblemInstance.cpython-38.pyc ADDED Viewed

Binary file (877 Bytes). View file

code/Lot-sizing/models/__pycache__/perfectInfoOptimization.cpython-38.pyc ADDED Viewed

Binary file (5.42 kB). View file

code/Lot-sizing/results/PDPPO_15items_5machines_i100_actions_test.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e173a4ff1f1af7ec376b991a97d5d971a9358e0ee59e78d1dac0c5706dce0a3
+size 1200128

code/Lot-sizing/results/PDPPO_15items_5machines_i100_costs_test.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d4f6e5d4c9a9837667f1df2a98f3556b3b67f700011695efcd5503099bfd39b7
+size 136

code/Lot-sizing/results/PDPPO_15items_5machines_i100_demands_test.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c8427469f56af868c1a5fed54b3b235af670398cb091217c84c3d710f8c02ea5
+size 1800128

code/Lot-sizing/results/PDPPO_15items_5machines_i100_holding_costs_test.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:433d5d5321c151cfdc6fb420c90de1c55c6ba45888445f942fe3880c56b7d947
+size 240128

code/Lot-sizing/results/PDPPO_15items_5machines_i100_lost_sales_test.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85741d269fea615ac96e4de74211ce0aa144e1d69d471fcce17cc133c6c0a88b
+size 240128

code/Lot-sizing/results/PDPPO_15items_5machines_i100_observations_test.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:49242c880236d956378b24445e0d0c653d8af1f474e5fd61113e125ed244dfba
+size 744805

code/Lot-sizing/results/PDPPO_15items_5machines_i100_setup_costs_test.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da0680577feff47cc4071a0ac12f0ba5b4f6c3a1240e33a5cfb799916317c6f9
+size 240128

code/Lot-sizing/results/PPO_15items_5machines_i100_actions_test.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de66cf942bdc3f335699c6c75de45a952811465f0a0d19dfdb4121c2baa1314f
+size 400128

code/Lot-sizing/results/PPO_15items_5machines_i100_costs_test.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6056e23a628ef0b32f9b252ff401ac9aadb26324986af77c7e80e2d0bb201cc
+size 136