include modifications to test dual critic ppo
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- code/Lot-sizing/agents/PDPPO.py +0 -2
- code/Lot-sizing/agents/PDPPOAgent.py +1 -1
- code/Lot-sizing/agents/PDPPOAgent_one_critic.py +14 -14
- code/Lot-sizing/agents/PDPPO_v0.py +0 -328
- code/Lot-sizing/agents/{PDPPO one critic.py → PDPPOonecritic.py} +1 -1
- code/Lot-sizing/agents/PPO.py +1 -3
- code/Lot-sizing/agents/PPOAgent.py +1 -3
- code/Lot-sizing/agents/PPOAgent_two_critics.py +385 -0
- code/Lot-sizing/agents/{PDPPO_one_critic.py → PPOtwocritics.py} +37 -67
- code/Lot-sizing/agents/__init__.py +8 -27
- code/Lot-sizing/agents/__pycache__/PDPPO.cpython-38.pyc +0 -0
- code/Lot-sizing/agents/__pycache__/PDPPOAgent.cpython-38.pyc +0 -0
- code/Lot-sizing/agents/__pycache__/PDPPOAgent_one_critic.cpython-38.pyc +0 -0
- code/Lot-sizing/agents/__pycache__/PDPPO_one_critic.cpython-38.pyc +0 -0
- code/Lot-sizing/agents/__pycache__/PDPPOonecritic.cpython-38.pyc +0 -0
- code/Lot-sizing/agents/__pycache__/PPO.cpython-38.pyc +0 -0
- code/Lot-sizing/agents/__pycache__/PPOAgent.cpython-38.pyc +0 -0
- code/Lot-sizing/agents/__pycache__/PPOAgent_two_critics.cpython-38.pyc +0 -0
- code/Lot-sizing/agents/__pycache__/PPOtwocritics.cpython-38.pyc +0 -0
- code/Lot-sizing/agents/__pycache__/__init__.cpython-38.pyc +0 -0
- code/Lot-sizing/agents/__pycache__/perfectInfoAgent.cpython-38.pyc +0 -0
- code/Lot-sizing/agents/__pycache__/stableBaselineAgents.cpython-38.pyc +0 -0
- code/Lot-sizing/agents/perfectInfoAgent.py +18 -0
- code/Lot-sizing/agents/stableBaselineAgents.py +320 -0
- code/Lot-sizing/envs/__pycache__/__init__.cpython-38.pyc +0 -0
- code/Lot-sizing/envs/__pycache__/simplePlant.cpython-38.pyc +0 -0
- code/Lot-sizing/envs/__pycache__/singleSequenceDependentMachinePlant.cpython-38.pyc +0 -0
- code/Lot-sizing/experiments.py +77 -25
- code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_0_0.pth +3 -0
- code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_log_0.csv +126 -0
- code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_log_2.csv +126 -0
- code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_log_3.csv +126 -0
- code/Lot-sizing/logs/15items_5machines_i100_PPO/PPO_15items_5machines_i100_0_0.pth +3 -0
- code/Lot-sizing/logs/15items_5machines_i100_PPO/PPO_15items_5machines_i100_log_0.csv +251 -0
- code/Lot-sizing/logs/15items_5machines_i100_PPO/PPO_15items_5machines_i100_log_2.csv +216 -0
- code/Lot-sizing/logs/best_A2C_15items_5machines_i100_0/best_model.zip +3 -0
- code/Lot-sizing/logs/evaluations.npz +0 -0
- code/Lot-sizing/models/__pycache__/__init__.cpython-38.pyc +0 -0
- code/Lot-sizing/models/__pycache__/multistageOptimization.cpython-38.pyc +0 -0
- code/Lot-sizing/models/__pycache__/optimizationProblemInstance.cpython-38.pyc +0 -0
- code/Lot-sizing/models/__pycache__/perfectInfoOptimization.cpython-38.pyc +0 -0
- code/Lot-sizing/results/PDPPO_15items_5machines_i100_actions_test.npy +3 -0
- code/Lot-sizing/results/PDPPO_15items_5machines_i100_costs_test.npy +3 -0
- code/Lot-sizing/results/PDPPO_15items_5machines_i100_demands_test.npy +3 -0
- code/Lot-sizing/results/PDPPO_15items_5machines_i100_holding_costs_test.npy +3 -0
- code/Lot-sizing/results/PDPPO_15items_5machines_i100_lost_sales_test.npy +3 -0
- code/Lot-sizing/results/PDPPO_15items_5machines_i100_observations_test.npy +3 -0
- code/Lot-sizing/results/PDPPO_15items_5machines_i100_setup_costs_test.npy +3 -0
- code/Lot-sizing/results/PPO_15items_5machines_i100_actions_test.npy +3 -0
- code/Lot-sizing/results/PPO_15items_5machines_i100_costs_test.npy +3 -0
code/Lot-sizing/agents/PDPPO.py
CHANGED
|
@@ -309,8 +309,6 @@ class PDPPO:
|
|
| 309 |
# final loss of clipped objective PDPPO
|
| 310 |
loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(torch.min(state_values,state_values_post.squeeze()), rewards) - 0.012 * dist_entropy
|
| 311 |
|
| 312 |
-
loss_numpy = loss.detach().cpu().numpy()
|
| 313 |
-
|
| 314 |
# take gradient step
|
| 315 |
self.optimizer.zero_grad()
|
| 316 |
loss.mean().backward()
|
|
|
|
| 309 |
# final loss of clipped objective PDPPO
|
| 310 |
loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(torch.min(state_values,state_values_post.squeeze()), rewards) - 0.012 * dist_entropy
|
| 311 |
|
|
|
|
|
|
|
| 312 |
# take gradient step
|
| 313 |
self.optimizer.zero_grad()
|
| 314 |
loss.mean().backward()
|
code/Lot-sizing/agents/PDPPOAgent.py
CHANGED
|
@@ -14,7 +14,7 @@ BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
|
|
| 14 |
AGENTS_DIR = os.path.join(BASE_DIR,'agents')
|
| 15 |
sys.path.append(AGENTS_DIR)
|
| 16 |
from agents.PDPPO import PDPPO
|
| 17 |
-
from envs import
|
| 18 |
import copy
|
| 19 |
|
| 20 |
|
|
|
|
| 14 |
AGENTS_DIR = os.path.join(BASE_DIR,'agents')
|
| 15 |
sys.path.append(AGENTS_DIR)
|
| 16 |
from agents.PDPPO import PDPPO
|
| 17 |
+
from envs import SimplePlant
|
| 18 |
import copy
|
| 19 |
|
| 20 |
|
code/Lot-sizing/agents/PDPPOAgent_one_critic.py
CHANGED
|
@@ -13,8 +13,8 @@ import matplotlib.patches as mpatches # Provides a way of adding a colored patch
|
|
| 13 |
BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
|
| 14 |
AGENTS_DIR = os.path.join(BASE_DIR,'agents')
|
| 15 |
sys.path.append(AGENTS_DIR)
|
| 16 |
-
from agents.
|
| 17 |
-
from envs import
|
| 18 |
import copy
|
| 19 |
|
| 20 |
|
|
@@ -101,7 +101,7 @@ class SimplePlantSB(SimplePlant):
|
|
| 101 |
return obs
|
| 102 |
|
| 103 |
|
| 104 |
-
class
|
| 105 |
def __init__(self, env: SimplePlant, settings: dict):
|
| 106 |
self.env = SimplePlantSB(env.settings, env.stoch_model)
|
| 107 |
self.last_inventory = env.inventory_level
|
|
@@ -142,7 +142,7 @@ class PDPPOAgent():
|
|
| 142 |
|
| 143 |
## Note : print/log frequencies should be > than self.max_ep_len
|
| 144 |
|
| 145 |
-
################
|
| 146 |
self.update_timestep = self.max_ep_len * 4 # update policy every n timesteps
|
| 147 |
self.K_epochs = 60 # update policy for K epochs in one PDPPO update
|
| 148 |
|
|
@@ -169,7 +169,7 @@ class PDPPOAgent():
|
|
| 169 |
else:
|
| 170 |
self.action_dim = self.env.action_space
|
| 171 |
|
| 172 |
-
self.pdppo_agent =
|
| 173 |
|
| 174 |
|
| 175 |
################################### Training ###################################
|
|
@@ -187,7 +187,7 @@ class PDPPOAgent():
|
|
| 187 |
if not os.path.exists(log_dir):
|
| 188 |
os.makedirs(log_dir)
|
| 189 |
|
| 190 |
-
log_dir = log_dir + '/' + self.experiment_name + '
|
| 191 |
if not os.path.exists(log_dir):
|
| 192 |
os.makedirs(log_dir)
|
| 193 |
|
|
@@ -197,7 +197,7 @@ class PDPPOAgent():
|
|
| 197 |
run_num = len(current_num_files)
|
| 198 |
|
| 199 |
#### create new log file for each run
|
| 200 |
-
log_f_name = log_dir + '/
|
| 201 |
|
| 202 |
print("current logging run number for " + self.experiment_name + " : ", run_num)
|
| 203 |
print("logging at : " + log_f_name)
|
|
@@ -215,7 +215,7 @@ class PDPPOAgent():
|
|
| 215 |
os.makedirs(directory)
|
| 216 |
|
| 217 |
|
| 218 |
-
checkpoint_path = directory + "
|
| 219 |
print("save checkpoint path : " + checkpoint_path)
|
| 220 |
#####################################################
|
| 221 |
|
|
@@ -241,9 +241,9 @@ class PDPPOAgent():
|
|
| 241 |
else:
|
| 242 |
print("Initializing a discrete action space policy")
|
| 243 |
print("--------------------------------------------------------------------------------------------")
|
| 244 |
-
print("
|
| 245 |
-
print("
|
| 246 |
-
print("
|
| 247 |
print("discount factor (self.gamma) : ", self.gamma)
|
| 248 |
print("--------------------------------------------------------------------------------------------")
|
| 249 |
print("optimizer learning rate actor : ", self.lr_actor)
|
|
@@ -259,7 +259,7 @@ class PDPPOAgent():
|
|
| 259 |
################# training procedure ################
|
| 260 |
|
| 261 |
# initialize a PDPPO agent
|
| 262 |
-
self.PDPPO_agent =
|
| 263 |
|
| 264 |
# track total training time
|
| 265 |
start_time = datetime.now().replace(microsecond=0)
|
|
@@ -388,7 +388,7 @@ class PDPPOAgent():
|
|
| 388 |
def load_agent(self,path):
|
| 389 |
#directory = "PDPPO_preTrained" + '/' + env_name + '/'
|
| 390 |
directory = self.LOG_DIR
|
| 391 |
-
directory = directory + '/' + self.experiment_name + '
|
| 392 |
-
checkpoint_path = directory + "
|
| 393 |
print("loading network from : " + checkpoint_path)
|
| 394 |
self.pdppo_agent.load(checkpoint_path)
|
|
|
|
| 13 |
BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
|
| 14 |
AGENTS_DIR = os.path.join(BASE_DIR,'agents')
|
| 15 |
sys.path.append(AGENTS_DIR)
|
| 16 |
+
from agents.PDPPOonecritic import PDPPOonecritic
|
| 17 |
+
from envs import SimplePlant
|
| 18 |
import copy
|
| 19 |
|
| 20 |
|
|
|
|
| 101 |
return obs
|
| 102 |
|
| 103 |
|
| 104 |
+
class PDPPOAgent_one_critic():
|
| 105 |
def __init__(self, env: SimplePlant, settings: dict):
|
| 106 |
self.env = SimplePlantSB(env.settings, env.stoch_model)
|
| 107 |
self.last_inventory = env.inventory_level
|
|
|
|
| 142 |
|
| 143 |
## Note : print/log frequencies should be > than self.max_ep_len
|
| 144 |
|
| 145 |
+
################ PDPPO_one_critic hyperparameters ################
|
| 146 |
self.update_timestep = self.max_ep_len * 4 # update policy every n timesteps
|
| 147 |
self.K_epochs = 60 # update policy for K epochs in one PDPPO update
|
| 148 |
|
|
|
|
| 169 |
else:
|
| 170 |
self.action_dim = self.env.action_space
|
| 171 |
|
| 172 |
+
self.pdppo_agent = PDPPO_one_critic(self.state_dim, self.action_dim, self.lr_actor, self.lr_critic, self.gamma, self.K_epochs, self.eps_clip, copy.copy(self.env), self.has_continuous_action_space,self.tau, self.action_std)
|
| 173 |
|
| 174 |
|
| 175 |
################################### Training ###################################
|
|
|
|
| 187 |
if not os.path.exists(log_dir):
|
| 188 |
os.makedirs(log_dir)
|
| 189 |
|
| 190 |
+
log_dir = log_dir + '/' + self.experiment_name + '_PDPPO_one_critic/'
|
| 191 |
if not os.path.exists(log_dir):
|
| 192 |
os.makedirs(log_dir)
|
| 193 |
|
|
|
|
| 197 |
run_num = len(current_num_files)
|
| 198 |
|
| 199 |
#### create new log file for each run
|
| 200 |
+
log_f_name = log_dir + '/PDPPO_one_critic_' + self.experiment_name + "_log_" + str(run_num) + ".csv"
|
| 201 |
|
| 202 |
print("current logging run number for " + self.experiment_name + " : ", run_num)
|
| 203 |
print("logging at : " + log_f_name)
|
|
|
|
| 215 |
os.makedirs(directory)
|
| 216 |
|
| 217 |
|
| 218 |
+
checkpoint_path = directory + "PDPPO_one_critic_{}_{}_{}.pth".format(self.experiment_name, self.random_seed, self.run_num_pretrained)
|
| 219 |
print("save checkpoint path : " + checkpoint_path)
|
| 220 |
#####################################################
|
| 221 |
|
|
|
|
| 241 |
else:
|
| 242 |
print("Initializing a discrete action space policy")
|
| 243 |
print("--------------------------------------------------------------------------------------------")
|
| 244 |
+
print("PDPPO_one_critic update frequency : " + str(self.update_timestep) + " timesteps")
|
| 245 |
+
print("PDPPO_one_critic K epochs : ", self.K_epochs)
|
| 246 |
+
print("PDPPO_one_critic epsilon clip : ", self.eps_clip)
|
| 247 |
print("discount factor (self.gamma) : ", self.gamma)
|
| 248 |
print("--------------------------------------------------------------------------------------------")
|
| 249 |
print("optimizer learning rate actor : ", self.lr_actor)
|
|
|
|
| 259 |
################# training procedure ################
|
| 260 |
|
| 261 |
# initialize a PDPPO agent
|
| 262 |
+
self.PDPPO_agent = PDPPO_one_critic(self.state_dim, self.action_dim, self.lr_actor, self.lr_critic, self.gamma, self.K_epochs, self.eps_clip, copy.copy(self.env), self.has_continuous_action_space, self.action_std)
|
| 263 |
|
| 264 |
# track total training time
|
| 265 |
start_time = datetime.now().replace(microsecond=0)
|
|
|
|
| 388 |
def load_agent(self,path):
|
| 389 |
#directory = "PDPPO_preTrained" + '/' + env_name + '/'
|
| 390 |
directory = self.LOG_DIR
|
| 391 |
+
directory = directory + '/' + self.experiment_name + '_PDPPO_one_critic' + '/'
|
| 392 |
+
checkpoint_path = directory + "PDPPO_one_critic_{}_{}_{}.pth".format(self.experiment_name, self.random_seed, self.run_num_pretrained)
|
| 393 |
print("loading network from : " + checkpoint_path)
|
| 394 |
self.pdppo_agent.load(checkpoint_path)
|
code/Lot-sizing/agents/PDPPO_v0.py
DELETED
|
@@ -1,328 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import copy
|
| 3 |
-
import numpy as np
|
| 4 |
-
import torch
|
| 5 |
-
import torch.nn as nn
|
| 6 |
-
import torch.optim as optim
|
| 7 |
-
import torch.nn.functional as F
|
| 8 |
-
from torch.distributions import Categorical
|
| 9 |
-
from envs import *
|
| 10 |
-
import gym
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
class SimplePlantSB(SimplePlant):
|
| 15 |
-
def __init__(self, settings, stoch_model):
|
| 16 |
-
super().__init__(settings, stoch_model)
|
| 17 |
-
try:self.dict_obs = settings['dict_obs']
|
| 18 |
-
except:self.dict_obs = False
|
| 19 |
-
self.last_inventory = copy.copy(self.inventory_level)
|
| 20 |
-
self.action_space = gym.spaces.MultiDiscrete(
|
| 21 |
-
[self.n_items+1] * self.n_machines
|
| 22 |
-
)
|
| 23 |
-
|
| 24 |
-
if self.dict_obs:
|
| 25 |
-
self.observation_space = gym.spaces.Dict({
|
| 26 |
-
'inventory_level': gym.spaces.Box(low = np.zeros(self.n_items),high = np.ones(self.n_items)*(settings['max_inventory_level'][0]+1)*self.n_items),
|
| 27 |
-
'machine_setup': gym.spaces.MultiDiscrete([self.n_items+1] * self.n_machines)
|
| 28 |
-
})
|
| 29 |
-
else:
|
| 30 |
-
self.observation_space = gym.spaces.Box(
|
| 31 |
-
low=np.zeros(self.n_items+self.n_machines),# high for the inventory level
|
| 32 |
-
high=np.concatenate(
|
| 33 |
-
[
|
| 34 |
-
np.array(self.max_inventory_level),
|
| 35 |
-
np.ones(self.n_machines) * (self.n_items+1), #high for the machine setups
|
| 36 |
-
]),
|
| 37 |
-
dtype=np.int32
|
| 38 |
-
)
|
| 39 |
-
|
| 40 |
-
def step(self, action):
|
| 41 |
-
"""
|
| 42 |
-
Step method: Execute one time step within the environment
|
| 43 |
-
|
| 44 |
-
Parameters
|
| 45 |
-
----------
|
| 46 |
-
action : action given by the agent
|
| 47 |
-
|
| 48 |
-
Returns
|
| 49 |
-
-------
|
| 50 |
-
obs : Observation of the state give the method _next_observation
|
| 51 |
-
reward : Cost given by the _reward method
|
| 52 |
-
done : returns True or False given by the _done method
|
| 53 |
-
dict : possible information for control to environment monitoring
|
| 54 |
-
|
| 55 |
-
"""
|
| 56 |
-
self.last_inventory = copy.copy(self.inventory_level)
|
| 57 |
-
|
| 58 |
-
self.total_cost = self._take_action(action, self.machine_setup, self.inventory_level, self.demand)
|
| 59 |
-
|
| 60 |
-
# self.total_cost['setup_costs'] = 0
|
| 61 |
-
# self.total_cost['holding_costs'] = 0
|
| 62 |
-
|
| 63 |
-
reward = -sum([ele for key, ele in self.total_cost.items()])
|
| 64 |
-
#reward = -self.total_cost['lost_sales']
|
| 65 |
-
|
| 66 |
-
#reward = np.abs(action)
|
| 67 |
-
|
| 68 |
-
self.current_step += 1
|
| 69 |
-
done = self.current_step == self.T
|
| 70 |
-
obs = self._next_observation()
|
| 71 |
-
|
| 72 |
-
return obs, reward, done, self.total_cost
|
| 73 |
-
|
| 74 |
-
def _next_observation(self):
|
| 75 |
-
"""
|
| 76 |
-
Returns the next demand
|
| 77 |
-
"""
|
| 78 |
-
obs = SimplePlant._next_observation(self)
|
| 79 |
-
#obs['last_inventory_level'] = copy.copy(self.last_inventory)
|
| 80 |
-
if isinstance(obs, dict):
|
| 81 |
-
if not self.dict_obs:
|
| 82 |
-
obs = np.concatenate(
|
| 83 |
-
(
|
| 84 |
-
obs['inventory_level'], # n_items size
|
| 85 |
-
obs['machine_setup'], # n_machine size
|
| 86 |
-
#obs['last_inventory_level']# n_items size
|
| 87 |
-
)
|
| 88 |
-
)
|
| 89 |
-
else:
|
| 90 |
-
if self.dict_obs:
|
| 91 |
-
raise('Change dict_obst to False')
|
| 92 |
-
return obs
|
| 93 |
-
|
| 94 |
-
# Define the policy network
|
| 95 |
-
class Policy(nn.Module):
|
| 96 |
-
def __init__(self, input_size, output_shape):
|
| 97 |
-
super(Policy, self).__init__()
|
| 98 |
-
self.fc1 = nn.Linear(input_size, 128)
|
| 99 |
-
self.fc_list = nn.ModuleList([nn.Linear(128, output_shape[0]) for list(output_shape)[1] in range(0,output_shape[1])])
|
| 100 |
-
|
| 101 |
-
def forward(self, x):
|
| 102 |
-
x = F.relu(self.fc1(x)).requires_grad_()
|
| 103 |
-
outputs = [F.softmax(fc(x), dim=1)for fc in self.fc_list]
|
| 104 |
-
return outputs
|
| 105 |
-
|
| 106 |
-
# Define the value network for deterministic components
|
| 107 |
-
class Value(nn.Module):
|
| 108 |
-
def __init__(self,input_size,output_size):
|
| 109 |
-
super(Value, self).__init__()
|
| 110 |
-
self.fc1 = nn.Linear(input_size, 128)
|
| 111 |
-
self.fc2 = nn.Linear(128, output_size)
|
| 112 |
-
|
| 113 |
-
def forward(self, x):
|
| 114 |
-
x = F.relu(self.fc1(x)).requires_grad_()
|
| 115 |
-
x = self.fc2(x)
|
| 116 |
-
return x
|
| 117 |
-
|
| 118 |
-
# Define the value network for stochastic components
|
| 119 |
-
class ValueStochastic(nn.Module):
|
| 120 |
-
def __init__(self,input_size,output_size):
|
| 121 |
-
super(ValueStochastic, self).__init__()
|
| 122 |
-
self.fc1 = nn.Linear(input_size, 128)
|
| 123 |
-
self.fc2 = nn.Linear(128, output_size)
|
| 124 |
-
|
| 125 |
-
def forward(self, x):
|
| 126 |
-
x = F.relu(self.fc1(x)).requires_grad_()
|
| 127 |
-
x = F.softmax(self.fc2(x), dim=1)
|
| 128 |
-
return x
|
| 129 |
-
|
| 130 |
-
# Define the PPO agent
|
| 131 |
-
class PDPPO:
|
| 132 |
-
def __init__(self, env: SimplePlant, settings: dict):
|
| 133 |
-
|
| 134 |
-
self.env = SimplePlantSB(env.settings, env.stoch_model)
|
| 135 |
-
self.last_inventory = env.inventory_level
|
| 136 |
-
self.experiment_name = settings['experiment_name']
|
| 137 |
-
try:self.dict_obs = settings['dict_obs']
|
| 138 |
-
except:self.dict_obs = False
|
| 139 |
-
|
| 140 |
-
self.POSSIBLE_STATES = self.env.n_items + 1
|
| 141 |
-
self.env.cost_to_reward = True
|
| 142 |
-
self.epsilon = 0
|
| 143 |
-
|
| 144 |
-
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
| 145 |
-
# Use the logs file in the root path of the main.
|
| 146 |
-
self.LOG_DIR = os.path.join(BASE_DIR,'logs')
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
if self.dict_obs == False:
|
| 150 |
-
input_size = self.env.observation_space.shape[0]
|
| 151 |
-
output_size_policy = (self.env.n_items+1, self.env.action_space.shape[0]) # we add 1 for the idle state
|
| 152 |
-
output_size_value = self.env.action_space.shape[0]
|
| 153 |
-
self.policy = Policy(input_size,output_size_policy)
|
| 154 |
-
self.value = Value(input_size,output_size_value)
|
| 155 |
-
self.value_post = ValueStochastic(input_size,output_size_value)
|
| 156 |
-
self.optimizer_policy = optim.Adam(self.policy.parameters(), lr=1e-3)
|
| 157 |
-
self.optimizer_value = optim.Adam(self.value.parameters(), lr=1e-3)
|
| 158 |
-
self.optimizer_value_post = optim.Adam(self.value_post.parameters(), lr=1e-3)
|
| 159 |
-
self.eps_clip = 0.2
|
| 160 |
-
self.gamma = 0.99
|
| 161 |
-
self.lmbda = 0.95
|
| 162 |
-
|
| 163 |
-
def get_post_state(self, action, machine_setup, inventory_level):
|
| 164 |
-
setup_loss = np.zeros(self.env.n_machines, dtype=int)
|
| 165 |
-
setup_costs = np.zeros(self.env.n_machines)
|
| 166 |
-
# if we are just changing the setup, we use the setup cost matrix with the corresponding position given by the actual setup and the new setup
|
| 167 |
-
for m in range(self.env.n_machines):
|
| 168 |
-
if action[m] != 0: # if the machine is not iddle
|
| 169 |
-
# 1. IF NEEDED CHANGE SETUP
|
| 170 |
-
if machine_setup[m] != action[m] and action[m] != 0:
|
| 171 |
-
setup_costs[m] = self.env.setup_costs[m][action[m] - 1]
|
| 172 |
-
setup_loss[m] = self.env.setup_loss[m][action[m] - 1]
|
| 173 |
-
machine_setup[m] = action[m]
|
| 174 |
-
# 2. PRODUCTION
|
| 175 |
-
production = self.env.machine_production_matrix[m][action[m] - 1] - setup_loss[m]
|
| 176 |
-
inventory_level[action[m] - 1] += production
|
| 177 |
-
else:
|
| 178 |
-
machine_setup[m] = 0
|
| 179 |
-
# return the new machine_setup_inventory_level and the setup_cost
|
| 180 |
-
return machine_setup, inventory_level, setup_costs
|
| 181 |
-
|
| 182 |
-
def get_action(self, state):
|
| 183 |
-
state = torch.from_numpy(state).float().unsqueeze(0)
|
| 184 |
-
probs = self.policy(state)
|
| 185 |
-
probs_concat = torch.stack(probs, dim=1)
|
| 186 |
-
m = Categorical(probs_concat)
|
| 187 |
-
action = m.sample()
|
| 188 |
-
value = self.value(state)
|
| 189 |
-
machine_setup, inventory_level, setup_cost = self.get_post_state(action.numpy()[0], state[0][self.env.n_items:self.env.n_items+self.env.n_machines].numpy(), state[0][0:self.env.n_items].numpy())
|
| 190 |
-
value_post = self.value_post(state)
|
| 191 |
-
|
| 192 |
-
return action, m.log_prob(action), probs_concat, value, value_post
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
def update(self, rewards, rewards_pre_state, rewards_post_state, states, post_states, actions, probs, next_states):
|
| 196 |
-
# Update deterministic value function
|
| 197 |
-
for epoch in range(10):
|
| 198 |
-
for i in range(len(actions)):
|
| 199 |
-
state = torch.from_numpy(states[i]).float().unsqueeze(0)
|
| 200 |
-
value = self.value(state)
|
| 201 |
-
next_state = torch.from_numpy(next_states[i]).float().unsqueeze(0)
|
| 202 |
-
next_value = self.value(next_state)
|
| 203 |
-
target = rewards_pre_state[i] + self.gamma * next_value
|
| 204 |
-
advantage = target - value
|
| 205 |
-
loss = advantage.pow(2).mean()
|
| 206 |
-
self.optimizer_value.zero_grad()
|
| 207 |
-
loss.backward()
|
| 208 |
-
self.optimizer_value.step()
|
| 209 |
-
|
| 210 |
-
# Update stochastic value function
|
| 211 |
-
for epoch in range(10):
|
| 212 |
-
for i in range(len(actions)):
|
| 213 |
-
state = torch.from_numpy(states[i]).float().unsqueeze(0)
|
| 214 |
-
value = self.value_post(state)
|
| 215 |
-
post_state = torch.from_numpy(post_states[i]).float().unsqueeze(0)
|
| 216 |
-
value_post = self.value_post(post_state)
|
| 217 |
-
target = rewards_post_state[i] + self.gamma * value_post
|
| 218 |
-
advantage = target - value
|
| 219 |
-
loss = advantage.pow(2).mean()
|
| 220 |
-
self.optimizer_value_post.zero_grad()
|
| 221 |
-
loss.backward()
|
| 222 |
-
self.optimizer_value_post.step()
|
| 223 |
-
|
| 224 |
-
# Update policy network
|
| 225 |
-
states = torch.from_numpy(np.vstack(states)).float()
|
| 226 |
-
actions = torch.cat(actions).unsqueeze(1)
|
| 227 |
-
old_probs = torch.cat(probs)
|
| 228 |
-
old_probs = torch.gather(old_probs.clone(),2, actions)
|
| 229 |
-
|
| 230 |
-
policy_epochs = 10
|
| 231 |
-
for epoch in range(policy_epochs):
|
| 232 |
-
probs = self.policy(states)
|
| 233 |
-
probs = torch.stack(probs, dim=1).clone()
|
| 234 |
-
m = Categorical(probs)
|
| 235 |
-
action = m.sample()
|
| 236 |
-
probs = torch.gather(probs, 2, actions)
|
| 237 |
-
kl_div = (old_probs * (torch.log(old_probs) - torch.log(probs))).sum()
|
| 238 |
-
|
| 239 |
-
for state,post_state, action, old_prob, prob, next_state, reward_pre_state, reward_post_state in zip(states,post_states, actions, old_probs, probs, next_states,rewards_pre_state,rewards_post_state):
|
| 240 |
-
state = state.unsqueeze(0)
|
| 241 |
-
next_state = torch.from_numpy(next_state).unsqueeze(0).float()
|
| 242 |
-
post_state = torch.from_numpy(post_state).unsqueeze(0).float()
|
| 243 |
-
action = action.unsqueeze(0)
|
| 244 |
-
old_prob = old_prob.unsqueeze(0)
|
| 245 |
-
prob = prob.unsqueeze(0)
|
| 246 |
-
value = self.value(state)
|
| 247 |
-
value_post = self.value_post(post_state)
|
| 248 |
-
advantage = reward_pre_state + self.gamma * self.value(next_state) - self.value(state)
|
| 249 |
-
advantage_post = reward_post_state + self.gamma * self.value_post(post_state) - self.value_post(state)
|
| 250 |
-
|
| 251 |
-
ratio = (prob / old_prob)
|
| 252 |
-
surr1 = ratio * advantage
|
| 253 |
-
surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantage
|
| 254 |
-
policy_loss = -torch.min(surr1, surr2) - 0.01 * m.entropy()
|
| 255 |
-
|
| 256 |
-
ratio_post = ratio
|
| 257 |
-
surr1_post = ratio_post * advantage_post
|
| 258 |
-
surr2_post = torch.clamp(ratio_post, 1 - self.eps_clip, 1 + self.eps_clip) * advantage_post
|
| 259 |
-
policy_loss_post = -torch.min(surr1_post, surr2_post) - 0.01 * m.entropy()
|
| 260 |
-
|
| 261 |
-
self.optimizer_policy.zero_grad()
|
| 262 |
-
(policy_loss.pow(2).mean() + policy_loss_post.pow(2).mean() + 0.5 * value.pow(2).mean() + 0.5 * value_post.pow(2).mean()).backward(retain_graph=True)
|
| 263 |
-
self.optimizer_policy.step()
|
| 264 |
-
|
| 265 |
-
def learn(self, n_episodes=1000, save_interval=100):
|
| 266 |
-
# Train the agent
|
| 267 |
-
for episode in range(n_episodes):
|
| 268 |
-
state = self.env.reset()
|
| 269 |
-
rewards = []
|
| 270 |
-
rewards_pre_state = []
|
| 271 |
-
rewards_post_state = []
|
| 272 |
-
states = []
|
| 273 |
-
next_states = []
|
| 274 |
-
actions = []
|
| 275 |
-
probs = []
|
| 276 |
-
post_states = []
|
| 277 |
-
# next_post_states = []
|
| 278 |
-
done = False
|
| 279 |
-
while not done:
|
| 280 |
-
action, log_prob, prob, value, value_post = self.get_action(state)
|
| 281 |
-
next_state, reward, done, info = self.env.step(action[0].detach().numpy())
|
| 282 |
-
machine_setup, inventory_level, setup_cost = self.get_post_state(action[0].detach().numpy(), state[self.env.n_items:self.env.n_items+self.env.n_machines], state[0:self.env.n_items])
|
| 283 |
-
post_state = state.copy()
|
| 284 |
-
post_state[self.env.n_items:self.env.n_items+self.env.n_machines] = machine_setup
|
| 285 |
-
post_state[0:self.env.n_items] = inventory_level
|
| 286 |
-
post_states.append(post_state)
|
| 287 |
-
post_state = torch.from_numpy(post_state).float().unsqueeze(0)
|
| 288 |
-
rewards.append(reward)
|
| 289 |
-
reward_pre_state = -(self.env.total_cost['holding_costs'] + self.env.total_cost['lost_sales'])
|
| 290 |
-
reward_post_state = -setup_cost.sum()
|
| 291 |
-
rewards_pre_state.append(reward_pre_state)
|
| 292 |
-
rewards_post_state.append(reward_post_state)
|
| 293 |
-
states.append(state)
|
| 294 |
-
next_states.append(next_state)
|
| 295 |
-
actions.append(action)
|
| 296 |
-
probs.append(prob)
|
| 297 |
-
|
| 298 |
-
state = next_state
|
| 299 |
-
if done:
|
| 300 |
-
self.update(rewards, rewards_pre_state, rewards_post_state, states, post_states, actions, probs, next_states)
|
| 301 |
-
print('Episode:', episode, 'Reward:', sum(rewards))
|
| 302 |
-
if episode % save_interval == 0:
|
| 303 |
-
self.save(f'policy_{episode}.pt')
|
| 304 |
-
self.save(self.LOG_DIR)
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
def save(self, filepath):
|
| 308 |
-
torch.save({
|
| 309 |
-
'policy_state_dict': self.policy.state_dict(),
|
| 310 |
-
'value_state_dict': self.value.state_dict(),
|
| 311 |
-
'value_post_state_dict': self.value_post.state_dict(),
|
| 312 |
-
'optimizer_policy_state_dict': self.optimizer_policy.state_dict(),
|
| 313 |
-
'optimizer_value_state_dict': self.optimizer_value.state_dict(),
|
| 314 |
-
'optimizer_value_post_state_dict': self.optimizer_value_post.state_dict()
|
| 315 |
-
}, filepath)
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
def load(self, filepath):
|
| 320 |
-
checkpoint = torch.load(filepath)
|
| 321 |
-
self.policy.load_state_dict(checkpoint['policy_state_dict'])
|
| 322 |
-
self.value.load_state_dict(checkpoint['value_state_dict'])
|
| 323 |
-
self.value_post.load_state_dict(checkpoint['value_post_state_dict'])
|
| 324 |
-
self.optimizer_policy.load_state_dict(checkpoint['optimizer_policy_state_dict'])
|
| 325 |
-
self.optimizer_value.load_state_dict(checkpoint['optimizer_value_state_dict'])
|
| 326 |
-
self.optimizer_value_post.load_state_dict(checkpoint['optimizer_value_post_state_dict'])
|
| 327 |
-
|
| 328 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
code/Lot-sizing/agents/{PDPPO one critic.py → PDPPOonecritic.py}
RENAMED
|
@@ -144,7 +144,7 @@ class ActorCritic(nn.Module):
|
|
| 144 |
return action_logprobs, state_values, dist_entropy
|
| 145 |
|
| 146 |
|
| 147 |
-
class
|
| 148 |
def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, env, has_continuous_action_space, tau, action_std_init=0.6):
|
| 149 |
|
| 150 |
self.has_continuous_action_space = has_continuous_action_space
|
|
|
|
| 144 |
return action_logprobs, state_values, dist_entropy
|
| 145 |
|
| 146 |
|
| 147 |
+
class PDPPOonecritic:
|
| 148 |
def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, env, has_continuous_action_space, tau, action_std_init=0.6):
|
| 149 |
|
| 150 |
self.has_continuous_action_space = has_continuous_action_space
|
code/Lot-sizing/agents/PPO.py
CHANGED
|
@@ -214,7 +214,7 @@ class PPO:
|
|
| 214 |
self.buffer.logprobs.append(action_logprob)
|
| 215 |
self.buffer.state_values.append(state_val)
|
| 216 |
|
| 217 |
-
return action.numpy()
|
| 218 |
|
| 219 |
def update(self):
|
| 220 |
# Monte Carlo estimate of returns
|
|
@@ -258,8 +258,6 @@ class PPO:
|
|
| 258 |
# final loss of clipped objective PPO
|
| 259 |
loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.012 * dist_entropy
|
| 260 |
|
| 261 |
-
loss_numpy = loss.detach().numpy()
|
| 262 |
-
|
| 263 |
# take gradient step
|
| 264 |
self.optimizer.zero_grad()
|
| 265 |
loss.mean().backward()
|
|
|
|
| 214 |
self.buffer.logprobs.append(action_logprob)
|
| 215 |
self.buffer.state_values.append(state_val)
|
| 216 |
|
| 217 |
+
return action.cpu().numpy()
|
| 218 |
|
| 219 |
def update(self):
|
| 220 |
# Monte Carlo estimate of returns
|
|
|
|
| 258 |
# final loss of clipped objective PPO
|
| 259 |
loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.012 * dist_entropy
|
| 260 |
|
|
|
|
|
|
|
| 261 |
# take gradient step
|
| 262 |
self.optimizer.zero_grad()
|
| 263 |
loss.mean().backward()
|
code/Lot-sizing/agents/PPOAgent.py
CHANGED
|
@@ -14,7 +14,7 @@ BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
|
|
| 14 |
AGENTS_DIR = os.path.join(BASE_DIR,'agents')
|
| 15 |
sys.path.append(AGENTS_DIR)
|
| 16 |
from agents.PPO import PPO
|
| 17 |
-
from envs import
|
| 18 |
|
| 19 |
|
| 20 |
class SimplePlantSB(SimplePlant):
|
|
@@ -155,8 +155,6 @@ class PPOAgent():
|
|
| 155 |
|
| 156 |
print("training environment name : " + self.experiment_name + '_PPO')
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
# state space dimension
|
| 161 |
self.state_dim = self.env.observation_space.shape[0]
|
| 162 |
|
|
|
|
| 14 |
AGENTS_DIR = os.path.join(BASE_DIR,'agents')
|
| 15 |
sys.path.append(AGENTS_DIR)
|
| 16 |
from agents.PPO import PPO
|
| 17 |
+
from envs import SimplePlant
|
| 18 |
|
| 19 |
|
| 20 |
class SimplePlantSB(SimplePlant):
|
|
|
|
| 155 |
|
| 156 |
print("training environment name : " + self.experiment_name + '_PPO')
|
| 157 |
|
|
|
|
|
|
|
| 158 |
# state space dimension
|
| 159 |
self.state_dim = self.env.observation_space.shape[0]
|
| 160 |
|
code/Lot-sizing/agents/PPOAgent_two_critics.py
ADDED
|
@@ -0,0 +1,385 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os # Provides a way of interacting with the file system
|
| 2 |
+
import sys
|
| 3 |
+
import glob # Helps find all the pathnames matching a specified pattern according to the rules used by the Unix shell
|
| 4 |
+
import time # Provides various time-related functions
|
| 5 |
+
from datetime import datetime # Module that supplies classes for working with dates and times
|
| 6 |
+
|
| 7 |
+
import numpy as np # A library for the Python programming language, adding support for large, multi-dimensional arrays and matrices
|
| 8 |
+
import gym # Provides a collection of test problems — environments — that you can use to work out your reinforcement learning algorithms
|
| 9 |
+
import torch # A machine learning framework that provides tensor computation (like NumPy) with strong acceleration on GPUs
|
| 10 |
+
import copy # Provides a module for shallow and deep copying operations
|
| 11 |
+
import matplotlib.pyplot as plt # A plotting library for the Python programming language and its numerical mathematics extension NumPy
|
| 12 |
+
import matplotlib.patches as mpatches # Provides a way of adding a colored patch to the plot, for example to create a legend
|
| 13 |
+
BASE_DIR = os.path.dirname(os.path.abspath('__file__'))
|
| 14 |
+
AGENTS_DIR = os.path.join(BASE_DIR,'agents')
|
| 15 |
+
sys.path.append(AGENTS_DIR)
|
| 16 |
+
from agents.PPOtwocritics import PPOtwocritics
|
| 17 |
+
from envs import SimplePlant
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class SimplePlantSB(SimplePlant):
    """Gym-style wrapper around ``SimplePlant`` used by the PPO agents.

    It redefines the action space as a ``MultiDiscrete`` with
    ``n_items + 1`` choices per machine and exposes the observation either
    as a ``gym.spaces.Dict`` (``dict_obs=True``) or as a single flat ``Box``
    made of the inventory levels followed by the machine setups.

    NOTE(review): ``n_items``, ``n_machines``, ``inventory_level``,
    ``max_inventory_level``, ``machine_setup``, ``demand``, ``current_step``,
    ``T`` and ``_take_action`` are presumably provided by ``SimplePlant``;
    they are not defined in this class.
    """

    def __init__(self, settings, stoch_model):
        """Build the plant and declare its action/observation spaces.

        Parameters
        ----------
        settings : mapping
            Plant configuration. The optional key ``'dict_obs'`` selects the
            dictionary observation layout (defaults to ``False``);
            ``'max_inventory_level'`` is read when ``dict_obs`` is true.
        stoch_model : object
            Forwarded unchanged to ``SimplePlant.__init__``.
        """
        super().__init__(settings, stoch_model)

        # 'dict_obs' is optional. Only a missing key (or a settings object
        # that cannot be indexed by that key) falls back to the default;
        # the original bare ``except`` also hid unrelated errors.
        try:
            self.dict_obs = settings['dict_obs']
        except (KeyError, TypeError):
            self.dict_obs = False

        self.last_inventory = copy.copy(self.inventory_level)

        # One discrete choice per machine, with n_items + 1 options each.
        self.action_space = gym.spaces.MultiDiscrete(
            [self.n_items + 1] * self.n_machines
        )

        if self.dict_obs:
            inventory_high = (
                np.ones(self.n_items)
                * (settings['max_inventory_level'][0] + 1)
                * self.n_items
            )
            self.observation_space = gym.spaces.Dict({
                'inventory_level': gym.spaces.Box(
                    low=np.zeros(self.n_items),
                    high=inventory_high,
                ),
                'machine_setup': gym.spaces.MultiDiscrete(
                    [self.n_items + 1] * self.n_machines
                ),
            })
        else:
            self.observation_space = gym.spaces.Box(
                # lower bound: zeros for inventory levels and machine setups
                low=np.zeros(self.n_items + self.n_machines),
                high=np.concatenate(
                    [
                        # upper bound for the inventory levels
                        np.array(self.max_inventory_level),
                        # upper bound for the machine setups
                        np.ones(self.n_machines) * (self.n_items + 1),
                    ]
                ),
                dtype=np.int32,
            )

    def step(self, action):
        """Execute one time step within the environment.

        Parameters
        ----------
        action : action given by the agent

        Returns
        -------
        obs : observation produced by ``_next_observation``
        reward : negative sum of all entries of the cost dictionary
            returned by ``_take_action``
        done : ``True`` once ``current_step`` equals ``T``
        dict : the cost dictionary itself, usable for monitoring
        """
        # Snapshot the inventory before the action is applied.
        self.last_inventory = copy.copy(self.inventory_level)

        self.total_cost = self._take_action(
            action, self.machine_setup, self.inventory_level, self.demand
        )

        # Costs are turned into a reward by flipping the sign of their sum.
        reward = -sum(self.total_cost.values())

        self.current_step += 1
        done = self.current_step == self.T
        obs = self._next_observation()

        return obs, reward, done, self.total_cost

    def _next_observation(self):
        """Return the parent's next observation in the configured layout.

        A dictionary observation is flattened to
        ``concatenate(inventory_level, machine_setup)`` when ``dict_obs`` is
        false. A non-dictionary observation is returned unchanged.

        NOTE(review): the nesting of the ``else`` branch was reconstructed
        from a source whose indentation was lost; it is attached to the
        ``isinstance`` test here — confirm against the repository.

        Raises
        ------
        ValueError
            If ``dict_obs`` is true but the parent did not return a dict.
            (The original raised a plain string, which itself fails with
            ``TypeError`` because exceptions must derive from
            ``BaseException``.)
        """
        obs = SimplePlant._next_observation(self)
        if isinstance(obs, dict):
            if not self.dict_obs:
                obs = np.concatenate(
                    (
                        obs['inventory_level'],  # n_items entries
                        obs['machine_setup'],    # n_machines entries
                    )
                )
        elif self.dict_obs:
            raise ValueError('Change dict_obs to False')
        return obs
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
class PPOAgent_two_critics():
    """Training/inference driver for the ``PPOtwocritics`` policy.

    The agent wraps the given plant in a ``SimplePlantSB`` environment,
    stores the PPO hyperparameters as attributes, and offers ``learn`` to
    train, ``get_action`` to query the policy and ``load_agent`` to restore
    a checkpoint from the ``logs`` directory.
    """

    def __init__(self, env: SimplePlant, settings: dict):
        """Create the wrapped environment, hyperparameters and PPO policy.

        Parameters
        ----------
        env : SimplePlant
            Source plant; its ``settings`` and ``stoch_model`` are used to
            build the internal ``SimplePlantSB`` environment.
        settings : dict
            Must contain ``'model_name'``, ``'experiment_name'`` and
            ``'parallelization'``; ``'dict_obs'`` is optional (default
            ``False``).
        """
        self.env = SimplePlantSB(env.settings, env.stoch_model)
        self.last_inventory = env.inventory_level
        self.model_name = settings['model_name']
        self.experiment_name = settings['experiment_name']
        self.parallelization = settings['parallelization']
        # Optional flag; a missing key means flat (non-dict) observations.
        self.dict_obs = settings.get('dict_obs', False)

        self.POSSIBLE_STATES = self.env.n_items + 1
        self.env.cost_to_reward = True
        self.epsilon = 0

        # Logs live in a 'logs' folder one level above this file's folder.
        BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        self.LOG_DIR = os.path.join(BASE_DIR, 'logs')

        print("============================================================================================")

        ####### environment hyperparameters ######
        self.has_continuous_action_space = False  # continuous action space; else discrete

        self.max_ep_len = 1000                    # max timesteps in one episode

        # NOTE: print/log frequencies should be larger than max_ep_len.
        self.print_freq = self.max_ep_len * 10    # print avg reward every n timesteps
        self.log_freq = self.max_ep_len * 2       # log avg reward every n timesteps
        self.save_model_freq = int(4999)          # save model every n timesteps

        self.action_std = 0.6                     # starting std of the action distribution
        self.action_std_decay_rate = 0.05         # amount subtracted from action_std at each decay
        self.min_action_std = 0.1                 # decay stops once action_std reaches this value
        self.action_std_decay_freq = int(2.5e5)   # decay every n timesteps

        ################ PPO_two_critics hyperparameters ################
        self.update_timestep = self.max_ep_len * 4  # update policy every n timesteps
        self.K_epochs = 60                          # epochs per policy update

        self.eps_clip = 0.2                         # PPO clip parameter
        self.gamma = 0.99                           # discount factor

        self.lr_actor = 0.00055                     # learning rate of the actor network
        self.lr_critic = 0.001                      # learning rate of the critic network

        self.random_seed = 0                        # 0 = no random seed
        # Change this to avoid overwriting weights in the same experiment folder.
        self.run_num_pretrained = 0

        print("training environment name : " + self.experiment_name + '_PPO_two_critics')

        # state space dimension
        self.state_dim = self.env.observation_space.shape[0]

        # action space dimension (the space object itself when discrete)
        if self.has_continuous_action_space:
            self.action_dim = self.env.action_space.shape[0]
        else:
            self.action_dim = self.env.action_space

        self.ppo_agent = self._new_policy()

    def _new_policy(self):
        """Build a fresh ``PPOtwocritics`` from the stored hyperparameters."""
        return PPOtwocritics(
            self.state_dim,
            self.action_dim,
            self.lr_actor,
            self.lr_critic,
            self.gamma,
            self.K_epochs,
            self.eps_clip,
            self.has_continuous_action_space,
            self.action_std,
        )

    def _checkpoint_path(self):
        """Return ``(run directory, checkpoint file path)`` for this experiment."""
        directory = self.LOG_DIR + '/' + self.experiment_name + '_PPO_two_critics' + '/'
        checkpoint_path = directory + "PPO_two_critics_{}_{}_{}.pth".format(
            self.experiment_name, self.random_seed, self.run_num_pretrained
        )
        return directory, checkpoint_path

    ################################### Training ###################################
    def learn(self, n_episodes=100000):
        """Train the policy and write a CSV log plus periodic checkpoints.

        Parameters
        ----------
        n_episodes : int
            Despite its name this value is used as the maximum number of
            environment *timesteps*: the training loop stops starting new
            episodes once the timestep counter exceeds it.

        Notes
        -----
        A new ``PPOtwocritics`` policy is created at the start of every
        call (as in the original code), so weights loaded beforehand with
        ``load_agent`` are not carried into training.
        """
        rule = "============================================================================================"
        dashes = "--------------------------------------------------------------------------------------------"

        ###################### logging ######################
        self.max_training_timesteps = n_episodes  # training stops after this many timesteps

        env = self.env

        # One log file per run; existing logs are not overwritten.
        log_dir = self.LOG_DIR
        os.makedirs(log_dir, exist_ok=True)

        log_dir = log_dir + '/' + self.experiment_name + '_PPO_two_critics/'
        os.makedirs(log_dir, exist_ok=True)

        # The run number is the count of files already in the run directory
        # (checkpoints saved there are counted as well).
        run_num = len(next(os.walk(log_dir))[2])

        log_f_name = log_dir + '/PPO_two_critics_' + self.experiment_name + "_log_" + str(run_num) + ".csv"

        print("current logging run number for " + self.experiment_name + " : ", run_num)
        print("logging at : " + log_f_name)

        ################### checkpointing ###################
        directory, checkpoint_path = self._checkpoint_path()
        os.makedirs(directory, exist_ok=True)
        print("save checkpoint path : " + checkpoint_path)

        ############# print all hyperparameters #############
        print(dashes)
        print("max training timesteps : ", self.max_training_timesteps)
        print("max timesteps per episode : ", self.max_ep_len)
        print("model saving frequency : " + str(self.save_model_freq) + " timesteps")
        print("log frequency : " + str(self.log_freq) + " timesteps")
        print("printing average reward over episodes in last : " + str(self.print_freq) + " timesteps")
        print(dashes)
        print("state space dimension : ", self.state_dim)
        print("action space dimension : ", self.action_dim)
        print(dashes)
        if self.has_continuous_action_space:
            print("Initializing a continuous action space policy")
            print(dashes)
            print("starting std of action distribution : ", self.action_std)
            print("decay rate of std of action distribution : ", self.action_std_decay_rate)
            # Fixed: the original referenced the undefined name `min_self`.
            print("minimum std of action distribution : ", self.min_action_std)
            print("decay frequency of std of action distribution : " + str(self.action_std_decay_freq) + " timesteps")
        else:
            print("Initializing a discrete action space policy")
        print(dashes)
        print("PPO_two_critics update frequency : " + str(self.update_timestep) + " timesteps")
        print("PPO_two_critics K epochs : ", self.K_epochs)
        print("PPO_two_critics epsilon clip : ", self.eps_clip)
        print("discount factor (self.gamma) : ", self.gamma)
        print(dashes)
        print("optimizer learning rate actor : ", self.lr_actor)
        print("optimizer learning rate critic : ", self.lr_critic)
        if self.random_seed:
            print(dashes)
            print("setting random seed to ", self.random_seed)

        print(rule)

        ################# training procedure ################

        # initialize a PPO agent
        self.ppo_agent = self._new_policy()

        # track total training time (datetime.now() is local time)
        start_time = datetime.now().replace(microsecond=0)
        print("Started training at (GMT) : ", start_time)

        print(rule)

        # printing and logging accumulators
        print_running_reward = 0
        print_running_episodes = 0

        log_running_reward = 0
        log_running_episodes = 0

        time_step = 0
        i_episode = 0

        # The context manager closes the log even if training raises.
        with open(log_f_name, "w+") as log_f:
            log_f.write('episode,timestep,reward\n')

            # training loop
            while time_step <= self.max_training_timesteps:

                state = env.reset()
                current_ep_reward = 0

                for _ in range(1, self.max_ep_len + 1):

                    # select action with policy
                    action = self.ppo_agent.select_action(state)
                    state, reward, done, _info = env.step(action)

                    # store reward and terminal flag for the next update
                    self.ppo_agent.buffer.rewards.append(reward)
                    self.ppo_agent.buffer.is_terminals.append(done)

                    time_step += 1
                    current_ep_reward += reward

                    # update PPO_two_critics agent
                    if time_step % self.update_timestep == 0:
                        self.ppo_agent.update()

                    # continuous action space only: decay the action std
                    if self.has_continuous_action_space and time_step % self.action_std_decay_freq == 0:
                        # Fixed: the original called the non-existent
                        # attribute `decay_self` and passed the current std
                        # where the minimum std is expected.
                        self.ppo_agent.decay_action_std(self.action_std_decay_rate, self.min_action_std)

                    # write the average reward of the episodes finished
                    # since the previous log entry
                    if time_step % self.log_freq == 0:
                        log_avg_reward = round(log_running_reward / log_running_episodes, 4)

                        log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                        log_f.flush()

                        log_running_reward = 0
                        log_running_episodes = 0

                    # print the average reward of the episodes finished
                    # since the previous print
                    if time_step % self.print_freq == 0:
                        print_avg_reward = round(print_running_reward / print_running_episodes, 2)

                        print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))

                        print_running_reward = 0
                        print_running_episodes = 0

                    # save model weights
                    if time_step % self.save_model_freq == 0:
                        print(dashes)
                        print("saving model at : " + checkpoint_path)
                        self.ppo_agent.save(checkpoint_path)
                        print("model saved")
                        print("Elapsed Time : ", datetime.now().replace(microsecond=0) - start_time)
                        print(dashes)

                    # stop stepping once the episode is over
                    if done:
                        break

                print_running_reward += current_ep_reward
                print_running_episodes += 1

                log_running_reward += current_ep_reward
                log_running_episodes += 1

                i_episode += 1

        # print total training time
        print(rule)
        end_time = datetime.now().replace(microsecond=0)
        print("Started training at (GMT) : ", start_time)
        print("Finished training at (GMT) : ", end_time)
        print("Total training time : ", end_time - start_time)
        print(rule)

    def get_action(self, state):
        """Return the policy's action for ``state``.

        A dictionary state is flattened to
        ``concatenate(inventory_level, machine_setup)`` when ``dict_obs`` is
        false; any other state is passed to the policy unchanged.

        NOTE(review): the nesting of the ``else`` branch was reconstructed
        from a source whose indentation was lost; it is attached to the
        ``isinstance`` test here — confirm against the repository.

        Raises
        ------
        ValueError
            If ``dict_obs`` is true but ``state`` is not a dict. (The
            original raised a plain string, which fails with ``TypeError``.)
        """
        if isinstance(state, dict):
            if not self.dict_obs:
                state = np.concatenate(
                    (
                        state['inventory_level'],  # n_items entries
                        state['machine_setup'],    # n_machines entries
                    )
                )
        elif self.dict_obs:
            raise ValueError('Change dict_obs to False')
        return self.ppo_agent.select_action(state)

    def load_agent(self, path):
        """Load the policy weights of this experiment's checkpoint.

        Parameters
        ----------
        path : any
            Currently unused; the checkpoint location is derived from
            ``LOG_DIR``, ``experiment_name``, ``random_seed`` and
            ``run_num_pretrained``. Kept for interface compatibility.
        """
        _, checkpoint_path = self._checkpoint_path()
        print("loading network from : " + checkpoint_path)
        self.ppo_agent.load(checkpoint_path)
|
code/Lot-sizing/agents/{PDPPO_one_critic.py → PPOtwocritics.py}
RENAMED
|
@@ -5,10 +5,8 @@ Created on Wed Mar 1 00:43:49 2023
|
|
| 5 |
@author: leona
|
| 6 |
"""
|
| 7 |
|
| 8 |
-
import numpy as np
|
| 9 |
import torch
|
| 10 |
import torch.nn as nn
|
| 11 |
-
import torch.nn.init as init
|
| 12 |
from torch.distributions import MultivariateNormal
|
| 13 |
from torch.distributions import Categorical
|
| 14 |
|
|
@@ -25,26 +23,22 @@ else:
|
|
| 25 |
print("============================================================================================")
|
| 26 |
|
| 27 |
|
| 28 |
-
##################################
|
| 29 |
class RolloutBuffer:
|
| 30 |
def __init__(self):
|
| 31 |
self.actions = []
|
| 32 |
self.states = []
|
| 33 |
-
self.post_states = []
|
| 34 |
self.logprobs = []
|
| 35 |
self.rewards = []
|
| 36 |
self.state_values = []
|
| 37 |
-
self.state_values_post = []
|
| 38 |
self.is_terminals = []
|
| 39 |
|
| 40 |
def clear(self):
|
| 41 |
del self.actions[:]
|
| 42 |
del self.states[:]
|
| 43 |
-
del self.post_states[:]
|
| 44 |
del self.logprobs[:]
|
| 45 |
del self.rewards[:]
|
| 46 |
del self.state_values[:]
|
| 47 |
-
del self.state_values_post[:]
|
| 48 |
del self.is_terminals[:]
|
| 49 |
|
| 50 |
|
|
@@ -74,7 +68,6 @@ class ActorCritic(nn.Module):
|
|
| 74 |
self.fc2 = nn.Linear(128, 128)
|
| 75 |
self.actor = nn.Linear(128, self.action_dim.nvec.sum())
|
| 76 |
|
| 77 |
-
|
| 78 |
# critic
|
| 79 |
self.critic = nn.Sequential(
|
| 80 |
nn.Linear(state_dim, 128),
|
|
@@ -84,6 +77,13 @@ class ActorCritic(nn.Module):
|
|
| 84 |
nn.Linear(128, 1)
|
| 85 |
)
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
def forward(self, state):
|
| 89 |
raise NotImplementedError
|
|
@@ -100,24 +100,28 @@ class ActorCritic(nn.Module):
|
|
| 100 |
|
| 101 |
|
| 102 |
|
| 103 |
-
def act(self, state
|
| 104 |
|
| 105 |
if self.has_continuous_action_space:
|
| 106 |
action_mean = self.actor(state)
|
| 107 |
cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
|
| 108 |
dist = MultivariateNormal(action_mean, cov_mat)
|
| 109 |
else:
|
|
|
|
| 110 |
x = nn.functional.relu(self.fc2(nn.functional.relu(self.fc1(state))))
|
| 111 |
logits = self.actor(x)
|
| 112 |
action_probs = nn.functional.softmax(logits, dim=-1)
|
| 113 |
dist = Categorical(action_probs.view(len(self.action_dim.nvec),-1))
|
|
|
|
|
|
|
| 114 |
|
| 115 |
action = dist.sample()
|
| 116 |
action_logprob = dist.log_prob(action)
|
| 117 |
-
|
| 118 |
-
|
|
|
|
| 119 |
|
| 120 |
-
def evaluate(self, state,
|
| 121 |
|
| 122 |
if self.has_continuous_action_space:
|
| 123 |
action_mean = self.actor(state)
|
|
@@ -130,30 +134,29 @@ class ActorCritic(nn.Module):
|
|
| 130 |
if self.action_dim == 1:
|
| 131 |
action = action.reshape(-1, self.action_dim)
|
| 132 |
else:
|
|
|
|
| 133 |
x = nn.functional.relu(self.fc2(nn.functional.relu(self.fc1(state))))
|
| 134 |
logits = self.actor(x)
|
| 135 |
action_probs = nn.functional.softmax(logits, dim=-1)
|
| 136 |
-
|
| 137 |
dist = Categorical(action_probs.view(state.shape[0],len(self.action_dim.nvec),-1))
|
| 138 |
# action_probs = self.actor(state)
|
| 139 |
# dist = Categorical(action_probs)
|
| 140 |
action_logprobs = dist.log_prob(action)
|
| 141 |
dist_entropy = dist.entropy()
|
| 142 |
-
state_values = self.critic(
|
|
|
|
| 143 |
|
| 144 |
-
return action_logprobs, state_values, dist_entropy
|
| 145 |
|
| 146 |
|
| 147 |
-
class
|
| 148 |
-
def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip,
|
| 149 |
|
| 150 |
self.has_continuous_action_space = has_continuous_action_space
|
| 151 |
|
| 152 |
if has_continuous_action_space:
|
| 153 |
self.action_std = action_std_init
|
| 154 |
-
|
| 155 |
-
self.tau = tau
|
| 156 |
-
self.env = env
|
| 157 |
self.gamma = gamma
|
| 158 |
self.eps_clip = eps_clip
|
| 159 |
self.K_epochs = K_epochs
|
|
@@ -164,7 +167,7 @@ class PDPPO:
|
|
| 164 |
self.optimizer = torch.optim.Adam([
|
| 165 |
{'params': self.policy.actor.parameters(), 'lr': lr_actor},
|
| 166 |
{'params': self.policy.critic.parameters(), 'lr': lr_critic}
|
| 167 |
-
]
|
| 168 |
|
| 169 |
self.policy_old = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
|
| 170 |
self.policy_old.load_state_dict(self.policy.state_dict())
|
|
@@ -178,7 +181,7 @@ class PDPPO:
|
|
| 178 |
self.policy_old.set_action_std(new_action_std)
|
| 179 |
else:
|
| 180 |
print("--------------------------------------------------------------------------------------------")
|
| 181 |
-
print("WARNING : Calling
|
| 182 |
print("--------------------------------------------------------------------------------------------")
|
| 183 |
|
| 184 |
def decay_action_std(self, action_std_decay_rate, min_action_std):
|
|
@@ -194,64 +197,33 @@ class PDPPO:
|
|
| 194 |
self.set_action_std(self.action_std)
|
| 195 |
|
| 196 |
else:
|
| 197 |
-
print("WARNING : Calling
|
| 198 |
print("--------------------------------------------------------------------------------------------")
|
| 199 |
-
|
| 200 |
-
def
|
| 201 |
-
setup_loss = np.zeros(self.env.n_machines, dtype=int)
|
| 202 |
-
setup_costs = np.zeros(self.env.n_machines)
|
| 203 |
-
# if we are just changing the setup, we use the setup cost matrix with the corresponding position given by the actual setup and the new setup
|
| 204 |
-
for m in range(self.env.n_machines):
|
| 205 |
-
if action[m] != 0: # if the machine is not iddle
|
| 206 |
-
# 1. IF NEEDED CHANGE SETUP
|
| 207 |
-
if machine_setup[m] != action[m] and action[m] != 0:
|
| 208 |
-
setup_costs[m] = self.env.setup_costs[m][action[m] - 1]
|
| 209 |
-
setup_loss[m] = self.env.setup_loss[m][action[m] - 1]
|
| 210 |
-
machine_setup[m] = action[m]
|
| 211 |
-
# 2. PRODUCTION
|
| 212 |
-
production = self.env.machine_production_matrix[m][action[m] - 1] - setup_loss[m]
|
| 213 |
-
inventory_level[action[m] - 1] += production
|
| 214 |
-
else:
|
| 215 |
-
machine_setup[m] = 0
|
| 216 |
-
# return the new machine_setup_inventory_level and the setup_cost
|
| 217 |
-
return machine_setup, inventory_level, setup_costs
|
| 218 |
-
|
| 219 |
-
def select_action(self, state,tau):
|
| 220 |
|
| 221 |
if self.has_continuous_action_space:
|
| 222 |
with torch.no_grad():
|
| 223 |
state = torch.FloatTensor(state).to(device)
|
| 224 |
-
action, action_logprob, state_val = self.policy_old.act(state
|
| 225 |
|
| 226 |
self.buffer.states.append(state)
|
| 227 |
self.buffer.actions.append(action)
|
| 228 |
self.buffer.logprobs.append(action_logprob)
|
| 229 |
self.buffer.state_values.append(state_val)
|
|
|
|
| 230 |
|
| 231 |
return action.detach().cpu().numpy().flatten()
|
| 232 |
else:
|
| 233 |
with torch.no_grad():
|
| 234 |
state = torch.FloatTensor(state).to(device)
|
| 235 |
-
action, action_logprob = self.policy_old.act(state
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
machine_setup, inventory_level, setup_cost = self.get_post_state(action, state[self.env.n_items:self.env.n_items+self.env.n_machines].clone(), state[0:self.env.n_items].clone())
|
| 239 |
-
|
| 240 |
-
post_state = state.clone()
|
| 241 |
-
post_state[self.env.n_items:self.env.n_items+self.env.n_machines] = machine_setup.clone()
|
| 242 |
-
post_state[0:self.env.n_items] = inventory_level.clone()
|
| 243 |
-
post_state = torch.FloatTensor(post_state).to(device)
|
| 244 |
|
| 245 |
self.buffer.states.append(state)
|
| 246 |
-
self.buffer.post_states.append(post_state)
|
| 247 |
self.buffer.actions.append(action)
|
| 248 |
self.buffer.logprobs.append(action_logprob)
|
| 249 |
-
|
| 250 |
-
with torch.no_grad():
|
| 251 |
-
#post_state = torch.cat([post_state.clone(),state.clone()])
|
| 252 |
-
state_val = self.policy_old.critic(post_state)
|
| 253 |
-
|
| 254 |
self.buffer.state_values.append(state_val)
|
|
|
|
| 255 |
|
| 256 |
return action.numpy()
|
| 257 |
|
|
@@ -271,19 +243,19 @@ class PDPPO:
|
|
| 271 |
|
| 272 |
# convert list to tensor
|
| 273 |
old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
|
| 274 |
-
old_post_states = torch.squeeze(torch.stack(self.buffer.post_states, dim=0)).detach().to(device)
|
| 275 |
old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
|
| 276 |
old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)
|
| 277 |
old_state_values = torch.squeeze(torch.stack(self.buffer.state_values, dim=0)).detach().to(device)
|
|
|
|
| 278 |
|
| 279 |
# calculate advantages
|
| 280 |
-
advantages = rewards.detach() - old_state_values.detach()
|
| 281 |
|
| 282 |
# Optimize policy for K epochs
|
| 283 |
for _ in range(self.K_epochs):
|
| 284 |
|
| 285 |
# Evaluating old actions and values
|
| 286 |
-
logprobs, state_values, dist_entropy = self.policy.evaluate(old_states,
|
| 287 |
|
| 288 |
# match state_values tensor dimensions with rewards tensor
|
| 289 |
state_values = torch.squeeze(state_values)
|
|
@@ -295,19 +267,17 @@ class PDPPO:
|
|
| 295 |
surr1 = ratios * advantages.unsqueeze(1)
|
| 296 |
surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages.unsqueeze(1)
|
| 297 |
|
| 298 |
-
# final loss of clipped objective
|
| 299 |
-
loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(state_values, rewards) - 0.012 * dist_entropy
|
| 300 |
|
| 301 |
loss_numpy = loss.detach().numpy()
|
| 302 |
|
| 303 |
# take gradient step
|
| 304 |
self.optimizer.zero_grad()
|
| 305 |
loss.mean().backward()
|
| 306 |
-
torch.nn.utils.clip_grad_norm_(self.policy.parameters(), max_norm=1)
|
| 307 |
self.optimizer.step()
|
| 308 |
|
| 309 |
# Copy new weights into old policy
|
| 310 |
-
|
| 311 |
self.policy_old.load_state_dict(self.policy.state_dict())
|
| 312 |
|
| 313 |
# clear buffer
|
|
|
|
| 5 |
@author: leona
|
| 6 |
"""
|
| 7 |
|
|
|
|
| 8 |
import torch
|
| 9 |
import torch.nn as nn
|
|
|
|
| 10 |
from torch.distributions import MultivariateNormal
|
| 11 |
from torch.distributions import Categorical
|
| 12 |
|
|
|
|
| 23 |
print("============================================================================================")
|
| 24 |
|
| 25 |
|
| 26 |
+
################################## PPO_two_critics Policy ##################################
|
| 27 |
class RolloutBuffer:
    """Per-rollout storage for the two-critic PPO agent.

    Each attribute is a plain list that grows by one entry per environment
    step. ``state_values`` and ``state_values_2`` hold the estimates of the
    first and second critic respectively; the agent's ``select_action``
    appends to both and ``update`` stacks both into tensors.
    """

    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.state_values = []
        # Second critic's estimates. Without this list the agent fails with
        # AttributeError on the first call to select_action.
        self.state_values_2 = []
        self.is_terminals = []

    def clear(self):
        """Empty every list in place so existing references stay valid."""
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.state_values[:]
        del self.state_values_2[:]
        del self.is_terminals[:]
|
| 43 |
|
| 44 |
|
|
|
|
| 68 |
self.fc2 = nn.Linear(128, 128)
|
| 69 |
self.actor = nn.Linear(128, self.action_dim.nvec.sum())
|
| 70 |
|
|
|
|
| 71 |
# critic
|
| 72 |
self.critic = nn.Sequential(
|
| 73 |
nn.Linear(state_dim, 128),
|
|
|
|
| 77 |
nn.Linear(128, 1)
|
| 78 |
)
|
| 79 |
|
| 80 |
+
self.critic_2 = nn.Sequential(
|
| 81 |
+
nn.Linear(state_dim, 128),
|
| 82 |
+
nn.Tanh(),
|
| 83 |
+
nn.Linear(128, 128),
|
| 84 |
+
nn.Tanh(),
|
| 85 |
+
nn.Linear(128, 1)
|
| 86 |
+
)
|
| 87 |
|
| 88 |
def forward(self, state):
|
| 89 |
raise NotImplementedError
|
|
|
|
| 100 |
|
| 101 |
|
| 102 |
|
| 103 |
+
def act(self, state):
    """Sample an action for ``state`` and evaluate both critics.

    Returns
    -------
    tuple
        ``(action, action_logprob, state_val, state_val_2)``, all detached
        from the autograd graph. ``state_val`` comes from ``self.critic`` and
        ``state_val_2`` from ``self.critic_2``. The agent's ``select_action``
        unpacks exactly these four values.
    """
    if self.has_continuous_action_space:
        action_mean = self.actor(state)
        cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
        dist = MultivariateNormal(action_mean, cov_mat)
    else:
        hidden = nn.functional.relu(self.fc2(nn.functional.relu(self.fc1(state))))
        logits = self.actor(hidden)
        action_probs = nn.functional.softmax(logits, dim=-1)
        # One categorical distribution per entry of the MultiDiscrete space:
        # the flat probability vector is reshaped to (len(nvec), -1).
        dist = Categorical(action_probs.view(len(self.action_dim.nvec), -1))

    action = dist.sample()
    action_logprob = dist.log_prob(action)
    state_val = self.critic(state)
    # The second critic must be evaluated here as well, otherwise callers
    # that unpack four values fail with "not enough values to unpack".
    state_val_2 = self.critic_2(state)

    return (
        action.cpu().detach(),
        action_logprob.detach(),
        state_val.detach(),
        state_val_2.detach(),
    )
|
| 123 |
|
| 124 |
+
def evaluate(self, state, action):
|
| 125 |
|
| 126 |
if self.has_continuous_action_space:
|
| 127 |
action_mean = self.actor(state)
|
|
|
|
| 134 |
if self.action_dim == 1:
|
| 135 |
action = action.reshape(-1, self.action_dim)
|
| 136 |
else:
|
| 137 |
+
#x = nn.functional.relu(self.fc(state))
|
| 138 |
x = nn.functional.relu(self.fc2(nn.functional.relu(self.fc1(state))))
|
| 139 |
logits = self.actor(x)
|
| 140 |
action_probs = nn.functional.softmax(logits, dim=-1)
|
|
|
|
| 141 |
dist = Categorical(action_probs.view(state.shape[0],len(self.action_dim.nvec),-1))
|
| 142 |
# action_probs = self.actor(state)
|
| 143 |
# dist = Categorical(action_probs)
|
| 144 |
action_logprobs = dist.log_prob(action)
|
| 145 |
dist_entropy = dist.entropy()
|
| 146 |
+
state_values = self.critic(state)
|
| 147 |
+
state_values_2 = self.critic_2(state)
|
| 148 |
|
| 149 |
+
return action_logprobs, state_values, state_values_2, dist_entropy
|
| 150 |
|
| 151 |
|
| 152 |
+
class PPOtwocritics:
|
| 153 |
+
def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std_init=0.6):
|
| 154 |
|
| 155 |
self.has_continuous_action_space = has_continuous_action_space
|
| 156 |
|
| 157 |
if has_continuous_action_space:
|
| 158 |
self.action_std = action_std_init
|
| 159 |
+
|
|
|
|
|
|
|
| 160 |
self.gamma = gamma
|
| 161 |
self.eps_clip = eps_clip
|
| 162 |
self.K_epochs = K_epochs
|
|
|
|
| 167 |
self.optimizer = torch.optim.Adam([
|
| 168 |
{'params': self.policy.actor.parameters(), 'lr': lr_actor},
|
| 169 |
{'params': self.policy.critic.parameters(), 'lr': lr_critic}
|
| 170 |
+
])
|
| 171 |
|
| 172 |
self.policy_old = ActorCritic(state_dim, action_dim, has_continuous_action_space, action_std_init).to(device)
|
| 173 |
self.policy_old.load_state_dict(self.policy.state_dict())
|
|
|
|
| 181 |
self.policy_old.set_action_std(new_action_std)
|
| 182 |
else:
|
| 183 |
print("--------------------------------------------------------------------------------------------")
|
| 184 |
+
print("WARNING : Calling PPO_two_critics::set_action_std() on discrete action space policy")
|
| 185 |
print("--------------------------------------------------------------------------------------------")
|
| 186 |
|
| 187 |
def decay_action_std(self, action_std_decay_rate, min_action_std):
|
|
|
|
| 197 |
self.set_action_std(self.action_std)
|
| 198 |
|
| 199 |
else:
|
| 200 |
+
print("WARNING : Calling PPO_two_critics::decay_action_std() on discrete action space policy")
|
| 201 |
print("--------------------------------------------------------------------------------------------")
|
| 202 |
+
|
| 203 |
+
def select_action(self, state):
    """Query the old policy for an action and record the step in the buffer.

    The state is converted to a float tensor on ``device``; the sampled
    action, its log-probability and both critic estimates are appended to
    ``self.buffer``. Continuous policies return a flattened NumPy array,
    discrete policies return ``action.numpy()``.
    """
    # Sampling never needs gradients.
    with torch.no_grad():
        state = torch.FloatTensor(state).to(device)
        action, action_logprob, state_val, state_val_2 = self.policy_old.act(state)

    rollout = self.buffer
    rollout.states.append(state)
    rollout.actions.append(action)
    rollout.logprobs.append(action_logprob)
    rollout.state_values.append(state_val)
    rollout.state_values_2.append(state_val_2)

    if self.has_continuous_action_space:
        return action.detach().cpu().numpy().flatten()
    return action.numpy()
|
| 229 |
|
|
|
|
| 243 |
|
| 244 |
# convert list to tensor
|
| 245 |
old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
|
|
|
|
| 246 |
old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
|
| 247 |
old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)
|
| 248 |
old_state_values = torch.squeeze(torch.stack(self.buffer.state_values, dim=0)).detach().to(device)
|
| 249 |
+
old_state_values_2 = torch.squeeze(torch.stack(self.buffer.state_values_2, dim=0)).detach().to(device)
|
| 250 |
|
| 251 |
# calculate advantages
|
| 252 |
+
advantages = rewards.detach() - torch.min(old_state_values.detach(), old_state_values_2.detach()).detach()
|
| 253 |
|
| 254 |
# Optimize policy for K epochs
|
| 255 |
for _ in range(self.K_epochs):
|
| 256 |
|
| 257 |
# Evaluating old actions and values
|
| 258 |
+
logprobs, state_values, state_values_2, dist_entropy = self.policy.evaluate(old_states, old_actions, self.tau)
|
| 259 |
|
| 260 |
# match state_values tensor dimensions with rewards tensor
|
| 261 |
state_values = torch.squeeze(state_values)
|
|
|
|
| 267 |
surr1 = ratios * advantages.unsqueeze(1)
|
| 268 |
surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages.unsqueeze(1)
|
| 269 |
|
| 270 |
+
# final loss of clipped objective PPO_two_critics
|
| 271 |
+
loss = -torch.min(surr1, surr2) + 0.5 * self.MseLoss(torch.min(state_values,state_values_2.squeeze()), rewards) - 0.012 * dist_entropy
|
| 272 |
|
| 273 |
loss_numpy = loss.detach().numpy()
|
| 274 |
|
| 275 |
# take gradient step
|
| 276 |
self.optimizer.zero_grad()
|
| 277 |
loss.mean().backward()
|
|
|
|
| 278 |
self.optimizer.step()
|
| 279 |
|
| 280 |
# Copy new weights into old policy
|
|
|
|
| 281 |
self.policy_old.load_state_dict(self.policy.state_dict())
|
| 282 |
|
| 283 |
# clear buffer
|
code/Lot-sizing/agents/__init__.py
CHANGED
|
@@ -1,36 +1,17 @@
|
|
| 1 |
-
from .
|
| 2 |
-
from .
|
| 3 |
-
from .
|
| 4 |
-
from .
|
| 5 |
-
from .
|
| 6 |
from .stableBaselineAgents import StableBaselineAgent
|
| 7 |
-
from .regressionTreeApproximation import RegressionTreeApproximation
|
| 8 |
-
from .PSOAgent import PSOagent
|
| 9 |
-
from .adpAgentHD import AdpAgentHD
|
| 10 |
-
from .adpAgentHD1 import AdpAgentHD1
|
| 11 |
-
from .adpAgentHD3 import AdpAgentHD3
|
| 12 |
-
from .multiAgentRL import MultiAgentRL
|
| 13 |
from .perfectInfoAgent import PerfectInfoAgent
|
| 14 |
-
from .ensembleAgent import EnsembleAgent
|
| 15 |
-
from .PPOAgent import PPOAgent
|
| 16 |
-
from .PDPPOAgent_one_critic import PDPPOAgent
|
| 17 |
-
|
| 18 |
|
| 19 |
__all__ = [
|
| 20 |
"DummyAgent",
|
| 21 |
-
"
|
| 22 |
-
"StochasticProgrammingAgent",
|
| 23 |
-
"ValueIteration",
|
| 24 |
-
"ValueIterationMC",
|
| 25 |
-
"RegressionTreeApproximation",
|
| 26 |
"StableBaselineAgent",
|
| 27 |
-
"PSOagent",
|
| 28 |
-
"AdpAgentHD",
|
| 29 |
-
"AdpAgentHD1",
|
| 30 |
-
"AdpAgentHD3",
|
| 31 |
-
"MultiAgentRL",
|
| 32 |
-
"PerfectInfoAgent",
|
| 33 |
-
"EnsembleAgent",
|
| 34 |
"PPOAgent",
|
|
|
|
|
|
|
| 35 |
"PDPPOAgent_one_critic"
|
| 36 |
]
|
|
|
|
| 1 |
+
from .PPOAgent import PPOAgent
|
| 2 |
+
from .PDPPOAgent import PDPPOAgent
|
| 3 |
+
from .PPOAgent_two_critics import PPOAgent_two_critics
|
| 4 |
+
from .PDPPOAgent_one_critic import PDPPOAgent_one_critic
|
| 5 |
+
from .stableBaselineAgents import StableBaselineAgent
|
| 6 |
from .stableBaselineAgents import StableBaselineAgent
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
from .perfectInfoAgent import PerfectInfoAgent
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
# Public API of the agents package. Every name listed here must be imported
# above: "DummyAgent" was dropped because this module no longer imports it and
# a star-import would otherwise fail with AttributeError.
__all__ = [
    "PerfectInfoAgent",
    "StableBaselineAgent",
    "PPOAgent",
    "PPOAgent_two_critics",
    "PDPPOAgent",
    "PDPPOAgent_one_critic",
]
|
code/Lot-sizing/agents/__pycache__/PDPPO.cpython-38.pyc
ADDED
|
Binary file (8.99 kB). View file
|
|
|
code/Lot-sizing/agents/__pycache__/PDPPOAgent.cpython-38.pyc
ADDED
|
Binary file (8.68 kB). View file
|
|
|
code/Lot-sizing/agents/__pycache__/PDPPOAgent_one_critic.cpython-38.pyc
ADDED
|
Binary file (8.86 kB). View file
|
|
|
code/Lot-sizing/agents/__pycache__/PDPPO_one_critic.cpython-38.pyc
ADDED
|
Binary file (8.73 kB). View file
|
|
|
code/Lot-sizing/agents/__pycache__/PDPPOonecritic.cpython-38.pyc
ADDED
|
Binary file (8.82 kB). View file
|
|
|
code/Lot-sizing/agents/__pycache__/PPO.cpython-38.pyc
ADDED
|
Binary file (7.5 kB). View file
|
|
|
code/Lot-sizing/agents/__pycache__/PPOAgent.cpython-38.pyc
ADDED
|
Binary file (8.48 kB). View file
|
|
|
code/Lot-sizing/agents/__pycache__/PPOAgent_two_critics.cpython-38.pyc
ADDED
|
Binary file (8.65 kB). View file
|
|
|
code/Lot-sizing/agents/__pycache__/PPOtwocritics.cpython-38.pyc
ADDED
|
Binary file (7.93 kB). View file
|
|
|
code/Lot-sizing/agents/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (548 Bytes). View file
|
|
|
code/Lot-sizing/agents/__pycache__/perfectInfoAgent.cpython-38.pyc
ADDED
|
Binary file (1.04 kB). View file
|
|
|
code/Lot-sizing/agents/__pycache__/stableBaselineAgents.cpython-38.pyc
ADDED
|
Binary file (11 kB). View file
|
|
|
code/Lot-sizing/agents/perfectInfoAgent.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
from models import *
|
| 3 |
+
from envs import *
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class PerfectInfoAgent():
    """Agent that replays a plan computed once by ``PerfectInfoOptimization``.

    The solver is run in the constructor; its second return value is stored,
    cast to ``int``, in ``self.sol``. ``get_action`` then reads the column of
    that plan matching the environment's current step.
    """

    def __init__(self, env, settings):
        # ``settings`` is accepted for interface parity but not used here.
        super().__init__()
        self.env = env
        self.solver = PerfectInfoOptimization(env)
        _first, plan, _third = self.solver.solve()
        self.sol = plan.astype(int)

    def learn(self, epochs = 1000):
        """No-op: the plan is already computed in ``__init__``."""
        return None

    def get_action(self, obs):
        """Return the planned action for ``env.current_step``; ``obs`` is ignored."""
        step = self.env.current_step
        return list(self.sol[:, step])
|
code/Lot-sizing/agents/stableBaselineAgents.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
import gym
|
| 5 |
+
import torch
|
| 6 |
+
import numpy as np
|
| 7 |
+
import copy
|
| 8 |
+
from envs import SimplePlant
|
| 9 |
+
import matplotlib.pyplot as plt
|
| 10 |
+
import matplotlib.patches as mpatches
|
| 11 |
+
from stable_baselines3 import PPO,A2C,DQN,SAC,DDPG
|
| 12 |
+
from stable_baselines3.common.monitor import Monitor
|
| 13 |
+
from stable_baselines3.common.vec_env import SubprocVecEnv
|
| 14 |
+
from stable_baselines3.common.callbacks import EvalCallback
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class SimplePlantSB(SimplePlant):
    """``SimplePlant`` adapted for Stable-Baselines3.

    Adds Gym action/observation spaces and extends each observation with the
    inventory level recorded before the latest action
    (``last_inventory_level``). When ``settings['dict_obs']`` is truthy the
    observation stays a dict; otherwise it is flattened into one array laid
    out as ``[inventory_level, machine_setup, last_inventory_level]``.
    """

    def __init__(self, settings, stoch_model):
        super().__init__(settings, stoch_model)
        # 'dict_obs' is optional; fall back to flat observations when absent.
        try:
            self.dict_obs = settings['dict_obs']
        except (KeyError, TypeError):
            self.dict_obs = False
        self.last_inventory = copy.copy(self.inventory_level)
        self.action_space = gym.spaces.MultiDiscrete(
            [self.n_items + 1] * self.n_machines
        )

        if self.dict_obs:
            def inventory_box():
                # Upper bound is derived from the first item's maximum only.
                return gym.spaces.Box(
                    low=np.zeros(self.n_items),
                    high=np.ones(self.n_items) * (settings['max_inventory_level'][0] + 1) * self.n_items,
                )

            self.observation_space = gym.spaces.Dict({
                'inventory_level': inventory_box(),
                'machine_setup': gym.spaces.MultiDiscrete([self.n_items + 1] * self.n_machines),
                'last_inventory_level': inventory_box(),
            })
        else:
            self.observation_space = gym.spaces.Box(
                low=np.zeros(2 * self.n_items + self.n_machines),
                high=np.concatenate(
                    [
                        np.array(self.max_inventory_level),  # high for the inventory level
                        np.ones(self.n_machines) * (self.n_items + 1),  # high for the machine setups
                        np.array(self.max_inventory_level),  # high for the last inventory level
                    ]
                ),
                dtype=np.int32,
            )

    def step(self, action):
        """Execute one time step within the environment.

        Parameters
        ----------
        action : action given by the agent

        Returns
        -------
        obs : observation produced by ``_next_observation``
        reward : negative sum of all entries of the cost dict
        done : True once ``current_step`` equals ``T``
        dict : the cost dict returned by ``_take_action``
        """
        # Remember the inventory before acting so it can be exposed in the
        # next observation.
        self.last_inventory = copy.copy(self.inventory_level)

        self.total_cost = self._take_action(
            action, self.machine_setup, self.inventory_level, self.demand
        )
        reward = -sum(self.total_cost.values())

        self.current_step += 1
        done = self.current_step == self.T
        obs = self._next_observation()

        return obs, reward, done, self.total_cost

    def _next_observation(self):
        """Return the parent's observation extended with the last inventory.

        Raises
        ------
        ValueError
            If the parent returns a non-dict observation while ``dict_obs``
            is enabled.
        """
        obs = SimplePlant._next_observation(self)
        if isinstance(obs, dict):
            # Only a dict can take the extra key; doing this before the type
            # check made the non-dict branch below unreachable.
            obs['last_inventory_level'] = copy.copy(self.last_inventory)
            if not self.dict_obs:
                obs = np.concatenate(
                    (
                        obs['inventory_level'],       # n_items size
                        obs['machine_setup'],         # n_machine size
                        obs['last_inventory_level'],  # n_items size
                    )
                )
        elif self.dict_obs:
            # A bare string cannot be raised; use a real exception type.
            raise ValueError('Change dict_obst to False')
        # NOTE(review): a non-dict parent observation is returned unchanged,
        # i.e. without the last inventory appended; confirm this matches the
        # declared observation space if that path is ever used.
        return obs
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
class StableBaselineAgent():
    """Agent backed by a Stable-Baselines3 model.

    The environment is adapted to Stable-Baselines requirements: unless
    ``settings['multiagent']`` is set, the given plant is re-created as a
    ``SimplePlantSB``, which provides the observation space and a matching
    ``_next_observation``.
    """

    def __init__(self, env: "SimplePlant", settings: dict):
        """Build the wrapped environment, the evaluation callback and the model.

        Reads ``multiagent``, ``model_name``, ``experiment_name``,
        ``parallelization`` and ``run`` from ``settings``; ``dict_obs`` is
        optional and defaults to False. ``model_name`` selects one of PPO,
        A2C, DQN, SAC or DDPG; for any other value no model is created.
        """
        super().__init__()

        if settings['multiagent']:
            self.env = env
        else:
            self.env = SimplePlantSB(env.settings, env.stoch_model)
        self.last_inventory = env.inventory_level
        self.model_name = settings['model_name']
        self.experiment_name = settings['experiment_name']
        self.parallelization = settings['parallelization']
        self.run = settings['run']
        try:
            self.dict_obs = settings['dict_obs']
        except (KeyError, TypeError):
            self.dict_obs = False

        self.POSSIBLE_STATES = self.env.n_items + 1
        self.env.cost_to_reward = True
        self.epsilon = 0

        BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        # Use the logs folder located one level above this package.
        self.LOG_DIR = os.path.join(BASE_DIR, 'logs')

        if self.parallelization:
            # For cpu parallelization in StableBaseline learning
            def make_env(seed):
                def _init():
                    return Monitor(
                        self.env,
                        os.path.join(self.LOG_DIR, 'monitor', f'{self.model_name}_{self.experiment_name}_{seed}_{self.run}'),
                        allow_early_resets=True
                    )
                return _init
            num_cpu = 5
            env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
        else:
            env = Monitor(
                self.env,
                os.path.join(self.LOG_DIR, 'monitor', f'{self.model_name}_{self.experiment_name}_{self.run}')
            )

        # Saves the best model seen during periodic evaluation.
        self.eval_callback = EvalCallback(
            env,
            best_model_save_path=os.path.join(self.LOG_DIR, f'best_{self.model_name}_{self.experiment_name}_{self.run}'),
            log_path=f'{self.LOG_DIR}/',
            eval_freq=100,
            deterministic=True,
            verbose=0,
            render=False
        )

        policy = 'MultiInputPolicy' if self.dict_obs else 'MlpPolicy'

        if self.model_name == 'PPO':
            self.model = PPO(
                policy,
                env, verbose = 0, batch_size = 256, n_steps = 256, gamma = 0.96, gae_lambda = 0.9, n_epochs = 20, ent_coef = 0.0, max_grad_norm = 0.5, vf_coef = 0.5, learning_rate = 5e-3, use_sde = False, clip_range = 0.4, policy_kwargs = dict(log_std_init=-2,ortho_init=False,activation_fn=torch.nn.ReLU,net_arch=[dict(pi=[300, 300], vf=[300, 300])])
            )
        elif self.model_name == 'A2C':
            self.model = A2C(
                policy,
                env, verbose = 0, learning_rate=0.002, n_steps=100, gamma = 0.95, vf_coef = 0.7, policy_kwargs= dict(net_arch=[300, 300]), seed = None
            )
        elif self.model_name == 'DQN':
            self.model = DQN(
                policy,
                env, verbose = 0, learning_rate= 2.3e-3, buffer_size=100000, learning_starts=1000, batch_size=32, tau=1.0, gamma=0.99, target_update_interval=10, train_freq= 256, gradient_steps= 128, exploration_fraction=0.16, exploration_initial_eps=0.04, policy_kwargs= dict(net_arch=[300, 300]), seed = None
            )
        elif self.model_name == 'SAC':
            self.model = SAC(
                policy,
                env, verbose = 0, learning_rate=0.0003, buffer_size=1000000, learning_starts=1000, batch_size=256, tau=0.005, gamma=0.99, train_freq=1, gradient_steps=1, seed = None, action_noise=None, replay_buffer_class=None, replay_buffer_kwargs=None, optimize_memory_usage=False, ent_coef='auto', target_update_interval=1, target_entropy='auto', use_sde=False, sde_sample_freq=-1, use_sde_at_warmup=False, tensorboard_log=None, create_eval_env=False, policy_kwargs=dict(activation_fn=torch.nn.ReLU,net_arch=[dict(pi=[300, 300], vf=[300, 300])])
            )
        elif self.model_name == 'DDPG':
            self.model = DDPG(
                policy,
                env, verbose = 0, learning_rate=0.0003, buffer_size=1000000, learning_starts=1000, batch_size=256
            )

    def get_action(self, obs):
        """Return the model's deterministic action for ``obs``.

        Dict observations get a ``last_inventory_level`` entry added in place
        (the caller's dict is modified) and are flattened in key order unless
        ``dict_obs`` is enabled; afterwards ``self.last_inventory`` is updated
        from ``obs['inventory_level']``. Non-dict observations are passed to
        the model unchanged.

        Raises
        ------
        ValueError
            If a non-dict observation is given while ``dict_obs`` is enabled.
        """
        if isinstance(obs, dict):
            obs['last_inventory_level'] = copy.copy(self.last_inventory)
            if self.dict_obs:
                act = self.model.predict(obs, deterministic=True)[0]
            else:
                flat_obs = np.array(np.concatenate(list(obs.values())))
                act = self.model.predict(flat_obs, deterministic=True)[0]
            self.last_inventory = copy.copy(obs['inventory_level'])
        else:
            if self.dict_obs:
                # A bare string cannot be raised; use a real exception type.
                raise ValueError('Change the policy to dictionary observations')
            act = self.model.predict(obs, deterministic=True)[0]
            # NOTE(review): last_inventory is not refreshed for array
            # observations because their layout is not known here.
        return act

    def learn(self, epochs=1000):
        """Train the model for ``epochs * env.T`` steps and close the env."""
        print(f"{self.model_name} learning...")
        start_time = time.time()

        # The EvalCallback built in __init__ saves the best model.
        # The number of epochs is multiplied by the number of time periods to
        # obtain the number of training steps.
        self.model.learn(
            epochs*self.env.T,
            callback=self.eval_callback,
        )

        self.env.close()

        time_duration = time.time() - start_time
        print(f"Finished Learning {time_duration:.2f} s")

    def load_agent(self, path):
        """Replace ``self.model`` with the checkpoint at ``path``.

        The loader class is chosen from ``self.model_name``; unknown names
        leave the current model untouched.
        """
        if self.model_name == 'PPO':
            self.model = PPO.load(path)
        elif self.model_name == 'A2C':
            self.model = A2C.load(path)
        elif self.model_name == 'DQN':
            self.model = DQN.load(path)
        elif self.model_name == 'SAC':
            self.model = SAC.load(path)
        elif self.model_name == 'DDPG':
            # Previously loaded with SAC.load, i.e. the wrong algorithm class.
            self.model = DDPG.load(path)

    def plot_policy(self, seed=1):
        """Plot the greedy action per (inventory 1, inventory 2, setup) cell.

        ONLY WORKING FOR 2 ITEMS 1 MACHINE. The figure is written to
        ``results/policy_function_<model>_<experiment>_<seed>.pdf``.
        """
        cmap = plt.cm.get_cmap('viridis', 3)
        n_inv_1 = self.env.max_inventory_level[0] + 1
        n_inv_2 = self.env.max_inventory_level[1] + 1
        n_setups = self.env.n_items + 1
        policy_map = np.zeros((n_inv_1, n_inv_2, n_setups))
        for i in range(n_inv_1):
            for j in range(n_inv_2):
                for k in range(n_setups):
                    obs = np.expand_dims(np.array([i, j, k]), axis = 0)
                    prediction = self.model.predict(obs, deterministic=True)[0]
                    # The prediction may carry one or two leading axes.
                    try:
                        action = prediction[0][0]
                    except (IndexError, TypeError):
                        action = prediction[0]
                    policy_map[i, j, k] = action
        self.policy = policy_map

        fig, axs = plt.subplots(1, self.POSSIBLE_STATES)
        fig.suptitle('Found Policy')
        for i, ax in enumerate(axs):
            ax.set_title(f'Setup {i}')
            im = ax.pcolormesh(
                self.policy[:, :, i], cmap = cmap, edgecolors='k', linewidth=2
            )
            im.set_clim(0, self.POSSIBLE_STATES - 1)
            ax.set_xlabel('I2')
            if i == 0:
                ax.set_ylabel('I1')

        # COLOR BAR: one legend patch per action value, attached to the last axis.
        bound = [0, 1, 2]
        fig.subplots_adjust(bottom=0.2)
        ax.legend(
            [mpatches.Patch(color=cmap(b)) for b in bound],
            ['{}'.format(i) for i in range(3)],
            loc='upper center', bbox_to_anchor=(-0.8, -0.13),
            fancybox=True, shadow=True, ncol=3
        )
        fig.savefig(os.path.join('results', f'policy_function_{self.model_name}_{self.experiment_name}_{seed}.pdf'), bbox_inches='tight')
        plt.close()

    def plot_value_function(self, seed):
        """Plot the mean value over all actions for every state cell.

        ONLY WORKING FOR 2 ITEMS 1 MACHINE. Values come from the policy's
        ``evaluate_actions`` when that call succeeds and from ``q_net``
        otherwise. The figure is written to
        ``results/value_function_<model>_<experiment>_<run>_<seed>.pdf``.
        """
        n_inv_1 = self.env.max_inventory_level[0] + 1
        n_inv_2 = self.env.max_inventory_level[1] + 1
        n_setups = self.env.n_items + 1
        device = "cuda" if torch.cuda.is_available() else "cpu"
        value_map = np.zeros((n_inv_1, n_inv_2, n_setups))
        for i in range(n_inv_1):
            for j in range(n_inv_2):
                for k in range(n_setups):
                    # The observation does not depend on the action, so it is
                    # built once per cell. Note the (j, i, k) index order.
                    obs = np.expand_dims(np.array([j, i, k]), axis = 0)
                    obs = torch.from_numpy(obs).to(torch.float).to(device=device)
                    value_list = []
                    for action_idx in range(n_setups):
                        action = np.array([[action_idx]])
                        action = torch.from_numpy(action).to(torch.float).to(device=device)
                        try:
                            value, prob, dist_entropy = self.model.policy.evaluate_actions(obs, action)
                        except Exception:
                            # Fallback used when evaluate_actions is unavailable
                            # or fails: read the value from the Q-network.
                            value = self.model.policy.q_net(obs)[0][int(action.item())]
                        value_list.append(value.item())

                    value_map[j, i, k] = np.array(value_list).mean()

        self.value_function = value_map
        # Plotting:
        fig, axs = plt.subplots(nrows=1, ncols=self.POSSIBLE_STATES)
        fig.suptitle('Value Function')
        for i, ax in enumerate(axs):
            ax.set_title(f'Setup {i}')
            im = ax.imshow(
                -self.value_function[:, :, i],
                aspect='auto', cmap='viridis'
            )
            if i == 0:
                ax.set_ylabel('I1')

            ax.set_xlabel('I2')
            ax.invert_yaxis()
        fig.subplots_adjust(right=0.85)
        cbar_ax = fig.add_axes([0.88, 0.15, 0.04, 0.7])

        fig.colorbar(im, cax=cbar_ax)
        fig.savefig(os.path.join('results', f'value_function_{self.model_name}_{self.experiment_name}_{self.run}_{seed}.pdf'))
        plt.close()
|
code/Lot-sizing/envs/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (337 Bytes). View file
|
|
|
code/Lot-sizing/envs/__pycache__/simplePlant.cpython-38.pyc
ADDED
|
Binary file (6.66 kB). View file
|
|
|
code/Lot-sizing/envs/__pycache__/singleSequenceDependentMachinePlant.cpython-38.pyc
ADDED
|
Binary file (2.66 kB). View file
|
|
|
code/Lot-sizing/experiments.py
CHANGED
|
@@ -12,15 +12,16 @@ sys.path.append(AGENTS_DIR)
|
|
| 12 |
from agents.PPO import PPO
|
| 13 |
from agents.PDPPO import PDPPO
|
| 14 |
|
|
|
|
|
|
|
| 15 |
from agents.PDPPOAgent import PDPPOAgent
|
| 16 |
from agents.PPOAgent import PPOAgent
|
|
|
|
| 17 |
|
|
|
|
|
|
|
| 18 |
import numpy as np
|
| 19 |
-
from envs import
|
| 20 |
-
from agents import *
|
| 21 |
-
from agents import StochasticProgrammingAgent, AdpAgentHD3
|
| 22 |
-
from agents import StableBaselineAgent, MultiAgentRL, EnsembleAgent, PerfectInfoAgent,PSOagent,AdpAgentHD, PPOAgent
|
| 23 |
-
from test_functions import *
|
| 24 |
from scenarioManager.stochasticDemandModel import StochasticDemandModel
|
| 25 |
|
| 26 |
|
|
@@ -29,13 +30,11 @@ from scenarioManager.stochasticDemandModel import StochasticDemandModel
|
|
| 29 |
if __name__ == '__main__':
|
| 30 |
experiments = ['15items_5machines_i100','20items_10machines','25items_10machines']
|
| 31 |
for experiment_name in experiments:
|
| 32 |
-
for i in range(0,
|
| 33 |
# Setting the seeds
|
| 34 |
np.random.seed(1)
|
| 35 |
random.seed(10)
|
| 36 |
# Environment setup load:
|
| 37 |
-
# experiment_name = '15items_5machines_i100' # we set the experiment using the available files in cfg
|
| 38 |
-
# experiment_name = '25items_10machines' # we set the experiment using the available files in cfg
|
| 39 |
file_path = os.path.abspath(f"./cfg_env/setting_{experiment_name}.json")
|
| 40 |
fp = open(file_path, 'r')
|
| 41 |
settings = json.load(fp)
|
|
@@ -57,37 +56,37 @@ if __name__ == '__main__':
|
|
| 57 |
# Parameters for the ADPHS:
|
| 58 |
setting_sol_method['regressor_name'] = 'plain_matrix_I2xM1'
|
| 59 |
setting_sol_method['discount_rate'] = 0.9
|
|
|
|
|
|
|
|
|
|
| 60 |
agents = []
|
| 61 |
# Parameters for the RL:
|
| 62 |
|
| 63 |
-
training_epochs_RL = 30000
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
setting_sol_method['parallelization'] = False
|
| 68 |
env = SimplePlant(settings, stoch_model)
|
| 69 |
|
| 70 |
# Number of test execution (number of complet environment iterations)
|
| 71 |
nreps = 100
|
| 72 |
|
| 73 |
###########################################################################
|
| 74 |
-
#
|
| 75 |
###########################################################################
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
|
| 84 |
-
#
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
|
| 89 |
###########################################################################
|
| 90 |
-
# Post-decision PPO
|
| 91 |
###########################################################################
|
| 92 |
|
| 93 |
base_model_name = 'PDPPO'
|
|
@@ -101,7 +100,60 @@ if __name__ == '__main__':
|
|
| 101 |
BEST_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath('__file__')),'logs',f'best_{base_model_name}_{experiment_name}','best_model')
|
| 102 |
pdppo_agent.load_agent(BEST_MODEL_DIR) # For training purposes
|
| 103 |
agents.append(("PDPPO", pdppo_agent))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
###########################################################################
|
| 107 |
#TESTING
|
|
|
|
| 12 |
from agents.PPO import PPO
|
| 13 |
from agents.PDPPO import PDPPO
|
| 14 |
|
| 15 |
+
from agents.PDPPOAgent_one_critic import PDPPOAgent_one_critic
|
| 16 |
+
from agents.PPOAgent_two_critics import PPOAgent_two_critics
|
| 17 |
from agents.PDPPOAgent import PDPPOAgent
|
| 18 |
from agents.PPOAgent import PPOAgent
|
| 19 |
+
from agents.stableBaselineAgents import StableBaselineAgent
|
| 20 |
|
| 21 |
+
|
| 22 |
+
from test_functions import test_agents
|
| 23 |
import numpy as np
|
| 24 |
+
from envs import SimplePlant
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
from scenarioManager.stochasticDemandModel import StochasticDemandModel
|
| 26 |
|
| 27 |
|
|
|
|
| 30 |
if __name__ == '__main__':
|
| 31 |
experiments = ['15items_5machines_i100','20items_10machines','25items_10machines']
|
| 32 |
for experiment_name in experiments:
|
| 33 |
+
for i in range(0,10):
|
| 34 |
# Setting the seeds
|
| 35 |
np.random.seed(1)
|
| 36 |
random.seed(10)
|
| 37 |
# Environment setup load:
|
|
|
|
|
|
|
| 38 |
file_path = os.path.abspath(f"./cfg_env/setting_{experiment_name}.json")
|
| 39 |
fp = open(file_path, 'r')
|
| 40 |
settings = json.load(fp)
|
|
|
|
| 56 |
# Parameters for the ADPHS:
|
| 57 |
setting_sol_method['regressor_name'] = 'plain_matrix_I2xM1'
|
| 58 |
setting_sol_method['discount_rate'] = 0.9
|
| 59 |
+
setting_sol_method['multiagent'] = False
|
| 60 |
+
setting_sol_method['parallelization'] = True
|
| 61 |
+
setting_sol_method['run'] = i
|
| 62 |
agents = []
|
| 63 |
# Parameters for the RL:
|
| 64 |
|
| 65 |
+
training_epochs_RL = 5000 # 30000
|
| 66 |
+
|
|
|
|
|
|
|
|
|
|
| 67 |
env = SimplePlant(settings, stoch_model)
|
| 68 |
|
| 69 |
# Number of test executions (number of complete environment iterations)
|
| 70 |
nreps = 100
|
| 71 |
|
| 72 |
###########################################################################
|
| 73 |
+
# PPO
|
| 74 |
###########################################################################
|
| 75 |
|
| 76 |
+
base_model_name = 'PPO'
|
| 77 |
+
ppo_agent = PPOAgent(
|
| 78 |
+
env,
|
| 79 |
+
setting_sol_method
|
| 80 |
+
)
|
| 81 |
+
ppo_agent.learn(n_episodes=training_epochs_RL*settings['time_horizon'] ) # Each ep with 200 steps
|
| 82 |
|
| 83 |
+
#load best agent before appending in the test list
|
| 84 |
+
BEST_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath('__file__')),'logs',f'best_{base_model_name}_{experiment_name}','best_model')
|
| 85 |
+
ppo_agent.load_agent(BEST_MODEL_DIR) # For training purposes
|
| 86 |
+
agents.append(("PPO", ppo_agent))
|
| 87 |
|
| 88 |
###########################################################################
|
| 89 |
+
# Post-decision PPO - Dual critic
|
| 90 |
###########################################################################
|
| 91 |
|
| 92 |
base_model_name = 'PDPPO'
|
|
|
|
| 100 |
BEST_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath('__file__')),'logs',f'best_{base_model_name}_{experiment_name}','best_model')
|
| 101 |
pdppo_agent.load_agent(BEST_MODEL_DIR) # For training purposes
|
| 102 |
agents.append(("PDPPO", pdppo_agent))
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
###########################################################################
|
| 106 |
+
# Post-decision PPO - Single critic
|
| 107 |
+
###########################################################################
|
| 108 |
+
|
| 109 |
+
base_model_name = 'PDPPO_one_critic'
|
| 110 |
+
pdppo_agent_one_critic = PDPPOAgent(
|
| 111 |
+
env,
|
| 112 |
+
setting_sol_method
|
| 113 |
+
)
|
| 114 |
+
pdppo_agent_one_critic.learn(n_episodes=training_epochs_RL*settings['time_horizon'] ) # Each ep with 200 steps
|
| 115 |
+
|
| 116 |
+
#load best agent before appending in the test list
|
| 117 |
+
BEST_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath('__file__')),'logs',f'best_{base_model_name}_{experiment_name}','best_model')
|
| 118 |
+
pdppo_agent_one_critic.load_agent(BEST_MODEL_DIR) # For training purposes
|
| 119 |
+
agents.append(("PDPPO", pdppo_agent_one_critic))
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
###########################################################################
|
| 123 |
+
# PPO - Two critics
|
| 124 |
+
###########################################################################
|
| 125 |
+
|
| 126 |
+
base_model_name = 'PPO_two_critics'
|
| 127 |
+
ppo_agent_two_critics = PDPPOAgent(
|
| 128 |
+
env,
|
| 129 |
+
setting_sol_method
|
| 130 |
+
)
|
| 131 |
+
ppo_agent_two_critics.learn(n_episodes=training_epochs_RL*settings['time_horizon'] ) # Each ep with 200 steps
|
| 132 |
+
|
| 133 |
+
#load best agent before appending in the test list
|
| 134 |
+
BEST_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath('__file__')),'logs',f'best_{base_model_name}_{experiment_name}','best_model')
|
| 135 |
+
ppo_agent_two_critics.load_agent(BEST_MODEL_DIR) # For training purposes
|
| 136 |
+
agents.append(("PDPPO", ppo_agent_two_critics))
|
| 137 |
+
|
| 138 |
+
###########################################################################
|
| 139 |
+
# RL A2C
|
| 140 |
+
###########################################################################
|
| 141 |
+
|
| 142 |
+
# base_model_name = 'A2C'
|
| 143 |
+
# env = SimplePlant(settings, stoch_model)
|
| 144 |
+
# setting_sol_method['model_name'] = base_model_name
|
| 145 |
+
# rl_agent = StableBaselineAgent(
|
| 146 |
+
# env,
|
| 147 |
+
# setting_sol_method
|
| 148 |
+
# )
|
| 149 |
+
|
| 150 |
+
# rl_agent.learn(epochs=training_epochs_RL) # Each ep with 200 steps
|
| 151 |
|
| 152 |
+
# #load best agent before appending in the test list
|
| 153 |
+
# BEST_MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath('__file__')),'logs',f'best_{base_model_name}_{experiment_name}','best_model')
|
| 154 |
+
# rl_agent.load_agent(BEST_MODEL_DIR)
|
| 155 |
+
# agents.append(("A2C", rl_agent))
|
| 156 |
+
|
| 157 |
|
| 158 |
###########################################################################
|
| 159 |
#TESTING
|
code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_0_0.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:48a42ab9cb3c0ecd6aee7da1dbd709f9ce11e1f7855dd8b4ca263ed02aa4f106
|
| 3 |
+
size 279139
|
code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_log_0.csv
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
episode,timestep,reward
|
| 2 |
+
39,4000,-2141.4023
|
| 3 |
+
79,8000,-2186.1688
|
| 4 |
+
119,12000,-2176.6288
|
| 5 |
+
159,16000,-2191.1093
|
| 6 |
+
199,20000,-2189.4693
|
| 7 |
+
239,24000,-2187.8145
|
| 8 |
+
279,28000,-2196.4592
|
| 9 |
+
319,32000,-2186.1425
|
| 10 |
+
359,36000,-2170.4952
|
| 11 |
+
399,40000,-2184.8005
|
| 12 |
+
439,44000,-2156.964
|
| 13 |
+
479,48000,-2159.1175
|
| 14 |
+
519,52000,-2177.2202
|
| 15 |
+
559,56000,-2184.189
|
| 16 |
+
599,60000,-2173.9785
|
| 17 |
+
639,64000,-2185.696
|
| 18 |
+
679,68000,-2187.422
|
| 19 |
+
719,72000,-2188.3395
|
| 20 |
+
759,76000,-2200.6052
|
| 21 |
+
799,80000,-2206.662
|
| 22 |
+
839,84000,-2198.9285
|
| 23 |
+
879,88000,-2212.6385
|
| 24 |
+
919,92000,-2214.2012
|
| 25 |
+
959,96000,-2210.518
|
| 26 |
+
999,100000,-2212.269
|
| 27 |
+
1039,104000,-2202.792
|
| 28 |
+
1079,108000,-2230.6483
|
| 29 |
+
1119,112000,-2230.9285
|
| 30 |
+
1159,116000,-2234.553
|
| 31 |
+
1199,120000,-2231.0472
|
| 32 |
+
1239,124000,-2205.4692
|
| 33 |
+
1279,128000,-2224.4608
|
| 34 |
+
1319,132000,-2222.5775
|
| 35 |
+
1359,136000,-2220.6603
|
| 36 |
+
1399,140000,-2217.5998
|
| 37 |
+
1439,144000,-2206.9042
|
| 38 |
+
1479,148000,-2219.398
|
| 39 |
+
1519,152000,-2219.977
|
| 40 |
+
1559,156000,-2208.6932
|
| 41 |
+
1599,160000,-2199.11
|
| 42 |
+
1639,164000,-2216.3345
|
| 43 |
+
1679,168000,-2195.7275
|
| 44 |
+
1719,172000,-2208.9112
|
| 45 |
+
1759,176000,-2196.7148
|
| 46 |
+
1799,180000,-2179.6018
|
| 47 |
+
1839,184000,-2185.97
|
| 48 |
+
1879,188000,-2190.0938
|
| 49 |
+
1919,192000,-2182.9055
|
| 50 |
+
1959,196000,-2212.575
|
| 51 |
+
1999,200000,-2217.4317
|
| 52 |
+
2039,204000,-2207.0677
|
| 53 |
+
2079,208000,-2201.1058
|
| 54 |
+
2119,212000,-2210.03
|
| 55 |
+
2159,216000,-2196.9463
|
| 56 |
+
2199,220000,-2186.4732
|
| 57 |
+
2239,224000,-2196.7148
|
| 58 |
+
2279,228000,-2180.514
|
| 59 |
+
2319,232000,-2182.2113
|
| 60 |
+
2359,236000,-2177.5078
|
| 61 |
+
2399,240000,-2177.2448
|
| 62 |
+
2439,244000,-2176.5475
|
| 63 |
+
2479,248000,-2176.4643
|
| 64 |
+
2519,252000,-2166.5628
|
| 65 |
+
2559,256000,-2181.9908
|
| 66 |
+
2599,260000,-2201.7215
|
| 67 |
+
2639,264000,-2189.474
|
| 68 |
+
2679,268000,-2170.9755
|
| 69 |
+
2719,272000,-2169.075
|
| 70 |
+
2759,276000,-2183.726
|
| 71 |
+
2799,280000,-2165.4742
|
| 72 |
+
2839,284000,-2187.9715
|
| 73 |
+
2879,288000,-2179.0172
|
| 74 |
+
2919,292000,-2161.0182
|
| 75 |
+
2959,296000,-2168.9047
|
| 76 |
+
2999,300000,-2165.532
|
| 77 |
+
3039,304000,-2168.5285
|
| 78 |
+
3079,308000,-2159.3415
|
| 79 |
+
3119,312000,-2168.1608
|
| 80 |
+
3159,316000,-2177.5103
|
| 81 |
+
3199,320000,-2185.0758
|
| 82 |
+
3239,324000,-2176.5248
|
| 83 |
+
3279,328000,-2173.1685
|
| 84 |
+
3319,332000,-2190.4757
|
| 85 |
+
3359,336000,-2219.6503
|
| 86 |
+
3399,340000,-2207.4892
|
| 87 |
+
3439,344000,-2199.8988
|
| 88 |
+
3479,348000,-2211.5325
|
| 89 |
+
3519,352000,-2201.5668
|
| 90 |
+
3559,356000,-2202.0843
|
| 91 |
+
3599,360000,-2196.885
|
| 92 |
+
3639,364000,-2199.742
|
| 93 |
+
3679,368000,-2219.324
|
| 94 |
+
3719,372000,-2224.5802
|
| 95 |
+
3759,376000,-2213.3832
|
| 96 |
+
3799,380000,-2191.889
|
| 97 |
+
3839,384000,-2220.2653
|
| 98 |
+
3879,388000,-2206.6353
|
| 99 |
+
3919,392000,-2193.6993
|
| 100 |
+
3959,396000,-2173.2148
|
| 101 |
+
3999,400000,-2168.8942
|
| 102 |
+
4039,404000,-2182.1583
|
| 103 |
+
4079,408000,-2170.8605
|
| 104 |
+
4119,412000,-2179.9363
|
| 105 |
+
4159,416000,-2177.2738
|
| 106 |
+
4199,420000,-2186.921
|
| 107 |
+
4239,424000,-2176.2058
|
| 108 |
+
4279,428000,-2178.0973
|
| 109 |
+
4319,432000,-2179.0505
|
| 110 |
+
4359,436000,-2183.782
|
| 111 |
+
4399,440000,-2189.763
|
| 112 |
+
4439,444000,-2191.9625
|
| 113 |
+
4479,448000,-2190.0078
|
| 114 |
+
4519,452000,-2208.7985
|
| 115 |
+
4559,456000,-2196.431
|
| 116 |
+
4599,460000,-2204.601
|
| 117 |
+
4639,464000,-2198.331
|
| 118 |
+
4679,468000,-2197.519
|
| 119 |
+
4719,472000,-2195.665
|
| 120 |
+
4759,476000,-2179.7755
|
| 121 |
+
4799,480000,-2201.7112
|
| 122 |
+
4839,484000,-2187.7942
|
| 123 |
+
4879,488000,-2177.4918
|
| 124 |
+
4919,492000,-2188.3555
|
| 125 |
+
4959,496000,-2178.8962
|
| 126 |
+
4999,500000,-2193.1755
|
code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_log_2.csv
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
episode,timestep,reward
|
| 2 |
+
39,4000,-2335.3474
|
| 3 |
+
79,8000,-2244.5998
|
| 4 |
+
119,12000,-2238.709
|
| 5 |
+
159,16000,-2290.7835
|
| 6 |
+
199,20000,-2266.6182
|
| 7 |
+
239,24000,-2236.9325
|
| 8 |
+
279,28000,-2260.8245
|
| 9 |
+
319,32000,-2270.0922
|
| 10 |
+
359,36000,-2267.02
|
| 11 |
+
399,40000,-2260.1368
|
| 12 |
+
439,44000,-2246.2922
|
| 13 |
+
479,48000,-2251.6645
|
| 14 |
+
519,52000,-2270.2068
|
| 15 |
+
559,56000,-2271.094
|
| 16 |
+
599,60000,-2270.1955
|
| 17 |
+
639,64000,-2251.9755
|
| 18 |
+
679,68000,-2283.5295
|
| 19 |
+
719,72000,-2265.69
|
| 20 |
+
759,76000,-2307.0708
|
| 21 |
+
799,80000,-2293.882
|
| 22 |
+
839,84000,-2290.8482
|
| 23 |
+
879,88000,-2277.416
|
| 24 |
+
919,92000,-2250.59
|
| 25 |
+
959,96000,-2234.1988
|
| 26 |
+
999,100000,-2249.0185
|
| 27 |
+
1039,104000,-2247.6215
|
| 28 |
+
1079,108000,-2216.7135
|
| 29 |
+
1119,112000,-2213.6995
|
| 30 |
+
1159,116000,-2224.0747
|
| 31 |
+
1199,120000,-2228.468
|
| 32 |
+
1239,124000,-2255.0583
|
| 33 |
+
1279,128000,-2226.098
|
| 34 |
+
1319,132000,-2217.657
|
| 35 |
+
1359,136000,-2243.1698
|
| 36 |
+
1399,140000,-2232.1338
|
| 37 |
+
1439,144000,-2250.4618
|
| 38 |
+
1479,148000,-2235.0085
|
| 39 |
+
1519,152000,-2249.4723
|
| 40 |
+
1559,156000,-2216.2995
|
| 41 |
+
1599,160000,-2233.6805
|
| 42 |
+
1639,164000,-2247.4035
|
| 43 |
+
1679,168000,-2229.968
|
| 44 |
+
1719,172000,-2218.4828
|
| 45 |
+
1759,176000,-2223.798
|
| 46 |
+
1799,180000,-2213.273
|
| 47 |
+
1839,184000,-2219.179
|
| 48 |
+
1879,188000,-2205.1017
|
| 49 |
+
1919,192000,-2207.6708
|
| 50 |
+
1959,196000,-2200.982
|
| 51 |
+
1999,200000,-2218.6955
|
| 52 |
+
2039,204000,-2200.056
|
| 53 |
+
2079,208000,-2218.9955
|
| 54 |
+
2119,212000,-2214.628
|
| 55 |
+
2159,216000,-2230.9135
|
| 56 |
+
2199,220000,-2212.2112
|
| 57 |
+
2239,224000,-2228.0432
|
| 58 |
+
2279,228000,-2228.0378
|
| 59 |
+
2319,232000,-2218.216
|
| 60 |
+
2359,236000,-2237.9682
|
| 61 |
+
2399,240000,-2218.8503
|
| 62 |
+
2439,244000,-2201.6265
|
| 63 |
+
2479,248000,-2216.5263
|
| 64 |
+
2519,252000,-2209.0173
|
| 65 |
+
2559,256000,-2210.7017
|
| 66 |
+
2599,260000,-2192.9838
|
| 67 |
+
2639,264000,-2206.9902
|
| 68 |
+
2679,268000,-2196.276
|
| 69 |
+
2719,272000,-2187.5165
|
| 70 |
+
2759,276000,-2201.5815
|
| 71 |
+
2799,280000,-2197.6468
|
| 72 |
+
2839,284000,-2181.081
|
| 73 |
+
2879,288000,-2191.911
|
| 74 |
+
2919,292000,-2210.5108
|
| 75 |
+
2959,296000,-2191.4668
|
| 76 |
+
2999,300000,-2207.3622
|
| 77 |
+
3039,304000,-2188.681
|
| 78 |
+
3079,308000,-2205.789
|
| 79 |
+
3119,312000,-2189.5567
|
| 80 |
+
3159,316000,-2171.3155
|
| 81 |
+
3199,320000,-2170.6315
|
| 82 |
+
3239,324000,-2170.7322
|
| 83 |
+
3279,328000,-2174.4193
|
| 84 |
+
3319,332000,-2175.8538
|
| 85 |
+
3359,336000,-2154.2035
|
| 86 |
+
3399,340000,-2185.9618
|
| 87 |
+
3439,344000,-2178.553
|
| 88 |
+
3479,348000,-2170.287
|
| 89 |
+
3519,352000,-2159.5517
|
| 90 |
+
3559,356000,-2172.067
|
| 91 |
+
3599,360000,-2159.2972
|
| 92 |
+
3639,364000,-2177.8195
|
| 93 |
+
3679,368000,-2156.6698
|
| 94 |
+
3719,372000,-2168.946
|
| 95 |
+
3759,376000,-2182.2233
|
| 96 |
+
3799,380000,-2170.65
|
| 97 |
+
3839,384000,-2158.5868
|
| 98 |
+
3879,388000,-2162.828
|
| 99 |
+
3919,392000,-2148.9192
|
| 100 |
+
3959,396000,-2152.153
|
| 101 |
+
3999,400000,-2169.9372
|
| 102 |
+
4039,404000,-2169.7798
|
| 103 |
+
4079,408000,-2162.5945
|
| 104 |
+
4119,412000,-2148.3235
|
| 105 |
+
4159,416000,-2157.2015
|
| 106 |
+
4199,420000,-2171.1243
|
| 107 |
+
4239,424000,-2154.7868
|
| 108 |
+
4279,428000,-2164.997
|
| 109 |
+
4319,432000,-2162.2733
|
| 110 |
+
4359,436000,-2167.9713
|
| 111 |
+
4399,440000,-2163.9672
|
| 112 |
+
4439,444000,-2152.2753
|
| 113 |
+
4479,448000,-2149.6665
|
| 114 |
+
4519,452000,-2160.5565
|
| 115 |
+
4559,456000,-2157.0198
|
| 116 |
+
4599,460000,-2158.6238
|
| 117 |
+
4639,464000,-2153.1465
|
| 118 |
+
4679,468000,-2161.9365
|
| 119 |
+
4719,472000,-2147.464
|
| 120 |
+
4759,476000,-2157.8608
|
| 121 |
+
4799,480000,-2163.0485
|
| 122 |
+
4839,484000,-2170.2235
|
| 123 |
+
4879,488000,-2165.6525
|
| 124 |
+
4919,492000,-2161.917
|
| 125 |
+
4959,496000,-2157.1193
|
| 126 |
+
4999,500000,-2146.3092
|
code/Lot-sizing/logs/15items_5machines_i100_PDPPO/PDPPO_15items_5machines_i100_log_3.csv
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
episode,timestep,reward
|
| 2 |
+
39,4000,-2240.3108
|
| 3 |
+
79,8000,-2288.825
|
| 4 |
+
119,12000,-2320.0678
|
| 5 |
+
159,16000,-2266.3545
|
| 6 |
+
199,20000,-2254.704
|
| 7 |
+
239,24000,-2284.7492
|
| 8 |
+
279,28000,-2275.9468
|
| 9 |
+
319,32000,-2268.6098
|
| 10 |
+
359,36000,-2288.8185
|
| 11 |
+
399,40000,-2272.3042
|
| 12 |
+
439,44000,-2261.6483
|
| 13 |
+
479,48000,-2266.3038
|
| 14 |
+
519,52000,-2270.6518
|
| 15 |
+
559,56000,-2244.104
|
| 16 |
+
599,60000,-2245.5072
|
| 17 |
+
639,64000,-2238.031
|
| 18 |
+
679,68000,-2220.696
|
| 19 |
+
719,72000,-2220.8338
|
| 20 |
+
759,76000,-2215.0553
|
| 21 |
+
799,80000,-2206.708
|
| 22 |
+
839,84000,-2227.824
|
| 23 |
+
879,88000,-2214.947
|
| 24 |
+
919,92000,-2217.5067
|
| 25 |
+
959,96000,-2224.1628
|
| 26 |
+
999,100000,-2221.2443
|
| 27 |
+
1039,104000,-2244.4968
|
| 28 |
+
1079,108000,-2239.8208
|
| 29 |
+
1119,112000,-2232.748
|
| 30 |
+
1159,116000,-2223.8978
|
| 31 |
+
1199,120000,-2218.0928
|
| 32 |
+
1239,124000,-2203.4095
|
| 33 |
+
1279,128000,-2204.5672
|
| 34 |
+
1319,132000,-2238.5695
|
| 35 |
+
1359,136000,-2203.7973
|
| 36 |
+
1399,140000,-2217.6258
|
| 37 |
+
1439,144000,-2213.7642
|
| 38 |
+
1479,148000,-2207.4387
|
| 39 |
+
1519,152000,-2215.8908
|
| 40 |
+
1559,156000,-2191.1362
|
| 41 |
+
1599,160000,-2224.398
|
| 42 |
+
1639,164000,-2201.3452
|
| 43 |
+
1679,168000,-2189.2553
|
| 44 |
+
1719,172000,-2209.868
|
| 45 |
+
1759,176000,-2169.1652
|
| 46 |
+
1799,180000,-2191.6032
|
| 47 |
+
1839,184000,-2192.4662
|
| 48 |
+
1879,188000,-2173.139
|
| 49 |
+
1919,192000,-2180.778
|
| 50 |
+
1959,196000,-2193.01
|
| 51 |
+
1999,200000,-2196.909
|
| 52 |
+
2039,204000,-2203.634
|
| 53 |
+
2079,208000,-2203.3062
|
| 54 |
+
2119,212000,-2205.7118
|
| 55 |
+
2159,216000,-2221.6275
|
| 56 |
+
2199,220000,-2207.0085
|
| 57 |
+
2239,224000,-2205.649
|
| 58 |
+
2279,228000,-2229.8532
|
| 59 |
+
2319,232000,-2198.7525
|
| 60 |
+
2359,236000,-2180.7215
|
| 61 |
+
2399,240000,-2173.0688
|
| 62 |
+
2439,244000,-2191.3938
|
| 63 |
+
2479,248000,-2194.5465
|
| 64 |
+
2519,252000,-2200.4895
|
| 65 |
+
2559,256000,-2215.1643
|
| 66 |
+
2599,260000,-2196.0888
|
| 67 |
+
2639,264000,-2205.88
|
| 68 |
+
2679,268000,-2186.5843
|
| 69 |
+
2719,272000,-2189.5945
|
| 70 |
+
2759,276000,-2177.4112
|
| 71 |
+
2799,280000,-2185.7688
|
| 72 |
+
2839,284000,-2180.7005
|
| 73 |
+
2879,288000,-2194.3678
|
| 74 |
+
2919,292000,-2183.5812
|
| 75 |
+
2959,296000,-2188.0495
|
| 76 |
+
2999,300000,-2185.9692
|
| 77 |
+
3039,304000,-2178.563
|
| 78 |
+
3079,308000,-2184.8002
|
| 79 |
+
3119,312000,-2210.264
|
| 80 |
+
3159,316000,-2190.2037
|
| 81 |
+
3199,320000,-2198.2853
|
| 82 |
+
3239,324000,-2206.658
|
| 83 |
+
3279,328000,-2197.803
|
| 84 |
+
3319,332000,-2206.5752
|
| 85 |
+
3359,336000,-2210.574
|
| 86 |
+
3399,340000,-2207.2495
|
| 87 |
+
3439,344000,-2222.5217
|
| 88 |
+
3479,348000,-2208.8218
|
| 89 |
+
3519,352000,-2214.9137
|
| 90 |
+
3559,356000,-2223.4288
|
| 91 |
+
3599,360000,-2226.1332
|
| 92 |
+
3639,364000,-2227.895
|
| 93 |
+
3679,368000,-2213.1972
|
| 94 |
+
3719,372000,-2217.1715
|
| 95 |
+
3759,376000,-2229.5115
|
| 96 |
+
3799,380000,-2232.2263
|
| 97 |
+
3839,384000,-2250.712
|
| 98 |
+
3879,388000,-2237.0413
|
| 99 |
+
3919,392000,-2237.8288
|
| 100 |
+
3959,396000,-2242.2087
|
| 101 |
+
3999,400000,-2242.8518
|
| 102 |
+
4039,404000,-2242.582
|
| 103 |
+
4079,408000,-2247.5048
|
| 104 |
+
4119,412000,-2219.5345
|
| 105 |
+
4159,416000,-2219.813
|
| 106 |
+
4199,420000,-2206.089
|
| 107 |
+
4239,424000,-2229.2065
|
| 108 |
+
4279,428000,-2232.5973
|
| 109 |
+
4319,432000,-2220.915
|
| 110 |
+
4359,436000,-2213.3003
|
| 111 |
+
4399,440000,-2225.92
|
| 112 |
+
4439,444000,-2229.2655
|
| 113 |
+
4479,448000,-2223.2977
|
| 114 |
+
4519,452000,-2222.3368
|
| 115 |
+
4559,456000,-2217.945
|
| 116 |
+
4599,460000,-2209.8247
|
| 117 |
+
4639,464000,-2203.7908
|
| 118 |
+
4679,468000,-2222.4963
|
| 119 |
+
4719,472000,-2213.5595
|
| 120 |
+
4759,476000,-2207.0573
|
| 121 |
+
4799,480000,-2224.0718
|
| 122 |
+
4839,484000,-2192.7728
|
| 123 |
+
4879,488000,-2211.8895
|
| 124 |
+
4919,492000,-2209.2267
|
| 125 |
+
4959,496000,-2208.4648
|
| 126 |
+
4999,500000,-2238.9572
|
code/Lot-sizing/logs/15items_5machines_i100_PPO/PPO_15items_5machines_i100_0_0.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34225abaea5568390f9cbf0a8d2382fa6d67e11addda79738a756604f1550ac6
|
| 3 |
+
size 199811
|
code/Lot-sizing/logs/15items_5machines_i100_PPO/PPO_15items_5machines_i100_log_0.csv
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
episode,timestep,reward
|
| 2 |
+
19,2000,-2135.3463
|
| 3 |
+
39,4000,-2149.332
|
| 4 |
+
59,6000,-2116.58
|
| 5 |
+
79,8000,-2147.7215
|
| 6 |
+
99,10000,-2145.4525
|
| 7 |
+
119,12000,-2129.431
|
| 8 |
+
139,14000,-2166.079
|
| 9 |
+
159,16000,-2163.6345
|
| 10 |
+
179,18000,-2139.9555
|
| 11 |
+
199,20000,-2163.143
|
| 12 |
+
219,22000,-2153.983
|
| 13 |
+
239,24000,-2155.218
|
| 14 |
+
259,26000,-2188.591
|
| 15 |
+
279,28000,-2189.6865
|
| 16 |
+
299,30000,-2163.1825
|
| 17 |
+
319,32000,-2164.4245
|
| 18 |
+
339,34000,-2179.1755
|
| 19 |
+
359,36000,-2196.876
|
| 20 |
+
379,38000,-2174.789
|
| 21 |
+
399,40000,-2177.7325
|
| 22 |
+
419,42000,-2171.5065
|
| 23 |
+
439,44000,-2180.978
|
| 24 |
+
459,46000,-2177.033
|
| 25 |
+
479,48000,-2166.894
|
| 26 |
+
499,50000,-2174.373
|
| 27 |
+
519,52000,-2160.9975
|
| 28 |
+
539,54000,-2139.3125
|
| 29 |
+
559,56000,-2153.796
|
| 30 |
+
579,58000,-2160.5355
|
| 31 |
+
599,60000,-2147.8125
|
| 32 |
+
619,62000,-2160.4915
|
| 33 |
+
639,64000,-2156.354
|
| 34 |
+
659,66000,-2120.872
|
| 35 |
+
679,68000,-2144.24
|
| 36 |
+
699,70000,-2132.259
|
| 37 |
+
719,72000,-2161.746
|
| 38 |
+
739,74000,-2157.3845
|
| 39 |
+
759,76000,-2152.7245
|
| 40 |
+
779,78000,-2172.0235
|
| 41 |
+
799,80000,-2142.163
|
| 42 |
+
819,82000,-2139.4385
|
| 43 |
+
839,84000,-2144.8855
|
| 44 |
+
859,86000,-2168.6705
|
| 45 |
+
879,88000,-2151.953
|
| 46 |
+
899,90000,-2163.172
|
| 47 |
+
919,92000,-2146.0855
|
| 48 |
+
939,94000,-2164.2995
|
| 49 |
+
959,96000,-2136.362
|
| 50 |
+
979,98000,-2144.5915
|
| 51 |
+
999,100000,-2140.8605
|
| 52 |
+
1019,102000,-2145.3225
|
| 53 |
+
1039,104000,-2158.671
|
| 54 |
+
1059,106000,-2143.01
|
| 55 |
+
1079,108000,-2139.721
|
| 56 |
+
1099,110000,-2116.822
|
| 57 |
+
1119,112000,-2114.3315
|
| 58 |
+
1139,114000,-2124.026
|
| 59 |
+
1159,116000,-2142.8155
|
| 60 |
+
1179,118000,-2147.8685
|
| 61 |
+
1199,120000,-2143.8945
|
| 62 |
+
1219,122000,-2146.832
|
| 63 |
+
1239,124000,-2124.1695
|
| 64 |
+
1259,126000,-2090.169
|
| 65 |
+
1279,128000,-2146.8415
|
| 66 |
+
1299,130000,-2121.292
|
| 67 |
+
1319,132000,-2121.0195
|
| 68 |
+
1339,134000,-2123.3185
|
| 69 |
+
1359,136000,-2140.0235
|
| 70 |
+
1379,138000,-2099.58
|
| 71 |
+
1399,140000,-2110.5595
|
| 72 |
+
1419,142000,-2113.717
|
| 73 |
+
1439,144000,-2115.8905
|
| 74 |
+
1459,146000,-2095.1055
|
| 75 |
+
1479,148000,-2106.1685
|
| 76 |
+
1499,150000,-2109.4955
|
| 77 |
+
1519,152000,-2111.4375
|
| 78 |
+
1539,154000,-2106.307
|
| 79 |
+
1559,156000,-2130.6555
|
| 80 |
+
1579,158000,-2136.0985
|
| 81 |
+
1599,160000,-2121.9925
|
| 82 |
+
1619,162000,-2109.5255
|
| 83 |
+
1639,164000,-2128.574
|
| 84 |
+
1659,166000,-2124.366
|
| 85 |
+
1679,168000,-2139.8685
|
| 86 |
+
1699,170000,-2116.211
|
| 87 |
+
1719,172000,-2126.852
|
| 88 |
+
1739,174000,-2117.076
|
| 89 |
+
1759,176000,-2135.6755
|
| 90 |
+
1779,178000,-2117.5595
|
| 91 |
+
1799,180000,-2131.8435
|
| 92 |
+
1819,182000,-2141.5565
|
| 93 |
+
1839,184000,-2150.929
|
| 94 |
+
1859,186000,-2139.8145
|
| 95 |
+
1879,188000,-2129.5425
|
| 96 |
+
1899,190000,-2126.8315
|
| 97 |
+
1919,192000,-2133.958
|
| 98 |
+
1939,194000,-2141.4045
|
| 99 |
+
1959,196000,-2133.663
|
| 100 |
+
1979,198000,-2141.9005
|
| 101 |
+
1999,200000,-2148.833
|
| 102 |
+
2019,202000,-2131.9035
|
| 103 |
+
2039,204000,-2174.289
|
| 104 |
+
2059,206000,-2160.0245
|
| 105 |
+
2079,208000,-2165.5785
|
| 106 |
+
2099,210000,-2147.701
|
| 107 |
+
2119,212000,-2154.0045
|
| 108 |
+
2139,214000,-2124.077
|
| 109 |
+
2159,216000,-2166.3835
|
| 110 |
+
2179,218000,-2168.514
|
| 111 |
+
2199,220000,-2152.2125
|
| 112 |
+
2219,222000,-2162.136
|
| 113 |
+
2239,224000,-2154.616
|
| 114 |
+
2259,226000,-2148.246
|
| 115 |
+
2279,228000,-2146.5235
|
| 116 |
+
2299,230000,-2143.4965
|
| 117 |
+
2319,232000,-2133.6155
|
| 118 |
+
2339,234000,-2144.0695
|
| 119 |
+
2359,236000,-2139.9
|
| 120 |
+
2379,238000,-2137.7725
|
| 121 |
+
2399,240000,-2151.268
|
| 122 |
+
2419,242000,-2175.9375
|
| 123 |
+
2439,244000,-2157.845
|
| 124 |
+
2459,246000,-2153.9685
|
| 125 |
+
2479,248000,-2175.319
|
| 126 |
+
2499,250000,-2140.522
|
| 127 |
+
2519,252000,-2154.707
|
| 128 |
+
2539,254000,-2133.141
|
| 129 |
+
2559,256000,-2122.6885
|
| 130 |
+
2579,258000,-2136.63
|
| 131 |
+
2599,260000,-2141.906
|
| 132 |
+
2619,262000,-2136.6815
|
| 133 |
+
2639,264000,-2109.2965
|
| 134 |
+
2659,266000,-2122.899
|
| 135 |
+
2679,268000,-2149.3255
|
| 136 |
+
2699,270000,-2118.7445
|
| 137 |
+
2719,272000,-2131.16
|
| 138 |
+
2739,274000,-2119.327
|
| 139 |
+
2759,276000,-2127.0115
|
| 140 |
+
2779,278000,-2165.839
|
| 141 |
+
2799,280000,-2163.743
|
| 142 |
+
2819,282000,-2135.451
|
| 143 |
+
2839,284000,-2144.345
|
| 144 |
+
2859,286000,-2129.195
|
| 145 |
+
2879,288000,-2143.4665
|
| 146 |
+
2899,290000,-2130.941
|
| 147 |
+
2919,292000,-2147.0725
|
| 148 |
+
2939,294000,-2125.8355
|
| 149 |
+
2959,296000,-2126.066
|
| 150 |
+
2979,298000,-2146.799
|
| 151 |
+
2999,300000,-2147.949
|
| 152 |
+
3019,302000,-2100.444
|
| 153 |
+
3039,304000,-2116.093
|
| 154 |
+
3059,306000,-2122.09
|
| 155 |
+
3079,308000,-2136.446
|
| 156 |
+
3099,310000,-2106.498
|
| 157 |
+
3119,312000,-2101.977
|
| 158 |
+
3139,314000,-2102.1295
|
| 159 |
+
3159,316000,-2092.621
|
| 160 |
+
3179,318000,-2112.0175
|
| 161 |
+
3199,320000,-2102.532
|
| 162 |
+
3219,322000,-2100.1165
|
| 163 |
+
3239,324000,-2108.5405
|
| 164 |
+
3259,326000,-2117.316
|
| 165 |
+
3279,328000,-2113.263
|
| 166 |
+
3299,330000,-2095.814
|
| 167 |
+
3319,332000,-2097.3245
|
| 168 |
+
3339,334000,-2091.1245
|
| 169 |
+
3359,336000,-2112.114
|
| 170 |
+
3379,338000,-2107.756
|
| 171 |
+
3399,340000,-2105.6305
|
| 172 |
+
3419,342000,-2106.4435
|
| 173 |
+
3439,344000,-2093.697
|
| 174 |
+
3459,346000,-2101.936
|
| 175 |
+
3479,348000,-2087.019
|
| 176 |
+
3499,350000,-2094.8375
|
| 177 |
+
3519,352000,-2091.358
|
| 178 |
+
3539,354000,-2114.3615
|
| 179 |
+
3559,356000,-2131.719
|
| 180 |
+
3579,358000,-2116.838
|
| 181 |
+
3599,360000,-2128.923
|
| 182 |
+
3619,362000,-2104.5615
|
| 183 |
+
3639,364000,-2109.625
|
| 184 |
+
3659,366000,-2106.293
|
| 185 |
+
3679,368000,-2124.0315
|
| 186 |
+
3699,370000,-2116.146
|
| 187 |
+
3719,372000,-2121.4415
|
| 188 |
+
3739,374000,-2084.2695
|
| 189 |
+
3759,376000,-2104.179
|
| 190 |
+
3779,378000,-2111.046
|
| 191 |
+
3799,380000,-2108.5605
|
| 192 |
+
3819,382000,-2092.0465
|
| 193 |
+
3839,384000,-2107.194
|
| 194 |
+
3859,386000,-2095.3865
|
| 195 |
+
3879,388000,-2082.453
|
| 196 |
+
3899,390000,-2119.981
|
| 197 |
+
3919,392000,-2104.4325
|
| 198 |
+
3939,394000,-2100.127
|
| 199 |
+
3959,396000,-2103.365
|
| 200 |
+
3979,398000,-2108.799
|
| 201 |
+
3999,400000,-2087.373
|
| 202 |
+
4019,402000,-2089.962
|
| 203 |
+
4039,404000,-2113.7635
|
| 204 |
+
4059,406000,-2127.984
|
| 205 |
+
4079,408000,-2087.538
|
| 206 |
+
4099,410000,-2071.391
|
| 207 |
+
4119,412000,-2103.1025
|
| 208 |
+
4139,414000,-2092.2085
|
| 209 |
+
4159,416000,-2088.2855
|
| 210 |
+
4179,418000,-2094.342
|
| 211 |
+
4199,420000,-2089.6075
|
| 212 |
+
4219,422000,-2088.1145
|
| 213 |
+
4239,424000,-2101.0985
|
| 214 |
+
4259,426000,-2107.1365
|
| 215 |
+
4279,428000,-2093.734
|
| 216 |
+
4299,430000,-2090.7895
|
| 217 |
+
4319,432000,-2079.56
|
| 218 |
+
4339,434000,-2083.1335
|
| 219 |
+
4359,436000,-2087.81
|
| 220 |
+
4379,438000,-2096.6135
|
| 221 |
+
4399,440000,-2089.9545
|
| 222 |
+
4419,442000,-2074.709
|
| 223 |
+
4439,444000,-2080.6065
|
| 224 |
+
4459,446000,-2078.952
|
| 225 |
+
4479,448000,-2059.433
|
| 226 |
+
4499,450000,-2049.38
|
| 227 |
+
4519,452000,-2065.312
|
| 228 |
+
4539,454000,-2057.3825
|
| 229 |
+
4559,456000,-2085.955
|
| 230 |
+
4579,458000,-2092.071
|
| 231 |
+
4599,460000,-2073.4495
|
| 232 |
+
4619,462000,-2082.937
|
| 233 |
+
4639,464000,-2077.3055
|
| 234 |
+
4659,466000,-2078.2065
|
| 235 |
+
4679,468000,-2062.653
|
| 236 |
+
4699,470000,-2054.374
|
| 237 |
+
4719,472000,-2074.2705
|
| 238 |
+
4739,474000,-2066.9925
|
| 239 |
+
4759,476000,-2049.2215
|
| 240 |
+
4779,478000,-2071.545
|
| 241 |
+
4799,480000,-2057.4975
|
| 242 |
+
4819,482000,-2045.2775
|
| 243 |
+
4839,484000,-2059.3195
|
| 244 |
+
4859,486000,-2054.074
|
| 245 |
+
4879,488000,-2069.4245
|
| 246 |
+
4899,490000,-2069.116
|
| 247 |
+
4919,492000,-2038.679
|
| 248 |
+
4939,494000,-2068.0445
|
| 249 |
+
4959,496000,-2039.354
|
| 250 |
+
4979,498000,-2032.349
|
| 251 |
+
4999,500000,-2026.1585
|
code/Lot-sizing/logs/15items_5machines_i100_PPO/PPO_15items_5machines_i100_log_2.csv
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
episode,timestep,reward
|
| 2 |
+
19,2000,-2380.8889
|
| 3 |
+
39,4000,-2373.828
|
| 4 |
+
59,6000,-2335.17
|
| 5 |
+
79,8000,-2358.4425
|
| 6 |
+
99,10000,-2358.743
|
| 7 |
+
119,12000,-2359.581
|
| 8 |
+
139,14000,-2374.202
|
| 9 |
+
159,16000,-2350.303
|
| 10 |
+
179,18000,-2351.3965
|
| 11 |
+
199,20000,-2369.7695
|
| 12 |
+
219,22000,-2333.9435
|
| 13 |
+
239,24000,-2349.8265
|
| 14 |
+
259,26000,-2369.537
|
| 15 |
+
279,28000,-2370.489
|
| 16 |
+
299,30000,-2325.0625
|
| 17 |
+
319,32000,-2364.1575
|
| 18 |
+
339,34000,-2315.2425
|
| 19 |
+
359,36000,-2347.791
|
| 20 |
+
379,38000,-2333.213
|
| 21 |
+
399,40000,-2345.041
|
| 22 |
+
419,42000,-2324.087
|
| 23 |
+
439,44000,-2327.946
|
| 24 |
+
459,46000,-2290.026
|
| 25 |
+
479,48000,-2297.604
|
| 26 |
+
499,50000,-2318.3805
|
| 27 |
+
519,52000,-2304.4445
|
| 28 |
+
539,54000,-2278.6925
|
| 29 |
+
559,56000,-2279.398
|
| 30 |
+
579,58000,-2275.2595
|
| 31 |
+
599,60000,-2292.0225
|
| 32 |
+
619,62000,-2292.0715
|
| 33 |
+
639,64000,-2304.709
|
| 34 |
+
659,66000,-2277.029
|
| 35 |
+
679,68000,-2258.1065
|
| 36 |
+
699,70000,-2263.7255
|
| 37 |
+
719,72000,-2265.5755
|
| 38 |
+
739,74000,-2246.222
|
| 39 |
+
759,76000,-2234.866
|
| 40 |
+
779,78000,-2254.3855
|
| 41 |
+
799,80000,-2239.978
|
| 42 |
+
819,82000,-2205.3095
|
| 43 |
+
839,84000,-2230.3575
|
| 44 |
+
859,86000,-2225.6055
|
| 45 |
+
879,88000,-2229.913
|
| 46 |
+
899,90000,-2244.0605
|
| 47 |
+
919,92000,-2229.112
|
| 48 |
+
939,94000,-2233.0065
|
| 49 |
+
959,96000,-2221.533
|
| 50 |
+
979,98000,-2199.2975
|
| 51 |
+
999,100000,-2218.0545
|
| 52 |
+
1019,102000,-2219.6245
|
| 53 |
+
1039,104000,-2212.3515
|
| 54 |
+
1059,106000,-2228.366
|
| 55 |
+
1079,108000,-2213.773
|
| 56 |
+
1099,110000,-2214.438
|
| 57 |
+
1119,112000,-2218.1
|
| 58 |
+
1139,114000,-2195.482
|
| 59 |
+
1159,116000,-2225.5825
|
| 60 |
+
1179,118000,-2213.362
|
| 61 |
+
1199,120000,-2211.5005
|
| 62 |
+
1219,122000,-2208.066
|
| 63 |
+
1239,124000,-2191.501
|
| 64 |
+
1259,126000,-2235.9985
|
| 65 |
+
1279,128000,-2211.5905
|
| 66 |
+
1299,130000,-2202.716
|
| 67 |
+
1319,132000,-2212.6015
|
| 68 |
+
1339,134000,-2216.1535
|
| 69 |
+
1359,136000,-2215.3695
|
| 70 |
+
1379,138000,-2210.9315
|
| 71 |
+
1399,140000,-2219.104
|
| 72 |
+
1419,142000,-2223.478
|
| 73 |
+
1439,144000,-2222.4635
|
| 74 |
+
1459,146000,-2221.686
|
| 75 |
+
1479,148000,-2211.6465
|
| 76 |
+
1499,150000,-2208.096
|
| 77 |
+
1519,152000,-2209.976
|
| 78 |
+
1539,154000,-2199.5775
|
| 79 |
+
1559,156000,-2213.538
|
| 80 |
+
1579,158000,-2196.544
|
| 81 |
+
1599,160000,-2191.9365
|
| 82 |
+
1619,162000,-2202.8655
|
| 83 |
+
1639,164000,-2195.785
|
| 84 |
+
1659,166000,-2197.826
|
| 85 |
+
1679,168000,-2198.4345
|
| 86 |
+
1699,170000,-2192.2155
|
| 87 |
+
1719,172000,-2183.3555
|
| 88 |
+
1739,174000,-2215.12
|
| 89 |
+
1759,176000,-2183.842
|
| 90 |
+
1779,178000,-2185.168
|
| 91 |
+
1799,180000,-2173.7945
|
| 92 |
+
1819,182000,-2172.845
|
| 93 |
+
1839,184000,-2176.132
|
| 94 |
+
1859,186000,-2188.4535
|
| 95 |
+
1879,188000,-2156.692
|
| 96 |
+
1899,190000,-2169.1765
|
| 97 |
+
1919,192000,-2150.046
|
| 98 |
+
1939,194000,-2169.566
|
| 99 |
+
1959,196000,-2159.7815
|
| 100 |
+
1979,198000,-2167.8865
|
| 101 |
+
1999,200000,-2188.2145
|
| 102 |
+
2019,202000,-2145.591
|
| 103 |
+
2039,204000,-2156.559
|
| 104 |
+
2059,206000,-2164.4925
|
| 105 |
+
2079,208000,-2162.0795
|
| 106 |
+
2099,210000,-2157.1775
|
| 107 |
+
2119,212000,-2145.232
|
| 108 |
+
2139,214000,-2147.627
|
| 109 |
+
2159,216000,-2154.1195
|
| 110 |
+
2179,218000,-2155.8565
|
| 111 |
+
2199,220000,-2134.7075
|
| 112 |
+
2219,222000,-2127.8285
|
| 113 |
+
2239,224000,-2168.0365
|
| 114 |
+
2259,226000,-2142.8975
|
| 115 |
+
2279,228000,-2140.589
|
| 116 |
+
2299,230000,-2149.4825
|
| 117 |
+
2319,232000,-2140.3645
|
| 118 |
+
2339,234000,-2143.029
|
| 119 |
+
2359,236000,-2126.4945
|
| 120 |
+
2379,238000,-2135.033
|
| 121 |
+
2399,240000,-2124.0925
|
| 122 |
+
2419,242000,-2118.4725
|
| 123 |
+
2439,244000,-2134.5425
|
| 124 |
+
2459,246000,-2102.6725
|
| 125 |
+
2479,248000,-2117.8615
|
| 126 |
+
2499,250000,-2123.2265
|
| 127 |
+
2519,252000,-2099.3605
|
| 128 |
+
2539,254000,-2113.7965
|
| 129 |
+
2559,256000,-2126.2285
|
| 130 |
+
2579,258000,-2124.4015
|
| 131 |
+
2599,260000,-2099.2445
|
| 132 |
+
2619,262000,-2131.927
|
| 133 |
+
2639,264000,-2128.738
|
| 134 |
+
2659,266000,-2120.8685
|
| 135 |
+
2679,268000,-2121.595
|
| 136 |
+
2699,270000,-2140.079
|
| 137 |
+
2719,272000,-2115.872
|
| 138 |
+
2739,274000,-2105.3305
|
| 139 |
+
2759,276000,-2133.0435
|
| 140 |
+
2779,278000,-2117.732
|
| 141 |
+
2799,280000,-2143.8175
|
| 142 |
+
2819,282000,-2076.888
|
| 143 |
+
2839,284000,-2106.048
|
| 144 |
+
2859,286000,-2105.761
|
| 145 |
+
2879,288000,-2102.9825
|
| 146 |
+
2899,290000,-2118.6665
|
| 147 |
+
2919,292000,-2122.7975
|
| 148 |
+
2939,294000,-2121.764
|
| 149 |
+
2959,296000,-2128.1515
|
| 150 |
+
2979,298000,-2113.3235
|
| 151 |
+
2999,300000,-2126.751
|
| 152 |
+
3019,302000,-2111.186
|
| 153 |
+
3039,304000,-2112.5405
|
| 154 |
+
3059,306000,-2110.0095
|
| 155 |
+
3079,308000,-2118.0815
|
| 156 |
+
3099,310000,-2100.6005
|
| 157 |
+
3119,312000,-2106.429
|
| 158 |
+
3139,314000,-2092.304
|
| 159 |
+
3159,316000,-2105.092
|
| 160 |
+
3179,318000,-2085.4645
|
| 161 |
+
3199,320000,-2107.1535
|
| 162 |
+
3219,322000,-2107.04
|
| 163 |
+
3239,324000,-2092.5935
|
| 164 |
+
3259,326000,-2096.9715
|
| 165 |
+
3279,328000,-2103.3905
|
| 166 |
+
3299,330000,-2105.1935
|
| 167 |
+
3319,332000,-2108.05
|
| 168 |
+
3339,334000,-2100.4505
|
| 169 |
+
3359,336000,-2087.976
|
| 170 |
+
3379,338000,-2093.996
|
| 171 |
+
3399,340000,-2103.395
|
| 172 |
+
3419,342000,-2075.1395
|
| 173 |
+
3439,344000,-2100.193
|
| 174 |
+
3459,346000,-2097.6485
|
| 175 |
+
3479,348000,-2103.601
|
| 176 |
+
3499,350000,-2109.8605
|
| 177 |
+
3519,352000,-2087.653
|
| 178 |
+
3539,354000,-2126.2165
|
| 179 |
+
3559,356000,-2117.9495
|
| 180 |
+
3579,358000,-2112.5835
|
| 181 |
+
3599,360000,-2117.1415
|
| 182 |
+
3619,362000,-2108.5045
|
| 183 |
+
3639,364000,-2103.0745
|
| 184 |
+
3659,366000,-2111.068
|
| 185 |
+
3679,368000,-2126.239
|
| 186 |
+
3699,370000,-2104.904
|
| 187 |
+
3719,372000,-2085.1685
|
| 188 |
+
3739,374000,-2093.3945
|
| 189 |
+
3759,376000,-2101.3165
|
| 190 |
+
3779,378000,-2103.0655
|
| 191 |
+
3799,380000,-2101.006
|
| 192 |
+
3819,382000,-2103.158
|
| 193 |
+
3839,384000,-2102.5225
|
| 194 |
+
3859,386000,-2107.4555
|
| 195 |
+
3879,388000,-2095.627
|
| 196 |
+
3899,390000,-2114.5905
|
| 197 |
+
3919,392000,-2112.6065
|
| 198 |
+
3939,394000,-2103.3065
|
| 199 |
+
3959,396000,-2111.277
|
| 200 |
+
3979,398000,-2106.088
|
| 201 |
+
3999,400000,-2106.7605
|
| 202 |
+
4019,402000,-2084.668
|
| 203 |
+
4039,404000,-2104.5425
|
| 204 |
+
4059,406000,-2105.0865
|
| 205 |
+
4079,408000,-2086.7805
|
| 206 |
+
4099,410000,-2116.368
|
| 207 |
+
4119,412000,-2100.076
|
| 208 |
+
4139,414000,-2115.2785
|
| 209 |
+
4159,416000,-2111.847
|
| 210 |
+
4179,418000,-2075.2525
|
| 211 |
+
4199,420000,-2089.003
|
| 212 |
+
4219,422000,-2101.154
|
| 213 |
+
4239,424000,-2099.7625
|
| 214 |
+
4259,426000,-2118.5795
|
| 215 |
+
4279,428000,-2108.951
|
| 216 |
+
4299,430000,-2099.8935
|
code/Lot-sizing/logs/best_A2C_15items_5machines_i100_0/best_model.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43b750cb2d2aac2c7897b0e0f6d1495b4e6a4cd6a5dc5938b3359734024810bc
|
| 3 |
+
size 1022438
|
code/Lot-sizing/logs/evaluations.npz
ADDED
|
Binary file (40.2 kB). View file
|
|
|
code/Lot-sizing/models/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (419 Bytes). View file
|
|
|
code/Lot-sizing/models/__pycache__/multistageOptimization.cpython-38.pyc
ADDED
|
Binary file (6.56 kB). View file
|
|
|
code/Lot-sizing/models/__pycache__/optimizationProblemInstance.cpython-38.pyc
ADDED
|
Binary file (877 Bytes). View file
|
|
|
code/Lot-sizing/models/__pycache__/perfectInfoOptimization.cpython-38.pyc
ADDED
|
Binary file (5.42 kB). View file
|
|
|
code/Lot-sizing/results/PDPPO_15items_5machines_i100_actions_test.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e173a4ff1f1af7ec376b991a97d5d971a9358e0ee59e78d1dac0c5706dce0a3
|
| 3 |
+
size 1200128
|
code/Lot-sizing/results/PDPPO_15items_5machines_i100_costs_test.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d4f6e5d4c9a9837667f1df2a98f3556b3b67f700011695efcd5503099bfd39b7
|
| 3 |
+
size 136
|
code/Lot-sizing/results/PDPPO_15items_5machines_i100_demands_test.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c8427469f56af868c1a5fed54b3b235af670398cb091217c84c3d710f8c02ea5
|
| 3 |
+
size 1800128
|
code/Lot-sizing/results/PDPPO_15items_5machines_i100_holding_costs_test.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:433d5d5321c151cfdc6fb420c90de1c55c6ba45888445f942fe3880c56b7d947
|
| 3 |
+
size 240128
|
code/Lot-sizing/results/PDPPO_15items_5machines_i100_lost_sales_test.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:85741d269fea615ac96e4de74211ce0aa144e1d69d471fcce17cc133c6c0a88b
|
| 3 |
+
size 240128
|
code/Lot-sizing/results/PDPPO_15items_5machines_i100_observations_test.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:49242c880236d956378b24445e0d0c653d8af1f474e5fd61113e125ed244dfba
|
| 3 |
+
size 744805
|
code/Lot-sizing/results/PDPPO_15items_5machines_i100_setup_costs_test.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:da0680577feff47cc4071a0ac12f0ba5b4f6c3a1240e33a5cfb799916317c6f9
|
| 3 |
+
size 240128
|
code/Lot-sizing/results/PPO_15items_5machines_i100_actions_test.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:de66cf942bdc3f335699c6c75de45a952811465f0a0d19dfdb4121c2baa1314f
|
| 3 |
+
size 400128
|
code/Lot-sizing/results/PPO_15items_5machines_i100_costs_test.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b6056e23a628ef0b32f9b252ff401ac9aadb26324986af77c7e80e2d0bb201cc
|
| 3 |
+
size 136
|