# pdppo/code/Lot-sizing/agents/stableBaselineAgents.py
# -*- coding: utf-8 -*-
import os
import time
import gym
import torch
import numpy as np
import copy
from envs import SimplePlant
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from stable_baselines3 import PPO, A2C, DQN, SAC, DDPG
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback
class SimplePlantSB(SimplePlant):
def __init__(self, settings, stoch_model):
super().__init__(settings, stoch_model)
        self.dict_obs = settings.get('dict_obs', False)
self.last_inventory = copy.copy(self.inventory_level)
self.action_space = gym.spaces.MultiDiscrete(
[self.n_items+1] * self.n_machines
)
if self.dict_obs:
            self.observation_space = gym.spaces.Dict({
                'inventory_level': gym.spaces.Box(
                    low=np.zeros(self.n_items),
                    high=np.ones(self.n_items) * (settings['max_inventory_level'][0] + 1) * self.n_items
                ),
                'machine_setup': gym.spaces.MultiDiscrete(
                    [self.n_items + 1] * self.n_machines
                ),
                'last_inventory_level': gym.spaces.Box(
                    low=np.zeros(self.n_items),
                    high=np.ones(self.n_items) * (settings['max_inventory_level'][0] + 1) * self.n_items
                )
            })
else:
            self.observation_space = gym.spaces.Box(
                low=np.zeros(2 * self.n_items + self.n_machines),
                high=np.concatenate(
                    [
                        np.array(self.max_inventory_level),             # high for the inventory level
                        np.ones(self.n_machines) * (self.n_items + 1),  # high for the machine setups
                        np.array(self.max_inventory_level)              # high for the last inventory level
                    ]),
                dtype=np.int32
            )
def step(self, action):
"""
Step method: Execute one time step within the environment
Parameters
----------
action : action given by the agent
Returns
-------
obs : Observation of the state give the method _next_observation
reward : Cost given by the _reward method
done : returns True or False given by the _done method
dict : possible information for control to environment monitoring
"""
self.last_inventory = copy.copy(self.inventory_level)
self.total_cost = self._take_action(action, self.machine_setup, self.inventory_level, self.demand)
        # Alternative reward definitions kept from earlier experiments:
        # self.total_cost['setup_costs'] = 0
        # self.total_cost['holding_costs'] = 0
        # reward = -self.total_cost['lost_sales']
        # reward = np.abs(action)
        reward = -sum(self.total_cost.values())
self.current_step += 1
done = self.current_step == self.T
obs = self._next_observation()
return obs, reward, done, self.total_cost
    def _next_observation(self):
        """
        Returns the next observation, extended with the previous inventory level.
        """
        obs = SimplePlant._next_observation(self)
        if isinstance(obs, dict):
            obs['last_inventory_level'] = copy.copy(self.last_inventory)
            if not self.dict_obs:
                # Flatten the dict observation to match the Box observation space.
                obs = np.concatenate(
                    (
                        obs['inventory_level'],      # n_items entries
                        obs['machine_setup'],        # n_machines entries
                        obs['last_inventory_level']  # n_items entries
                    )
                )
        else:
            if self.dict_obs:
                raise ValueError("The base environment returned a non-dict observation; set 'dict_obs' to False.")
        return obs
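# A minimal smoke test of the wrapper (a sketch; `settings` and `stoch_model`
# are hypothetical placeholders whose real schema comes from SimplePlant).
# For n_items=2 and n_machines=1, the flattened observation is ordered as
# [inventory_level (2), machine_setup (1), last_inventory_level (2)]:
#
#     env = SimplePlantSB(settings, stoch_model)
#     obs = env.reset()
#     assert env.observation_space.contains(obs)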
class StableBaselineAgent():
    """
    Agent wrapping the algorithms from Stable-Baselines3.
    We adapt the env to the Stable-Baselines3 requirements:
    a different _next_observation is needed, together with the
    matching observation space (see SimplePlantSB above).
    """
def __init__(self, env: SimplePlant, settings: dict):
        super().__init__()
if settings['multiagent']:
self.env = env
else:
self.env = SimplePlantSB(env.settings, env.stoch_model)
            self.last_inventory = copy.copy(env.inventory_level)
self.model_name = settings['model_name']
self.experiment_name = settings['experiment_name']
self.parallelization = settings['parallelization']
self.run = settings['run']
        self.dict_obs = settings.get('dict_obs', False)
self.POSSIBLE_STATES = self.env.n_items + 1
self.env.cost_to_reward = True
self.epsilon = 0
        BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        # Store logs in the `logs` folder at the project root.
        self.LOG_DIR = os.path.join(BASE_DIR, 'logs')
        if self.parallelization:
            # For CPU parallelization in Stable-Baselines3 learning.
            # SubprocVecEnv pickles each _init closure, so every worker
            # process ends up with its own copy of the environment.
            def make_env(seed):
                def _init():
                    env = Monitor(
                        self.env,
                        os.path.join(self.LOG_DIR, 'monitor', f'{self.model_name}_{self.experiment_name}_{seed}_{self.run}'),
                        allow_early_resets=True
                    )
                    return env
                return _init
            num_cpu = 5
            env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])
else:
            env = Monitor(
                self.env,
                os.path.join(self.LOG_DIR, 'monitor', f'{self.model_name}_{self.experiment_name}_{self.run}')
            )
        self.eval_callback = EvalCallback(
            env,
            best_model_save_path=os.path.join(self.LOG_DIR, f'best_{self.model_name}_{self.experiment_name}_{self.run}'),
            log_path=self.LOG_DIR,
            eval_freq=100,
            deterministic=True,
            verbose=0,
            render=False
        )
if self.dict_obs:
policy = 'MultiInputPolicy'
else:
policy = 'MlpPolicy'
if self.model_name == 'PPO':
            self.model = PPO(
                policy, env, verbose=0,
                batch_size=256, n_steps=256, gamma=0.96, gae_lambda=0.9,
                n_epochs=20, ent_coef=0.0, max_grad_norm=0.5, vf_coef=0.5,
                learning_rate=5e-3, use_sde=False, clip_range=0.4,
                policy_kwargs=dict(
                    log_std_init=-2, ortho_init=False,
                    activation_fn=torch.nn.ReLU,
                    net_arch=[dict(pi=[300, 300], vf=[300, 300])]
                )
            )
elif self.model_name == 'A2C':
            self.model = A2C(
                policy, env, verbose=0,
                learning_rate=0.002, n_steps=100, gamma=0.95, vf_coef=0.7,
                policy_kwargs=dict(net_arch=[300, 300]), seed=None
            )
elif self.model_name == 'DQN':
            self.model = DQN(
                policy, env, verbose=0,
                learning_rate=2.3e-3, buffer_size=100000, learning_starts=1000,
                batch_size=32, tau=1.0, gamma=0.99, target_update_interval=10,
                train_freq=256, gradient_steps=128, exploration_fraction=0.16,
                exploration_initial_eps=0.04,
                policy_kwargs=dict(net_arch=[300, 300]), seed=None
            )
elif self.model_name == 'SAC':
            self.model = SAC(
                policy, env, verbose=0,
                learning_rate=0.0003, buffer_size=1000000, learning_starts=1000,
                batch_size=256, tau=0.005, gamma=0.99, train_freq=1,
                gradient_steps=1, seed=None, action_noise=None,
                replay_buffer_class=None, replay_buffer_kwargs=None,
                optimize_memory_usage=False, ent_coef='auto',
                target_update_interval=1, target_entropy='auto',
                use_sde=False, sde_sample_freq=-1, use_sde_at_warmup=False,
                tensorboard_log=None, create_eval_env=False,
                policy_kwargs=dict(
                    activation_fn=torch.nn.ReLU,
                    net_arch=[dict(pi=[300, 300], vf=[300, 300])]
                )
            )
elif self.model_name == 'DDPG':
            self.model = DDPG(
                policy, env, verbose=0,
                learning_rate=0.0003, buffer_size=1000000,
                learning_starts=1000, batch_size=256
            )
    def get_action(self, obs):
        if isinstance(obs, dict):
            obs['last_inventory_level'] = copy.copy(self.last_inventory)
            self.last_inventory = copy.copy(obs['inventory_level'])
            if self.dict_obs:
                act = self.model.predict(obs, deterministic=True)[0]
            else:
                # Flatten the dict observation; insertion order matches the
                # Box layout (inventory, setup, last inventory).
                obs_ = np.concatenate([obs[key] for key in obs])
                act = self.model.predict(obs_, deterministic=True)[0]
        else:
            if self.dict_obs:
                raise ValueError('Change the policy to dictionary observations')
            act = self.model.predict(obs, deterministic=True)[0]
        return act
def learn(self, epochs=1000):
print(f"{self.model_name} learning...")
start_time = time.time()
        # The EvalCallback wrapper saves the best model found during training.
        # The model learns on the environment provided in the agent definition.
        # We multiply the number of epochs by the number of time periods to
        # obtain the total number of training steps.
self.model.learn(
epochs*self.env.T,
callback=self.eval_callback,
# tb_log_name='PPO'
)
self.env.close()
time_duration = time.time() - start_time
print(f"Finished Learning {time_duration:.2f} s")
def load_agent(self, path):
if self.model_name == 'PPO':
self.model = PPO.load(path)
elif self.model_name == 'A2C':
self.model = A2C.load(path)
elif self.model_name == 'DQN':
self.model = DQN.load(path)
elif self.model_name == 'SAC':
self.model = SAC.load(path)
        elif self.model_name == 'DDPG':
            self.model = DDPG.load(path)
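    # Example (a sketch, hypothetical agent instance): reload the best
    # checkpoint that EvalCallback saved as `best_model.zip` during training:
    #
    #     agent.load_agent(os.path.join(
    #         agent.LOG_DIR,
    #         f'best_{agent.model_name}_{agent.experiment_name}_{agent.run}',
    #         'best_model'
    #     ))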
def plot_policy(self, seed=1):
# ONLY WORKING FOR 2 ITEMS 1 MACHINE
        cmap = plt.cm.get_cmap('viridis', self.POSSIBLE_STATES)
        policy_map = np.zeros((
            self.env.max_inventory_level[0] + 1,
            self.env.max_inventory_level[1] + 1,
            self.env.n_items + 1
        ))
for i in range(self.env.max_inventory_level[0]+1):
for j in range(self.env.max_inventory_level[1]+1):
for k in range(self.env.n_items+1):
                    obs = np.expand_dims(np.array([i, j, k]), axis=0)
                    # predict may nest the action differently across algorithms.
                    try:
                        action = self.model.predict(obs, deterministic=True)[0][0][0]
                    except IndexError:
                        action = self.model.predict(obs, deterministic=True)[0][0]
                    policy_map[i, j, k] = action
self.policy = policy_map
fig, axs = plt.subplots(1, self.POSSIBLE_STATES)
fig.suptitle('Found Policy')
for i, ax in enumerate(axs):
ax.set_title(f'Setup {i}')
im = ax.pcolormesh(
self.policy[:,:,i], cmap = cmap, edgecolors='k', linewidth=2
)
im.set_clim(0, self.POSSIBLE_STATES - 1)
ax.set_xlabel('I2')
if i == 0:
ax.set_ylabel('I1')
        # LEGEND:
        bound = [0, 1, 2]
        # Create one colored Patch per action.
        fig.subplots_adjust(bottom=0.2)
        ax.legend(
            [mpatches.Patch(color=cmap(b)) for b in bound],
            [str(b) for b in bound],
            loc='upper center', bbox_to_anchor=(-0.8, -0.13),
            fancybox=True, shadow=True, ncol=3
        )
        fig.savefig(os.path.join('results', f'policy_function_{self.model_name}_{self.experiment_name}_{seed}.pdf'), bbox_inches='tight')
plt.close()
def plot_value_function(self, seed):
# ONLY WORKING FOR 2 ITEMS 1 MACHINE
        value_map = np.zeros((
            self.env.max_inventory_level[0] + 1,
            self.env.max_inventory_level[1] + 1,
            self.env.n_items + 1
        ))
for i in range(self.env.max_inventory_level[0]+1):
for j in range(self.env.max_inventory_level[1]+1):
for k in range(self.env.n_items+1):
                    value_list = []
                    for action in range(self.env.n_items + 1):
                        obs = np.expand_dims(np.array([j, i, k]), axis=0)
                        action_arr = np.array([[action]])
                        # Move tensors to the same device as the model.
                        device = self.model.device
                        obs_t = torch.from_numpy(obs).to(torch.float).to(device)
                        action_t = torch.from_numpy(action_arr).to(torch.float).to(device)
                        try:
                            # Actor-critic models (e.g. PPO, A2C) expose evaluate_actions.
                            value, log_prob, dist_entropy = self.model.policy.evaluate_actions(obs_t, action_t)
                        except AttributeError:
                            # Value-based models (DQN) expose a Q-network instead.
                            value = self.model.policy.q_net(obs_t)[0][action]
                        value_list.append(value.item())
                    value_map[j, i, k] = np.array(value_list).mean()
self.value_function = value_map
# Plotting:
fig, axs = plt.subplots(nrows=1, ncols=self.POSSIBLE_STATES)
fig.suptitle('Value Function')
for i, ax in enumerate(axs):
ax.set_title(f'Setup {i}')
im = ax.imshow(
-self.value_function[:,:,i],
aspect='auto', cmap='viridis'
)
if i == 0:
ax.set_ylabel('I1')
ax.set_xlabel('I2')
ax.invert_yaxis()
fig.subplots_adjust(right=0.85)
cbar_ax = fig.add_axes([0.88, 0.15, 0.04, 0.7])
fig.colorbar(im, cax=cbar_ax)
fig.savefig(os.path.join('results',f'value_function_{self.model_name}_{self.experiment_name}_{self.run}_{seed}.pdf'))
plt.close()
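# A minimal end-to-end sketch of how this agent is meant to be used
# (hypothetical settings values; the real schema comes from SimplePlant
# and the project's config files):
#
#     if __name__ == '__main__':
#         settings = {
#             'model_name': 'PPO', 'experiment_name': 'demo', 'run': 0,
#             'parallelization': False, 'multiagent': False, 'dict_obs': False,
#             # ...plus the SimplePlant fields (n_items, n_machines,
#             # max_inventory_level, T, costs, ...)
#         }
#         env = SimplePlant(settings, stoch_model)
#         agent = StableBaselineAgent(env, settings)
#         agent.learn(epochs=1000)
#         agent.plot_policy(seed=1)
#         agent.plot_value_function(seed=1)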