| import os |
| import sys |
| import glob |
| import time |
| from datetime import datetime |
|
|
| import numpy as np |
| import gym |
| import torch |
| import copy |
| import matplotlib.pyplot as plt |
| import matplotlib.patches as mpatches |
# Make the local 'agents' package importable regardless of the caller's CWD.
# BUG FIX: the original passed the *string* '__file__' (quoted), so
# abspath resolved against the current working directory instead of the
# directory that actually contains this file.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
AGENTS_DIR = os.path.join(BASE_DIR, 'agents')
sys.path.append(AGENTS_DIR)
| from agents.PDPPO import PDPPO |
| from envs import SimplePlant |
| import copy |
|
|
|
|
class SimplePlantSB(SimplePlant):
    """Gym-compatible wrapper around :class:`SimplePlant`.

    Adds gym action/observation spaces on top of the base plant
    simulation and exposes the standard ``(obs, reward, done, info)``
    step interface expected by Stable-Baselines-style agents.
    """

    def __init__(self, settings, stoch_model):
        """Build the wrapped plant and its gym spaces.

        Parameters
        ----------
        settings : dict
            Plant configuration; the optional key ``'dict_obs'``
            (default False) selects dict vs. flat observations.
        stoch_model : object
            Stochastic demand model forwarded to the base plant.
        """
        super().__init__(settings, stoch_model)
        # 'dict_obs' is optional; a missing key means flat observations.
        # (Replaces the original bare try/except, which swallowed every
        # error, not just a missing key.)
        self.dict_obs = settings.get('dict_obs', False)
        self.last_inventory = copy.copy(self.inventory_level)
        # One sub-action per machine with n_items + 1 choices
        # (presumably item index to produce, plus one "idle"/setup slot —
        # confirm against SimplePlant._take_action).
        self.action_space = gym.spaces.MultiDiscrete(
            [self.n_items + 1] * self.n_machines
        )

        if self.dict_obs:
            self.observation_space = gym.spaces.Dict({
                'inventory_level': gym.spaces.Box(
                    low=np.zeros(self.n_items),
                    # NOTE(review): the upper bound scales with n_items,
                    # apparently to leave head-room above the per-item cap;
                    # verify against how inventory can actually grow.
                    high=np.ones(self.n_items) * (settings['max_inventory_level'][0] + 1) * self.n_items,
                ),
            })
        else:
            self.observation_space = gym.spaces.Box(
                low=np.zeros(self.n_items),
                high=np.concatenate(
                    [
                        np.array(self.max_inventory_level),
                    ]),
                dtype=np.int32
            )

    def step(self, action):
        """Execute one time step within the environment.

        Parameters
        ----------
        action : array-like
            One entry per machine, as sampled from ``action_space``.

        Returns
        -------
        obs : observation produced by :meth:`_next_observation`
        reward : float
            Negated sum of all cost components for the step.
        done : bool
            True once ``current_step`` reaches the horizon ``T``.
        info : dict
            Copy of the per-component cost breakdown.
        """
        # Remember the pre-step inventory so callers can inspect the delta.
        self.last_inventory = copy.copy(self.inventory_level)

        self.total_cost = self._take_action(
            action, self.machine_setup, self.inventory_level, self.demand
        )

        # Costs are positive; the RL reward is their negated sum.
        reward = -sum(self.total_cost.values())

        self.current_step += 1
        done = self.current_step == self.T
        obs = self._next_observation()

        return obs, reward, done, self.total_cost.copy()

    def _next_observation(self):
        """Return the next observation, flattened unless ``dict_obs`` is set.

        Raises
        ------
        ValueError
            If the base environment returns a flat observation while
            ``dict_obs`` is True (configuration mismatch).
        """
        obs = SimplePlant._next_observation(self)

        if isinstance(obs, dict):
            if not self.dict_obs:
                # Flatten the dict observation into a single array.
                obs = np.concatenate(
                    (
                        obs['inventory_level'],
                    )
                )
        else:
            if self.dict_obs:
                # BUG FIX: the original raised a bare string, which is a
                # TypeError at runtime; raise a proper exception instead.
                raise ValueError('Change dict_obs to False')
        return obs
|
|
|
|
class PDPPOAgent():
    """Trains a PDPPO policy on a :class:`SimplePlant` environment.

    Parameters
    ----------
    env : SimplePlant
        Source of ``settings`` / ``stoch_model`` used to build the
        internal :class:`SimplePlantSB` training environment.
    settings : dict
        Must contain ``'model_name'``, ``'experiment_name'`` and
        ``'parallelization'``; may contain ``'dict_obs'`` (default False).
    """

    def __init__(self, env: SimplePlant, settings: dict):
        self.env = SimplePlantSB(env.settings, env.stoch_model)
        self.last_inventory = env.inventory_level
        self.model_name = settings['model_name']
        self.experiment_name = settings['experiment_name']
        self.parallelization = settings['parallelization']
        # 'dict_obs' is optional; replaces the original bare try/except.
        self.dict_obs = settings.get('dict_obs', False)

        self.POSSIBLE_STATES = self.env.n_items + 1
        self.env.cost_to_reward = True
        self.epsilon = 0

        # Logs live one directory above this file, under 'logs/'.
        BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        self.LOG_DIR = os.path.join(BASE_DIR, 'logs')

        print("============================================================================================")

        # This agent uses a discrete (MultiDiscrete) action space.
        self.has_continuous_action_space = False

        self.max_ep_len = env.settings['time_horizon']
        # Temperature for action selection, annealed linearly during learn().
        self.tau = 1
        self.tau_start = 1.0
        self.tau_end = 2.0

        self.print_freq = self.max_ep_len * 40          # timesteps between console reports
        self.log_freq = self.max_ep_len * 40            # timesteps between CSV log rows
        self.save_model_freq = int(4999)                # timesteps between checkpoints

        # Continuous-action std schedule (unused while
        # has_continuous_action_space is False; kept for completeness).
        self.action_std = 0.6
        self.action_std_decay_rate = 0.05
        self.min_action_std = 0.1
        self.action_std_decay_freq = int(2.5e5)

        # PPO hyper-parameters.
        self.update_timestep = int(self.max_ep_len * 4)  # policy update period
        self.K_epochs = 40
        self.buffer_size_mul = 5                         # buffer keeps this many update windows

        self.eps_clip = 0.20
        self.gamma = 0.90

        self.lr_actor = 0.00055
        self.lr_critic = 0.001

        self.random_seed = 0
        self.run_num_pretrained = 0

        print("training environment name : " + self.experiment_name + '_PDPPO')

        self.state_dim = self.env.observation_space.shape[0]

        # NOTE(review): in the discrete branch the *whole* action space
        # object is passed as action_dim — PDPPO apparently unpacks it
        # itself; confirm against agents/PDPPO.
        if self.has_continuous_action_space:
            self.action_dim = self.env.action_space.shape[0]
        else:
            self.action_dim = self.env.action_space

        self.pdppo_agent = PDPPO(
            self.state_dim, self.action_dim, self.lr_actor, self.lr_critic,
            self.gamma, self.K_epochs, self.eps_clip, copy.copy(self.env),
            self.has_continuous_action_space, self.tau, self.action_std
        )

    def learn(self, n_episodes=100000):
        """Run the PDPPO training loop.

        NOTE(review): despite its name, ``n_episodes`` is used as a
        *timestep* budget (``max_training_timesteps``) — kept for
        interface compatibility.
        """
        self.max_training_timesteps = n_episodes
        env = self.env

        # ----- logging / checkpoint setup --------------------------------
        log_dir = os.path.join(self.LOG_DIR, self.experiment_name + '_PDPPO')
        os.makedirs(log_dir, exist_ok=True)

        # One CSV per run; number runs by the files already present.
        run_num = len(next(os.walk(log_dir))[2])
        log_f_name = os.path.join(
            log_dir, 'PDPPO_' + self.experiment_name + "_log_" + str(run_num) + ".csv"
        )

        print("current logging run number for " + self.experiment_name + " : ", run_num)
        print("logging at : " + log_f_name)

        # Checkpoints are stored next to the logs.
        checkpoint_path = os.path.join(
            log_dir,
            "PDPPO_{}_{}_{}.pth".format(
                self.experiment_name, self.random_seed, self.run_num_pretrained
            ),
        )
        print("save checkpoint path : " + checkpoint_path)

        # ----- configuration summary -------------------------------------
        print("--------------------------------------------------------------------------------------------")
        print("max training timesteps : ", self.max_training_timesteps)
        print("max timesteps per episode : ", self.max_ep_len)
        print("model saving frequency : " + str(self.save_model_freq) + " timesteps")
        print("log frequency : " + str(self.log_freq) + " timesteps")
        print("printing average reward over episodes in last : " + str(self.print_freq) + " timesteps")
        print("--------------------------------------------------------------------------------------------")
        print("state space dimension : ", self.state_dim)
        print("action space dimension : ", self.action_dim)
        print("--------------------------------------------------------------------------------------------")
        if self.has_continuous_action_space:
            print("Initializing a continuous action space policy")
            print("--------------------------------------------------------------------------------------------")
            print("starting std of action distribution : ", self.action_std)
            print("decay rate of std of action distribution : ", self.action_std_decay_rate)
            # BUG FIX: originally printed action_std here instead of the minimum.
            print("minimum std of action distribution : ", self.min_action_std)
            print("decay frequency of std of action distribution : " + str(self.action_std_decay_freq) + " timesteps")
        else:
            print("Initializing a discrete action space policy")
        print("--------------------------------------------------------------------------------------------")
        print("PDPPO update frequency : " + str(self.update_timestep) + " timesteps")
        print("PDPPO K epochs : ", self.K_epochs)
        print("PDPPO epsilon clip : ", self.eps_clip)
        print("discount factor (self.gamma) : ", self.gamma)
        print("--------------------------------------------------------------------------------------------")
        print("optimizer learning rate actor : ", self.lr_actor)
        print("optimizer learning rate critic : ", self.lr_critic)
        if self.random_seed:
            print("--------------------------------------------------------------------------------------------")
            # NOTE(review): the seed is only printed, never applied.
            print("setting random seed to ", self.random_seed)

        print("============================================================================================")

        # Fresh agent for this training run.
        self.pdppo_agent = PDPPO(
            self.state_dim, self.action_dim, self.lr_actor, self.lr_critic,
            self.gamma, self.K_epochs, self.eps_clip, copy.copy(self.env),
            self.has_continuous_action_space, self.tau, self.action_std
        )

        start_time = datetime.now().replace(microsecond=0)
        print("Started training at (GMT) : ", start_time)
        print("============================================================================================")

        print_running_reward = 0
        print_running_episodes = 0
        log_running_reward = 0
        log_running_episodes = 0
        time_step = 0
        i_episode = 0

        # Linear temperature schedule from tau_start to tau_end
        # (loop-invariant, so computed once).
        annealing_steps = self.max_training_timesteps
        anneal_rate = (self.tau_end - self.tau_start) / annealing_steps

        # 'with' guarantees the log file is closed even if training raises.
        with open(log_f_name, "w+") as log_f:
            log_f.write('episode,timestep,reward\n')

            while time_step <= self.max_training_timesteps:
                # BUG FIX: clamp the increasing schedule with min();
                # the original max(tau_end, ...) pinned tau at tau_end
                # from the very first step, making the anneal dead code.
                self.tau = min(self.tau_end, self.tau_start + anneal_rate * time_step)

                state = env.reset()
                current_ep_reward = 0

                for t in range(1, self.max_ep_len + 1):
                    action, post_reward = self.pdppo_agent.select_action(state, self.tau)
                    state, reward, done, _ = env.step(action)

                    # Shaped reward: environment reward minus the agent's
                    # posterior reward estimate.
                    self.pdppo_agent.buffer.rewards.append(reward - post_reward.item())
                    self.pdppo_agent.buffer.is_terminals.append(done)

                    time_step += 1
                    current_ep_reward += reward

                    if time_step % self.update_timestep == 0:
                        self.pdppo_agent.update()
                        # Keep at most buffer_size_mul update windows.
                        if time_step > self.update_timestep * self.buffer_size_mul:
                            self.pdppo_agent.buffer.clear(self.update_timestep)

                    if self.has_continuous_action_space and time_step % self.action_std_decay_freq == 0:
                        # BUG FIX: the original called the non-existent
                        # attribute 'decay_self.action_std' (AttributeError
                        # if this branch ever ran) and passed action_std
                        # where the minimum belongs.
                        self.pdppo_agent.decay_action_std(
                            self.action_std_decay_rate, self.min_action_std
                        )

                    if time_step % self.log_freq == 0:
                        log_avg_reward = round(
                            log_running_reward / log_running_episodes, 4
                        )
                        log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                        log_f.flush()
                        log_running_reward = 0
                        log_running_episodes = 0

                    if time_step % self.print_freq == 0:
                        print_avg_reward = round(
                            print_running_reward / print_running_episodes, 2
                        )
                        print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))
                        print_running_reward = 0
                        print_running_episodes = 0

                    if time_step % self.save_model_freq == 0:
                        print("--------------------------------------------------------------------------------------------")
                        self.pdppo_agent.save(checkpoint_path)
                        print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
                        print("--------------------------------------------------------------------------------------------")

                    if done:
                        break

                print_running_reward += current_ep_reward
                print_running_episodes += 1
                log_running_reward += current_ep_reward
                log_running_episodes += 1
                i_episode += 1

        print("============================================================================================")
        end_time = datetime.now().replace(microsecond=0)
        print("Started training at (GMT) : ", start_time)
        print("Finished training at (GMT) : ", end_time)
        print("Total training time  : ", end_time - start_time)
        print("============================================================================================")

    def get_action(self, state):
        """Return the policy's (action, posterior reward) for ``state``.

        Dict observations are flattened when ``dict_obs`` is False,
        mirroring :meth:`SimplePlantSB._next_observation`.

        Raises
        ------
        ValueError
            If a flat observation arrives while ``dict_obs`` is True.
        """
        if isinstance(state, dict):
            if not self.dict_obs:
                state = np.concatenate(
                    (
                        state['inventory_level'],
                    )
                )
        else:
            if self.dict_obs:
                # BUG FIX: the original raised a bare string (TypeError).
                raise ValueError('Change dict_obs to False')
        return self.pdppo_agent.select_action(state, self.tau)

    def load_agent(self, path):
        """Load network weights from the standard checkpoint location.

        NOTE(review): ``path`` is currently ignored — the checkpoint
        path is reconstructed from LOG_DIR and the experiment settings;
        the parameter is kept for interface compatibility.
        """
        directory = os.path.join(self.LOG_DIR, self.experiment_name + '_PDPPO')
        checkpoint_path = os.path.join(
            directory,
            "PDPPO_{}_{}_{}.pth".format(
                self.experiment_name, self.random_seed, self.run_num_pretrained
            ),
        )
        print("loading network from : " + checkpoint_path)
        self.pdppo_agent.load(checkpoint_path)