import os                              # File system interaction
import sys                             # Interpreter internals such as sys.path
import glob                            # Unix-shell-style pathname pattern matching
import time                            # Time-related functions
from datetime import datetime          # Classes for working with dates and times
import numpy as np                     # Large, multi-dimensional arrays and matrices
import gym                             # Collection of reinforcement learning environments
import torch                           # Tensor computation with GPU acceleration
import copy                            # Shallow and deep copy operations
import matplotlib.pyplot as plt        # Plotting library
import matplotlib.patches as mpatches  # Colored patches, e.g. for plot legends

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
AGENTS_DIR = os.path.join(BASE_DIR, 'agents')
sys.path.append(AGENTS_DIR)

from agents.PDPPOonecritic import PDPPOonecritic
from envs import SimplePlant


class SimplePlantSB(SimplePlant):

    def __init__(self, settings, stoch_model):
        super().__init__(settings, stoch_model)
        self.dict_obs = settings.get('dict_obs', False)
        self.last_inventory = copy.copy(self.inventory_level)
        self.action_space = gym.spaces.MultiDiscrete(
            [self.n_items + 1] * self.n_machines
        )
        if self.dict_obs:
            self.observation_space = gym.spaces.Dict({
                'inventory_level': gym.spaces.Box(
                    low=np.zeros(self.n_items),
                    high=np.ones(self.n_items) * (settings['max_inventory_level'][0] + 1) * self.n_items
                ),
                # 'machine_setup': gym.spaces.MultiDiscrete([self.n_items + 1] * self.n_machines),
                # 'last_inventory_level': gym.spaces.Box(
                #     low=np.zeros(self.n_items),
                #     high=np.ones(self.n_items) * (settings['max_inventory_level'][0] + 1) * self.n_items
                # )
            })
        else:
            self.observation_space = gym.spaces.Box(
                low=np.zeros(self.n_items),
                high=np.concatenate([
                    np.array(self.max_inventory_level),              # high for the inventory level
                    # np.ones(self.n_machines) * (self.n_items + 1), # high for the machine setups
                    # np.array(self.max_inventory_level)             # high for the last inventory level
                ]),
                dtype=np.int32
            )

    def step(self, action):
        """
        Execute one time step within the environment.

        Parameters
        ----------
        action : action given by the agent

        Returns
        -------
        obs : observation of the next state, from _next_observation
        reward : negated total cost returned by _take_action
        done : True once the episode horizon T is reached
        info : the cost breakdown dict, useful for monitoring
        """
        self.last_inventory = copy.copy(self.inventory_level)
        self.total_cost = self._take_action(
            action, self.machine_setup, self.inventory_level, self.demand
        )
        # self.total_cost['setup_costs'] = 0
        # self.total_cost['holding_costs'] = 0
        reward = -sum(self.total_cost.values())
        # reward = -self.total_cost['lost_sales']
        # reward = np.abs(action)
        self.current_step += 1
        done = self.current_step == self.T
        obs = self._next_observation()
        return obs, reward, done, self.total_cost

    def _next_observation(self):
        """Return the next observation, flattened unless dict_obs is set."""
        obs = SimplePlant._next_observation(self)
        # obs['last_inventory_level'] = copy.copy(self.last_inventory)
        if isinstance(obs, dict):
            if not self.dict_obs:
                obs = np.concatenate((
                    obs['inventory_level'],         # n_items entries
                    # obs['machine_setup'],         # n_machines entries
                    # obs['last_inventory_level'],  # n_items entries
                ))
        else:
            if self.dict_obs:
                raise ValueError('Change dict_obs to False')
        return obs
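# ------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline above): what the
# flattened observation of SimplePlantSB looks like when dict_obs is
# False. The sample inventory values are hypothetical.
# ------------------------------------------------------------------
def _example_flatten_obs():
    sample = {'inventory_level': np.array([3, 0, 7])}    # hypothetical n_items = 3
    flat = np.concatenate((sample['inventory_level'],))  # same flattening as _next_observation
    return flat  # -> array([3, 0, 7]), the 1-D vector fed to the agent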
class PDPPOAgent_one_critic():

    def __init__(self, env: SimplePlant, settings: dict):
        self.env = SimplePlantSB(env.settings, env.stoch_model)
        self.last_inventory = env.inventory_level
        self.model_name = settings['model_name']
        self.experiment_name = settings['experiment_name']
        self.parallelization = settings['parallelization']
        self.dict_obs = settings.get('dict_obs', False)
        self.POSSIBLE_STATES = self.env.n_items + 1
        self.env.cost_to_reward = True
        self.epsilon = 0
        # Use the logs folder in the project root.
        BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        self.LOG_DIR = os.path.join(BASE_DIR, 'logs')

        print("============================================================================================")

        ####### initialize environment hyperparameters ######
        self.has_continuous_action_space = False  # continuous action space; else discrete
        self.max_ep_len = 1000                    # max timesteps in one episode
        self.tau = 1
        self.tau_start = 1.0                      # initial value of tau
        self.tau_end = 2.0                        # final value of tau
        self.print_freq = self.max_ep_len * 4     # print avg reward in the interval (in num timesteps)
        self.log_freq = self.max_ep_len * 4       # log avg reward in the interval (in num timesteps)
        self.save_model_freq = int(4999)          # save model frequency (in num timesteps)
        self.action_std = 0.6                     # starting std for the action distribution (Multivariate Normal)
        self.action_std_decay_rate = 0.05         # linear decay: action_std -= action_std_decay_rate
        self.min_action_std = 0.1                 # minimum action_std (stop decay once action_std <= min_action_std)
        self.action_std_decay_freq = int(2.5e5)   # action_std decay frequency (in num timesteps)
        #####################################################

        ## Note: print/log frequencies should be > self.max_ep_len

        ################ PDPPO_one_critic hyperparameters ################
        self.update_timestep = int(self.max_ep_len * 4)  # update policy every n timesteps
        self.K_epochs = 40           # update policy for K epochs in one PDPPO update
        self.buffer_size_mul = 5     # buffer size multiplier
        self.eps_clip = 0.20         # clip parameter for PDPPO
        self.gamma = 0.90            # discount factor
        self.lr_actor = 0.00055      # learning rate for the actor network
        self.lr_critic = 0.001       # learning rate for the critic network
        self.random_seed = 0         # set random seed if required (0 = no random seed)
        #####################################################

        self.run_num_pretrained = 0  #### change this to prevent overwriting weights in the same self.experiment_name folder

        print("training environment name : " + self.experiment_name + '_PDPPO_one_critic')

        # state space dimension
        self.state_dim = self.env.observation_space.shape[0]

        # action space dimension
        if self.has_continuous_action_space:
            self.action_dim = self.env.action_space.shape[0]
        else:
            self.action_dim = self.env.action_space

        self.pdppo_agent = PDPPOonecritic(
            self.state_dim, self.action_dim,
            self.lr_actor, self.lr_critic,
            self.gamma, self.K_epochs, self.eps_clip,
            copy.copy(self.env),
            self.has_continuous_action_space,
            self.tau, self.action_std
        )
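    # ------------------------------------------------------------------
    # Illustrative sketch (never called by the training code): with the
    # hyperparameters set in __init__, a linear action-std schedule
    # starts at 0.6, loses 0.05 every 2.5e5 timesteps, and is floored
    # at 0.1, i.e. the floor is reached after 10 decays (2.5e6
    # timesteps). Only relevant when has_continuous_action_space is True.
    # ------------------------------------------------------------------
    def _example_action_std_schedule(self, time_step):
        n_decays = time_step // self.action_std_decay_freq
        return max(self.min_action_std,
                   self.action_std - n_decays * self.action_std_decay_rate)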
    ################################### Training ###################################
    def learn(self, n_episodes=100000):
        ###################### logging ######################
        self.max_training_timesteps = n_episodes  # break training loop if timesteps > self.max_training_timesteps
        env = self.env

        #### log files for multiple runs are NOT overwritten
        log_dir = self.LOG_DIR
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        log_dir = log_dir + '/' + self.experiment_name + '_PDPPO_one_critic/'
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)

        #### get the number of log files in the log directory
        run_num = 0
        current_num_files = next(os.walk(log_dir))[2]
        run_num = len(current_num_files)

        #### create a new log file for each run
        log_f_name = log_dir + '/PDPPO_one_critic_' + self.experiment_name + "_log_" + str(run_num) + ".csv"

        print("current logging run number for " + self.experiment_name + " : ", run_num)
        print("logging at : " + log_f_name)
        #####################################################

        ################### checkpointing ###################
        directory = self.LOG_DIR
        if not os.path.exists(directory):
            os.makedirs(directory)

        directory = directory + '/' + self.experiment_name + '_PDPPO_one_critic' + '/'
        if not os.path.exists(directory):
            os.makedirs(directory)

        checkpoint_path = directory + "PDPPO_one_critic_{}_{}_{}.pth".format(self.experiment_name, self.random_seed, self.run_num_pretrained)
        print("save checkpoint path : " + checkpoint_path)
        #####################################################

        ############# print all hyperparameters #############
        print("--------------------------------------------------------------------------------------------")
        print("max training timesteps : ", self.max_training_timesteps)
        print("max timesteps per episode : ", self.max_ep_len)
        print("model saving frequency : " + str(self.save_model_freq) + " timesteps")
        print("log frequency : " + str(self.log_freq) + " timesteps")
        print("printing average reward over episodes in last : " + str(self.print_freq) + " timesteps")
        print("--------------------------------------------------------------------------------------------")
        print("state space dimension : ", self.state_dim)
        print("action space dimension : ", self.action_dim)
        print("--------------------------------------------------------------------------------------------")
        if self.has_continuous_action_space:
            print("Initializing a continuous action space policy")
            print("--------------------------------------------------------------------------------------------")
            print("starting std of action distribution : ", self.action_std)
            print("decay rate of std of action distribution : ", self.action_std_decay_rate)
            print("minimum std of action distribution : ", self.min_action_std)
            print("decay frequency of std of action distribution : " + str(self.action_std_decay_freq) + " timesteps")
        else:
            print("Initializing a discrete action space policy")
        print("--------------------------------------------------------------------------------------------")
        print("PDPPO_one_critic update frequency : " + str(self.update_timestep) + " timesteps")
        print("PDPPO_one_critic K epochs : ", self.K_epochs)
        print("PDPPO_one_critic epsilon clip : ", self.eps_clip)
        print("discount factor (self.gamma) : ", self.gamma)
        print("--------------------------------------------------------------------------------------------")
        print("optimizer learning rate actor : ", self.lr_actor)
        print("optimizer learning rate critic : ", self.lr_critic)
        if self.random_seed:
            print("--------------------------------------------------------------------------------------------")
            print("setting random seed to ", self.random_seed)
        #####################################################
        print("============================================================================================")

        ################# training procedure ################
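        # ------------------------------------------------------------------
        # Tau schedule (illustrative walkthrough of the loop below): tau is
        # interpolated linearly from tau_start to tau_end over the training
        # horizon and capped at tau_end. E.g. with tau_start = 1.0,
        # tau_end = 2.0 and 100000 training timesteps,
        # tau(50000) = 1.0 + (2.0 - 1.0) / 100000 * 50000 = 1.5.
        # How tau shapes action selection is internal to select_action.
        # ------------------------------------------------------------------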
        # initialize a PDPPO_one_critic agent
        self.pdppo_agent = PDPPOonecritic(
            self.state_dim, self.action_dim,
            self.lr_actor, self.lr_critic,
            self.gamma, self.K_epochs, self.eps_clip,
            copy.copy(self.env),
            self.has_continuous_action_space,
            self.tau, self.action_std
        )

        # track total training time
        start_time = datetime.now().replace(microsecond=0)
        print("Started training at (GMT) : ", start_time)
        print("============================================================================================")

        # logging file
        log_f = open(log_f_name, "w+")
        log_f.write('episode,timestep,reward\n')

        # printing and logging variables
        print_running_reward = 0
        print_running_episodes = 0
        log_running_reward = 0
        log_running_episodes = 0

        time_step = 0
        i_episode = 0
        annealing_steps = self.max_training_timesteps  # total number of training steps

        # training loop
        while time_step <= self.max_training_timesteps:

            anneal_rate = (self.tau_end - self.tau_start) / annealing_steps  # rate of tau increase per step
            self.tau = min(self.tau_end, self.tau_start + anneal_rate * time_step)

            state = env.reset()
            current_ep_reward = 0

            for t in range(1, self.max_ep_len + 1):

                # select action with policy
                action, post_reward = self.pdppo_agent.select_action(state, self.tau)
                state, reward, done, _ = env.step(action)

                # saving reward and is_terminals
                self.pdppo_agent.buffer.rewards.append(reward - post_reward.item())
                self.pdppo_agent.buffer.is_terminals.append(done)

                time_step += 1
                current_ep_reward += reward

                # update PDPPO agent
                if time_step % self.update_timestep == 0:
                    self.pdppo_agent.update()
                    if time_step > self.update_timestep * self.buffer_size_mul:
                        self.pdppo_agent.buffer.clear(self.update_timestep)

                # if continuous action space, decay the action std of the output action distribution
                if self.has_continuous_action_space and time_step % self.action_std_decay_freq == 0:
                    self.pdppo_agent.decay_action_std(self.action_std_decay_rate, self.min_action_std)

                # log in logging file
                if time_step % self.log_freq == 0:
                    # log average reward since the last logging point
                    log_avg_reward = log_running_reward / log_running_episodes
                    log_avg_reward = round(log_avg_reward, 4)
                    log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                    log_f.flush()
                    log_running_reward = 0
                    log_running_episodes = 0

                # printing average reward
                if time_step % self.print_freq == 0:
                    # print average reward since the last print point
                    print_avg_reward = print_running_reward / print_running_episodes
                    print_avg_reward = round(print_avg_reward, 2)
                    print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))
                    print_running_reward = 0
                    print_running_episodes = 0

                # save model weights
                if time_step % self.save_model_freq == 0:
                    print("--------------------------------------------------------------------------------------------")
                    # print("saving model at : " + checkpoint_path)
                    self.pdppo_agent.save(checkpoint_path)
                    # print("model saved")
                    print("Elapsed Time : ", datetime.now().replace(microsecond=0) - start_time)
                    print("--------------------------------------------------------------------------------------------")

                # break if the episode is over
                if done:
                    break

            print_running_reward += current_ep_reward
            print_running_episodes += 1

            log_running_reward += current_ep_reward
            log_running_episodes += 1

            i_episode += 1

        log_f.close()
        # env.close()

        # print total training time
        print("============================================================================================")
        end_time = datetime.now().replace(microsecond=0)
        print("Started training at (GMT) : ", start_time)
        print("Finished training at (GMT) : ", end_time)
        print("Total training time : ", end_time - start_time)
        print("============================================================================================")
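    # ------------------------------------------------------------------
    # Optional helper (a sketch, not called by learn): plot the average
    # reward curve from one of the CSV logs written by learn. The column
    # names match the header 'episode,timestep,reward' written above;
    # the method itself is an addition for illustration.
    # ------------------------------------------------------------------
    def plot_training_log(self, log_f_name):
        data = np.genfromtxt(log_f_name, delimiter=',', names=True)
        plt.plot(data['timestep'], data['reward'])
        plt.xlabel('timestep')
        plt.ylabel('average reward')
        plt.title(self.experiment_name + '_PDPPO_one_critic')
        plt.show()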
print("Total training time : ", end_time - start_time) print("============================================================================================") def get_action(self,state): if isinstance(state, dict): if not self.dict_obs: state = np.concatenate( ( state['inventory_level'], # n_items size state['machine_setup'], # n_machine size ) ) else: if self.dict_obs: raise('Change dict_obst to False') return self.pdppo_agent.select_action(state,self.tau) def load_agent(self,path): #directory = "PDPPO_preTrained" + '/' + env_name + '/' directory = self.LOG_DIR directory = directory + '/' + self.experiment_name + '_PDPPO_one_critic' + '/' checkpoint_path = directory + "PDPPO_one_critic_{}_{}_{}.pth".format(self.experiment_name, self.random_seed, self.run_num_pretrained) print("loading network from : " + checkpoint_path) self.pdppo_agent.load(checkpoint_path)