import gym
import random
import utils
import keras
import numpy as np

from matplotlib import pyplot as plt

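# Upside-Down Reinforcement Learning (UDRL) on Atari: instead of learning a
# value function, a behaviour function B(observation, command) -> action
# distribution is trained with supervised learning on replayed episodes,
# where a command is the pair (desired return, desired horizon). The helpers
# imported from `utils` (pre_processing, get_atari_behaviour_function,
# save_results, save_trained_model) are assumed to live alongside this file.
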
class ReplayBuffer():
    """
    Episodic replay buffer that keeps the `max_size` highest-return episodes.
    Thank you: https://github.com/BY571/
    """

    def __init__(self, max_size):
        self.max_size = max_size
        self.buffer = []

    def add_sample(self, states, actions, rewards):
        episode = {"states": states,
                   "actions": actions,
                   "rewards": rewards,
                   "summed_rewards": sum(rewards)}
        self.buffer.append(episode)

    def sort(self):
        # Sort episodes by total return (best first) and keep the top max_size.
        self.buffer = sorted(self.buffer,
                             key=lambda i: i["summed_rewards"],
                             reverse=True)
        self.buffer = self.buffer[:self.max_size]

    def get_random_samples(self, batch_size):
        self.sort()
        idxs = np.random.randint(0, len(self.buffer), batch_size)
        batch = [self.buffer[idx] for idx in idxs]
        return batch

    def get_n_best(self, n):
        self.sort()
        return self.buffer[:n]

    def __len__(self):
        return len(self.buffer)
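
# A minimal usage sketch (hypothetical data, just to show the interface):
#
#   memory = ReplayBuffer(max_size=700)
#   memory.add_sample(states=[s0, s1], actions=[a0, a1], rewards=[0.0, 1.0])
#   batch = memory.get_random_samples(batch_size=256)
#   best = memory.get_n_best(50)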


class UpsideDownAgent():
    def __init__(self, environment, approximator):
        self.environment = gym.make(environment)
        self.approximator = approximator
        self.state_size = (84, 84, 4)   # 4 stacked 84x84 grayscale frames
        self.action_size = 3            # mapped onto ALE actions 1, 2, 3
        self.warm_up_episodes = 1
        self.render = False
        self.memory = ReplayBuffer(700)
        self.last_few = 50              # episodes used for exploratory commands
        self.batch_size = 256
        self.command_size = 2           # (desired return, desired horizon)
        self.desired_return = 1
        self.desired_horizon = 1
        self.horizon_scale = 0.02       # scaling applied to commands before
        self.return_scale = 0.02        # they are fed to the network

        self.behaviour_function = utils.get_atari_behaviour_function(self.action_size)

        self.testing_rewards = []
        self.warm_up_buffer()

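    # `utils.get_atari_behaviour_function` is assumed to return a compiled
    # Keras model with two inputs, an (84, 84, 4) frame stack and a
    # 2-dimensional command, and a softmax output over `action_size` actions.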

    def warm_up_buffer(self):
        """Fill the replay buffer with a few episodes before training starts."""
        print('Warming up')

        for i in range(self.warm_up_episodes):

            states = []
            rewards = []
            actions = []

            dead = False
            done = False
            desired_return = 1
            desired_horizon = 1

            step, score, start_life = 0, 0, 5
            observe = self.environment.reset()

            # Random number of initial steps (action 1) to randomise the start.
            for _ in range(random.randint(1, 30)):
                observe, _, _, _ = self.environment.step(1)

            state = utils.pre_processing(observe)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 84, 84, 4))

            while not done:

                states.append(history)
                command = np.asarray([desired_return * self.return_scale,
                                      desired_horizon * self.horizon_scale])
                command = np.reshape(command, [1, len(command)])

                action = self.get_action(history, command)
                actions.append(action)

                # The agent's 3 actions are mapped onto ALE actions 1-3.
                if action == 0:
                    real_action = 1
                elif action == 1:
                    real_action = 2
                else:
                    real_action = 3

                observe, reward, done, info = self.environment.step(real_action)

                # Preprocess the new frame and push it onto the frame stack.
                next_state = utils.pre_processing(observe)
                next_state = np.reshape([next_state], (1, 84, 84, 1))
                next_history = np.append(next_state, history[:, :, :, :3], axis=3)

                rewards.append(reward)

                # Breakout-style life tracking (Pong itself reports no lives).
                if start_life > info['ale.lives']:
                    dead = True
                    start_life = info['ale.lives']

                if dead:
                    dead = False
                else:
                    history = next_history

                # Consume the obtained reward and one step of the horizon.
                desired_return -= reward
                desired_horizon -= 1
                desired_horizon = np.maximum(desired_horizon, 1)

            self.memory.add_sample(states, actions, rewards)



    def get_action(self, observation, command):
        """
        We sample from the action distribution modeled by the behaviour function.
        """
        # Frames are stored as uint8; normalise to [0, 1] before inference.
        observation = np.float32(observation / 255.0)

        action_probs = self.behaviour_function.predict([observation, command])
        action = np.random.choice(np.arange(0, self.action_size), p=action_probs[0])

        return action
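
    # Example call (hypothetical values): given a (1, 84, 84, 4) frame stack
    # `history` and a command asking for return 10 within 100 steps:
    #
    #   command = np.array([[10 * self.return_scale, 100 * self.horizon_scale]])
    #   action = self.get_action(history, command)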

    def get_greedy_action(self, observation, command):
        """Pick the most probable action (used when evaluating the agent)."""
        # Same normalisation as in get_action.
        observation = np.float32(observation / 255.0)

        action_probs = self.behaviour_function.predict([observation, command])
        action = np.argmax(action_probs[0])

        return action

    def train_behaviour_function(self):
        """
        Hindsight training: sample episode segments and teach the network to
        reproduce the action taken at t1, given the command that the segment
        actually achieved (its summed reward and its length).
        """
        random_episodes = self.memory.get_random_samples(self.batch_size)

        training_observations = np.zeros((self.batch_size,
                                          self.state_size[0],
                                          self.state_size[1],
                                          self.state_size[2]))
        training_commands = np.zeros((self.batch_size, 2))

        y = []

        for idx, episode in enumerate(random_episodes):
            # Pick a segment [t1, t2); assumes episodes have at least two steps.
            T = len(episode['states'])
            t1 = np.random.randint(0, T - 1)
            t2 = np.random.randint(t1 + 1, T)

            state = np.float32(episode['states'][t1] / 255.)
            desired_return = sum(episode["rewards"][t1:t2])
            desired_horizon = t2 - t1

            target = episode['actions'][t1]

            training_observations[idx] = state[0]
            training_commands[idx] = np.asarray([desired_return * self.return_scale,
                                                 desired_horizon * self.horizon_scale])
            y.append(target)

        _y = keras.utils.to_categorical(y, num_classes=self.action_size)

        self.behaviour_function.fit([training_observations, training_commands], _y, verbose=0)
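
    # Worked example of the relabeling above (made-up numbers): if the segment
    # rewards between t1 and t2 are [0, 1, 0, 1] with t2 - t1 = 4, the training
    # pair is command = (2 * return_scale, 4 * horizon_scale), with the action
    # actually taken at t1 as the classification target.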


    def sample_exploratory_commands(self):
        """
        Derive a fresh command from the `last_few` best episodes: the desired
        horizon is their mean length, and the desired return is drawn uniformly
        between the mean return and one standard deviation above it.
        """
        best_episodes = self.memory.get_n_best(self.last_few)
        exploratory_desired_horizon = np.mean([len(i["states"]) for i in best_episodes])

        returns = [i["summed_rewards"] for i in best_episodes]
        exploratory_desired_returns = np.random.uniform(np.mean(returns),
                                                        np.mean(returns) + np.std(returns))

        return [exploratory_desired_returns, exploratory_desired_horizon]
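
    # For instance (made-up numbers): if the 50 best episodes average a return
    # of 10.0 with a standard deviation of 2.0 and last 120 steps on average,
    # the sampled command is (U(10.0, 12.0), 120.0).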

    def generate_episode(self, environment, e, desired_return, desired_horizon, testing):
        """
        Roll out one episode while following the given command. During training
        the action is sampled; during testing the greedy action is taken and
        nothing is appended to the action list.
        """
        env = gym.make(environment)

        done = False
        dead = False

        states = []
        actions = []
        rewards = []

        step, score, start_life = 0, 0, 5

        observe = env.reset()

        # Random number of initial steps (action 1) to randomise the start.
        for _ in range(random.randint(1, 30)):
            observe, _, _, _ = env.step(1)

        state = utils.pre_processing(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 84, 84, 4))

        while not done:
            states.append(history)

            command = np.asarray([desired_return * self.return_scale,
                                  desired_horizon * self.horizon_scale])
            command = np.reshape(command, [1, len(command)])

            if not testing:
                action = self.get_action(history, command)
                actions.append(action)
            else:
                action = self.get_greedy_action(history, command)

            # The agent's 3 actions are mapped onto ALE actions 1-3.
            if action == 0:
                real_action = 1
            elif action == 1:
                real_action = 2
            else:
                real_action = 3

            observe, reward, done, info = env.step(real_action)

            # Preprocess the new frame and push it onto the frame stack.
            next_state = utils.pre_processing(observe)
            next_state = np.reshape([next_state], (1, 84, 84, 1))
            next_history = np.append(next_state, history[:, :, :, :3], axis=3)

            # Store clipped rewards for training, but track the raw score.
            clipped_reward = np.clip(reward, -1, 1)
            rewards.append(clipped_reward)

            score += reward

            # Breakout-style life tracking (Pong itself reports no lives).
            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            if dead:
                dead = False
            else:
                history = next_history

            # Consume the obtained reward and one step of the horizon.
            desired_return -= reward
            desired_horizon -= 1
            desired_horizon = np.maximum(desired_horizon, 1)

        self.memory.add_sample(states, actions, rewards)

        self.testing_rewards.append(score)

        if testing:
            print('Querying the model ...')
            print('Testing score: {}'.format(score))

        return score


def run_experiment():

    import argparse

    parser = argparse.ArgumentParser()

    parser.add_argument('--approximator', type=str, default='neural_network')
    parser.add_argument('--environment', type=str, default='PongDeterministic-v4')
    parser.add_argument('--seed', type=int, default=1)

    args = parser.parse_args()

    approximator = args.approximator
    environment = args.environment
    seed = args.seed

    episodes = 1500
    returns = []

    agent = UpsideDownAgent(environment, approximator)

    for e in range(episodes):

        print("Episode {}".format(e))

        # 100 supervised updates of the behaviour function per iteration.
        for i in range(100):
            agent.train_behaviour_function()

        print("Finished training B!")

        # Collect 15 fresh episodes, each with a newly sampled exploratory command.
        tmp_r = []
        for i in range(15):
            exploratory_commands = agent.sample_exploratory_commands()
            desired_return = exploratory_commands[0]
            desired_horizon = exploratory_commands[1]
            r = agent.generate_episode(environment, e, desired_return, desired_horizon, False)
            tmp_r.append(r)

        print(np.mean(tmp_r))
        returns.append(np.mean(tmp_r))

    utils.save_results(environment, approximator, seed, returns)

    if approximator == 'neural_network':
        utils.save_trained_model(environment, seed, agent.behaviour_function)

    plt.plot(returns)
    plt.show()
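
# Example invocation (the script name depends on how this file is saved):
#
#   python upside_down_atari.py --environment PongDeterministic-v4 --seed 1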


if __name__ == "__main__":
    run_experiment()