From 31e8b5051b041c38d9a4606d920f6ccae2cb823a Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Tue, 18 Feb 2020 13:59:45 +0100 Subject: [PATCH 01/10] New A2C example with entropy --- examples/actor_critic_cartpole.py | 192 ++++++++++++++++-------------- 1 file changed, 101 insertions(+), 91 deletions(-) diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 61c217e..4bf8a07 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -1,112 +1,122 @@ #!/usr/bin/env python3 -""" -Simple example of using cherry to solve cartpole with an actor-critic. - -The code is an adaptation of the PyTorch reinforcement learning example. -""" - -import random +import torch +import cherry import gym import numpy as np - from itertools import count -import torch as th -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim - -import cherry.envs as envs -from cherry.td import discount -from cherry import normalize -import cherry.distributions as distributions +SEED = 42 + +class A2C(torch.nn.Module): + def __init__(self): + super(A2C, self).__init__() + + self.gamma = 0.99 + self.vf_coef = 0.5 + self.ent_coef = 0.01 + + def select_action(self, state): + probs, value = self(state) + mass = torch.distributions.Categorical(probs) + action = mass.sample() + # Return selected action, logprob, value estimation and categorical entropy + return action, {"log_prob": mass.log_prob(action), "value": value, "entropy": mass.entropy()} + + + def learn_step(self, replay, optimizer): + policy_loss = [] + value_loss = [] + entropy_loss = [] + + # Discount and normalize rewards + rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done()) + rewards = cherry.normalize(rewards) + + # Value function error (MSE) + value_loss_fn = torch.nn.MSELoss() + for sars, reward in zip(replay, rewards): + log_prob = sars.log_prob + value = sars.value + entropy = sars.entropy + + # Compute advantage + advantage = reward - value.squeeze(0) + + # Compute policy gradient loss + # (advantage.detach() because you do not have to backward on the advantage path) + policy_loss.append(-log_prob * advantage.detach()) + # Compute value estimation loss + value_loss.append(value_loss_fn(value.squeeze(0), reward)) + # Compute entropy loss + entropy_loss.append(entropy) + + # Compute means over the accumulated errors + policy_loss = torch.stack(policy_loss).mean() + value_loss = torch.stack(value_loss).mean() + entropy_loss = torch.stack(entropy_loss).mean() + + # Take an optimization step + optimizer.zero_grad() + loss = policy_loss + self.vf_coef * value_loss - self.ent_coef * entropy_loss + loss.backward() + optimizer.step() + + + +class A2CPolicy(A2C): + def __init__(self, state_size, action_size): + super(A2CPolicy, self).__init__() + self.state_size = state_size + self.action_size = action_size + self.n_hidden = 128 + + # Backbone net + self.net = torch.nn.Sequential( + torch.nn.Linear(self.state_size, self.n_hidden), + torch.nn.LeakyReLU(), + torch.nn.Linear(self.n_hidden, self.n_hidden), + torch.nn.LeakyReLU(), + ) + + # Action head (policy gradient) + self.action_head = torch.nn.Sequential( + torch.nn.Linear(self.n_hidden, self.action_size), + torch.nn.Softmax(dim=1) + ) + + # Value estimation head (A2C) + self.value_head = torch.nn.Sequential( + torch.nn.Linear(self.n_hidden, 1), + ) -SEED = 567 -GAMMA = 0.99 -RENDER = False -V_WEIGHT = 0.5 - -random.seed(SEED) -np.random.seed(SEED) -th.manual_seed(SEED) - - -class 
ActorCriticNet(nn.Module): - def __init__(self, env): - super(ActorCriticNet, self).__init__() - self.affine1 = nn.Linear(env.state_size, 128) - self.action_head = nn.Linear(128, env.action_size) - self.value_head = nn.Linear(128, 1) - self.distribution = distributions.ActionDistribution(env, - use_probs=True) def forward(self, x): - x = F.relu(self.affine1(x)) - action_scores = self.action_head(x) - action_mass = self.distribution(F.softmax(action_scores, dim=1)) - value = self.value_head(x) - return action_mass, value - - -def update(replay, optimizer): - policy_loss = [] - value_loss = [] - - # Discount and normalize rewards - rewards = discount(GAMMA, replay.reward(), replay.done()) - rewards = normalize(rewards) - - # Compute losses - for sars, reward in zip(replay, rewards): - log_prob = sars.log_prob - value = sars.value - policy_loss.append(-log_prob * (reward - value.item())) - value_loss.append(F.mse_loss(value, reward.detach())) - - # Take optimization step - optimizer.zero_grad() - loss = th.stack(policy_loss).sum() + V_WEIGHT * th.stack(value_loss).sum() - loss.backward() - optimizer.step() - - -def get_action_value(state, policy): - mass, value = policy(state) - action = mass.sample() - info = { - 'log_prob': mass.log_prob(action), # Cache log_prob for later - 'value': value - } - return action, info - + # Return both the action probabilities and the value estimations + return self.action_head(self.net(x)), self.value_head(self.net(x)) if __name__ == '__main__': - env = gym.vector.make('CartPole-v0', num_envs=1) - env = envs.Logger(env, interval=1000) - env = envs.Torch(env) - env = envs.Runner(env) + env = gym.make('CartPole-v0') + env = cherry.envs.Logger(env, interval=1000) + env = cherry.envs.Torch(env) + env = cherry.envs.Runner(env) env.seed(SEED) - policy = ActorCriticNet(env) - optimizer = optim.Adam(policy.parameters(), lr=1e-2) - running_reward = 10.0 - get_action = lambda state: get_action_value(state, policy) + policy = A2CPolicy(env.state_size, env.action_size) + optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3) + running_reward = 10 for episode in count(1): - # We use the Runner collector, but could've written our own - replay = env.run(get_action, episodes=1) - - # Update policy - update(replay, optimizer) + replay = env.run(lambda state: policy.select_action(state), episodes=1) + policy.learn_step(replay, optimizer) - # Compute termination criterion - running_reward = running_reward * 0.99 + len(replay) * 0.01 - if episode % 10 == 0: - # Should start with 10.41, 12.21, 14.60, then 100:71.30, 200:135.74 - print(episode, running_reward) + running_reward = running_reward * 0.99 + replay.reward().sum() * 0.01 + if running_reward > 190.0: print('Solved! 
Running reward now {} and ' 'the last episode runs to {} time steps!'.format(running_reward, len(replay))) break + + while True: + env.run(lambda state: policy.select_action(state), episodes=1, render=True) From 819943eee88f35e2734e8328212887784e2ff3fe Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Wed, 19 Feb 2020 12:42:26 +0100 Subject: [PATCH 02/10] Removed rewards normalization --- examples/actor_critic_cartpole.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 4bf8a07..a84ab28 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -29,9 +29,8 @@ def learn_step(self, replay, optimizer): value_loss = [] entropy_loss = [] - # Discount and normalize rewards + # Discount rewards rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done()) - rewards = cherry.normalize(rewards) # Value function error (MSE) value_loss_fn = torch.nn.MSELoss() @@ -107,11 +106,11 @@ def forward(self, x): running_reward = 10 for episode in count(1): - replay = env.run(lambda state: policy.select_action(state), episodes=1) + replay = env.run(lambda state: policy.select_action(state), steps=5) policy.learn_step(replay, optimizer) running_reward = running_reward * 0.99 + replay.reward().sum() * 0.01 - + if episode % 10 == 0: print('Running reward: {}'.format(running_reward)) if running_reward > 190.0: print('Solved! Running reward now {} and ' 'the last episode runs to {} time steps!'.format(running_reward, From 85981d089287a8f382da9386dbb9ce6a11a2142e Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Wed, 19 Feb 2020 14:23:25 +0100 Subject: [PATCH 03/10] Use multiple environments --- examples/actor_critic_cartpole.py | 57 ++++++++++++++++--------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index a84ab28..0498ee5 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -5,16 +5,21 @@ import gym import numpy as np from itertools import count +import statistics -SEED = 42 +NUM_ENVS = 4 +STEPS = 5 +TRAIN_STEPS = int(1e4) class A2C(torch.nn.Module): - def __init__(self): + def __init__(self, num_envs): super(A2C, self).__init__() + self.num_envs = num_envs self.gamma = 0.99 - self.vf_coef = 0.5 + self.vf_coef = 0.25 self.ent_coef = 0.01 + self.max_clip_norm = 0.5 def select_action(self, state): probs, value = self(state) @@ -32,40 +37,43 @@ def learn_step(self, replay, optimizer): # Discount rewards rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done()) - # Value function error (MSE) - value_loss_fn = torch.nn.MSELoss() for sars, reward in zip(replay, rewards): - log_prob = sars.log_prob - value = sars.value - entropy = sars.entropy + log_prob = sars.log_prob.view(self.num_envs, -1) + value = sars.value.view(self.num_envs, -1) + entropy = sars.entropy.view(self.num_envs, -1) + reward = reward.view(self.num_envs, -1) # Compute advantage - advantage = reward - value.squeeze(0) + advantage = reward - value # Compute policy gradient loss # (advantage.detach() because you do not have to backward on the advantage path) policy_loss.append(-log_prob * advantage.detach()) # Compute value estimation loss - value_loss.append(value_loss_fn(value.squeeze(0), reward)) + value_loss.append((reward - value)**2) # Compute entropy loss entropy_loss.append(entropy) - # Compute means over the accumulated errors - policy_loss = 
torch.stack(policy_loss).mean() + + # Compute means over accumulated errors value_loss = torch.stack(value_loss).mean() + policy_loss = torch.stack(policy_loss).mean() entropy_loss = torch.stack(entropy_loss).mean() # Take an optimization step optimizer.zero_grad() loss = policy_loss + self.vf_coef * value_loss - self.ent_coef * entropy_loss loss.backward() + # Clip gradients + torch.nn.utils.clip_grad_norm_(self.parameters(), self.max_clip_norm) optimizer.step() + class A2CPolicy(A2C): - def __init__(self, state_size, action_size): - super(A2CPolicy, self).__init__() + def __init__(self, state_size, action_size, num_envs): + super(A2CPolicy, self).__init__(num_envs) self.state_size = state_size self.action_size = action_size self.n_hidden = 128 @@ -95,27 +103,20 @@ def forward(self, x): return self.action_head(self.net(x)), self.value_head(self.net(x)) if __name__ == '__main__': - env = gym.make('CartPole-v0') + env = gym.vector.make('CartPole-v0', num_envs=NUM_ENVS) env = cherry.envs.Logger(env, interval=1000) env = cherry.envs.Torch(env) env = cherry.envs.Runner(env) - env.seed(SEED) - policy = A2CPolicy(env.state_size, env.action_size) + policy = A2CPolicy(env.state_size, env.action_size, NUM_ENVS) optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3) - running_reward = 10 - for episode in count(1): - replay = env.run(lambda state: policy.select_action(state), steps=5) + for step in range(0, TRAIN_STEPS): + replay = env.run(lambda state: policy.select_action(state), steps=STEPS) policy.learn_step(replay, optimizer) - - running_reward = running_reward * 0.99 + replay.reward().sum() * 0.01 - if episode % 10 == 0: print('Running reward: {}'.format(running_reward)) - if running_reward > 190.0: - print('Solved! Running reward now {} and ' - 'the last episode runs to {} time steps!'.format(running_reward, - len(replay))) - break + env = gym.make('CartPole-v0') + env = cherry.envs.Torch(env) + env = cherry.envs.Runner(env) while True: env.run(lambda state: policy.select_action(state), episodes=1, render=True) From 93076adf672d36ff51107fd11aef1a1f0335f25f Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Wed, 19 Feb 2020 16:42:33 +0100 Subject: [PATCH 04/10] Without Runner wrapper and with RMSprop optimizer --- examples/actor_critic_cartpole.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 0498ee5..625aaa9 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -106,13 +106,19 @@ def forward(self, x): env = gym.vector.make('CartPole-v0', num_envs=NUM_ENVS) env = cherry.envs.Logger(env, interval=1000) env = cherry.envs.Torch(env) - env = cherry.envs.Runner(env) policy = A2CPolicy(env.state_size, env.action_size, NUM_ENVS) - optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3) - - for step in range(0, TRAIN_STEPS): - replay = env.run(lambda state: policy.select_action(state), steps=STEPS) + optimizer = torch.optim.RMSprop(policy.parameters(), lr=7e-4, eps=1e-5, alpha=0.99) + + state = env.reset() + for train_step in range(0, TRAIN_STEPS): + replay = cherry.ExperienceReplay() + for step in range(0, STEPS): + action, info = policy.select_action(state) + new_state, reward, done, _ = env.step(action) + replay.append(state, action, reward, new_state, done, **info) + state = new_state + policy.learn_step(replay, optimizer) env = gym.make('CartPole-v0') From 1f7ce5218d4cf8c5c5a2f1e3dbbb98cd049a6823 Mon Sep 17 00:00:00 2001 From: Federico 
Galatolo Date: Thu, 20 Feb 2020 16:36:03 +0100 Subject: [PATCH 05/10] Bootstrapping rewards as shown in Algorithm S3 in the A3C paper --- examples/actor_critic_cartpole.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 625aaa9..86fad78 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -34,8 +34,12 @@ def learn_step(self, replay, optimizer): value_loss = [] entropy_loss = []
- # Discount rewards
- rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done())
+ # Discount rewards bootstrapping them from the last estimated value
+ last_action, last_value = self(replay.state()[0,:,:])
+ # Bootstrap from zero if it is a terminal state
+ last_value = last_value*(1 - replay.done()[0])
+
+ rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done(), last_value)
 for sars, reward in zip(replay, rewards): log_prob = sars.log_prob.view(self.num_envs, -1)

From 9ab728fd02ac12c3124511524da4a173d2c83def Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Thu, 20 Feb 2020 18:35:07 +0100 Subject: [PATCH 06/10] Bugfix: use the last state for rewards bootstrapping --- examples/actor_critic_cartpole.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 86fad78..74b3a02 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -35,9 +35,9 @@ def learn_step(self, replay, optimizer): entropy_loss = []
 # Discount rewards bootstrapping them from the last estimated value
- last_action, last_value = self(replay.state()[0,:,:])
+ last_action, last_value = self(replay.state()[-1,:,:])
 # Bootstrap from zero if it is a terminal state
- last_value = last_value*(1 - replay.done()[0])
+ last_value = last_value*(1 - replay.done()[-1])
 rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done(), last_value)

From 0663ef0d7b4bf32a63883ba9021c2267c4d06f94 Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Fri, 21 Feb 2020 13:22:43 +0100 Subject: [PATCH 07/10] Bugfix: do not backward through the bootstrap path --- examples/actor_critic_cartpole.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 74b3a02..5ad762e 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -37,7 +37,7 @@ def learn_step(self, replay, optimizer): # Discount rewards bootstrapping them from the last estimated value last_action, last_value = self(replay.state()[-1,:,:]) # Bootstrap from zero if it is a terminal state
- last_value = last_value*(1 - replay.done()[-1])
+ last_value = (last_value*(1 - replay.done()[-1])).detach()
 rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done(), last_value)
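Note on patches 05-07: the n-step return is seeded with the critic's value of the last collected state and then detached, so no gradient flows back through the bootstrap term; patch 09 below reverts the detach, following Algorithm S3 of the A3C paper. The sketch below only illustrates that difference in plain PyTorch; the tensor shapes and variable names are invented for the example and are not cherry's API.

```python
import torch

# Toy critic head; shapes are made up (num_envs=6, state_size=4), not cherry's.
value_head = torch.nn.Linear(4, 1)
last_state = torch.randn(6, 4)
last_done = torch.tensor([0., 1., 0., 0., 1., 0.])

last_value = value_head(last_state)[:, 0]   # V(s_last), shape (num_envs,)
bootstrap = last_value * (1 - last_done)    # zero where the episode terminated

# Patch 07: detach() cuts the graph, so the n-step return carries no gradient
# back into the critic through the bootstrap term.
returns_detached = torch.ones(6) + 0.99 * bootstrap.detach()

# Patch 09's revert keeps the graph, so the return also backpropagates
# into V(s_last), as in Algorithm S3 of the A3C paper.
returns_attached = torch.ones(6) + 0.99 * bootstrap

print(returns_detached.requires_grad, returns_attached.requires_grad)  # False True
```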
From 04651329abc2781b47d4b2836ab2ae9bc949c76e Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Fri, 21 Feb 2020 14:22:09 +0100 Subject: [PATCH 08/10] Bootstrap tensor shape fix --- examples/actor_critic_cartpole.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 5ad762e..76678a9 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -7,7 +7,7 @@ from itertools import count import statistics
-NUM_ENVS = 4
+NUM_ENVS = 6
 STEPS = 5 TRAIN_STEPS = int(1e4)
@@ -37,10 +37,9 @@ def learn_step(self, replay, optimizer): # Discount rewards bootstrapping them from the last estimated value last_action, last_value = self(replay.state()[-1,:,:]) # Bootstrap from zero if it is a terminal state
- last_value = (last_value*(1 - replay.done()[-1])).detach()
+ last_value = (last_value[:, 0]*(1 - replay.done()[-1])).detach()
 rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done(), last_value)
-
 for sars, reward in zip(replay, rewards): log_prob = sars.log_prob.view(self.num_envs, -1) value = sars.value.view(self.num_envs, -1)

From 540b338922a616164adf6bef0c18ec237b2c66e9 Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Fri, 21 Feb 2020 14:53:42 +0100 Subject: [PATCH 09/10] Revert: according to the A3C paper Appendix Algorithm S3 we should backward through the bootstrap path --- cherry/td.py | 2 +- examples/actor_critic_cartpole.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/cherry/td.py b/cherry/td.py index 03338d9..c62a64c 100644 --- a/cherry/td.py +++ b/cherry/td.py @@ -52,7 +52,7 @@ def discount(gamma, rewards, dones, bootstrap=0.0): msg = 'dones and rewards must have equal length.' assert rewards.size(0) == dones.size(0), msg
- R = th.zeros_like(rewards[0]) + bootstrap
+ R = th.zeros_like(rewards) + bootstrap
 discounted = th.zeros_like(rewards) length = discounted.size(0) for t in reversed(range(length)):
diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 76678a9..b316b34 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -37,7 +37,7 @@ def learn_step(self, replay, optimizer): # Discount rewards bootstrapping them from the last estimated value last_action, last_value = self(replay.state()[-1,:,:]) # Bootstrap from zero if it is a terminal state
- last_value = (last_value[:, 0]*(1 - replay.done()[-1])).detach()
+ last_value = (last_value[:, 0]*(1 - replay.done()[-1]))
 rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done(), last_value) for sars, reward in zip(replay, rewards):

From 1eda7c59c93f859a47da08e6d0a6e94949c766b5 Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Fri, 21 Feb 2020 16:25:17 +0100 Subject: [PATCH 10/10] Using the last next state's estimated V as bootstrap --- examples/actor_critic_cartpole.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index b316b34..4696ffb 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -34,8 +34,8 @@ def learn_step(self, replay, optimizer): value_loss = [] entropy_loss = []
- # Discount rewards bootstrapping them from the last estimated value
- last_action, last_value = self(replay.state()[-1,:,:])
+ # Discount rewards and bootstrap them with the estimation from the next state
+ last_action, last_value = self(replay.next_state()[-1,:,:])
 # Bootstrap from zero if it is a terminal state
 last_value = (last_value[:, 0]*(1 - replay.done()[-1]))
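The series ends with the discounted rewards bootstrapped from V of the last transition's next_state, zeroed on terminal environments and left attached to the graph. For reference, the following is a minimal, self-contained sketch of that bootstrapped n-step return; the function name, the (steps, num_envs) shapes and the numbers in the check are assumptions for illustration, not the cherry.td.discount implementation.

```python
import torch

def bootstrapped_returns(gamma, rewards, dones, bootstrap):
    # R_t = r_t + gamma * (1 - done_t) * R_{t+1}, seeded with R_T = bootstrap.
    # rewards, dones: (steps, num_envs); bootstrap: (num_envs,).
    R = bootstrap
    returns = torch.zeros_like(rewards)
    for t in reversed(range(rewards.size(0))):
        R = rewards[t] + gamma * (1 - dones[t]) * R
        returns[t] = R
    return returns

# Tiny check (steps=2, num_envs=1): gamma=0.5, rewards=[1, 1], no terminal,
# bootstrap V=4  ->  R_1 = 1 + 0.5*4 = 3,  R_0 = 1 + 0.5*3 = 2.5
rewards = torch.tensor([[1.0], [1.0]])
dones = torch.zeros(2, 1)
print(bootstrapped_returns(0.5, rewards, dones, torch.tensor([4.0])))  # [[2.5], [3.0]]
```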