From 31e8b5051b041c38d9a4606d920f6ccae2cb823a Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Tue, 18 Feb 2020 13:59:45 +0100 Subject: [PATCH 01/10] New A2C example with entropy --- examples/actor_critic_cartpole.py | 192 ++++++++++++++++-------------- 1 file changed, 101 insertions(+), 91 deletions(-) diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 61c217e..4bf8a07 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -1,112 +1,122 @@ #!/usr/bin/env python3 -""" -Simple example of using cherry to solve cartpole with an actor-critic. - -The code is an adaptation of the PyTorch reinforcement learning example. -""" - -import random +import torch +import cherry import gym import numpy as np - from itertools import count -import torch as th -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim - -import cherry.envs as envs -from cherry.td import discount -from cherry import normalize -import cherry.distributions as distributions +SEED = 42 + +class A2C(torch.nn.Module): + def __init__(self): + super(A2C, self).__init__() + + self.gamma = 0.99 + self.vf_coef = 0.5 + self.ent_coef = 0.01 + + def select_action(self, state): + probs, value = self(state) + mass = torch.distributions.Categorical(probs) + action = mass.sample() + # Return selected action, logprob, value estimation and categorical entropy + return action, {"log_prob": mass.log_prob(action), "value": value, "entropy": mass.entropy()} + + + def learn_step(self, replay, optimizer): + policy_loss = [] + value_loss = [] + entropy_loss = [] + + # Discount and normalize rewards + rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done()) + rewards = cherry.normalize(rewards) + + # Value function error (MSE) + value_loss_fn = torch.nn.MSELoss() + for sars, reward in zip(replay, rewards): + log_prob = sars.log_prob + value = sars.value + entropy = sars.entropy + + # Compute advantage + advantage = reward - value.squeeze(0) + + # Compute policy gradient loss + # (advantage.detach() because you do not have to backward on the advantage path) + policy_loss.append(-log_prob * advantage.detach()) + # Compute value estimation loss + value_loss.append(value_loss_fn(value.squeeze(0), reward)) + # Compute entropy loss + entropy_loss.append(entropy) + + # Compute means over the accumulated errors + policy_loss = torch.stack(policy_loss).mean() + value_loss = torch.stack(value_loss).mean() + entropy_loss = torch.stack(entropy_loss).mean() + + # Take an optimization step + optimizer.zero_grad() + loss = policy_loss + self.vf_coef * value_loss - self.ent_coef * entropy_loss + loss.backward() + optimizer.step() + + + +class A2CPolicy(A2C): + def __init__(self, state_size, action_size): + super(A2CPolicy, self).__init__() + self.state_size = state_size + self.action_size = action_size + self.n_hidden = 128 + + # Backbone net + self.net = torch.nn.Sequential( + torch.nn.Linear(self.state_size, self.n_hidden), + torch.nn.LeakyReLU(), + torch.nn.Linear(self.n_hidden, self.n_hidden), + torch.nn.LeakyReLU(), + ) + + # Action head (policy gradient) + self.action_head = torch.nn.Sequential( + torch.nn.Linear(self.n_hidden, self.action_size), + torch.nn.Softmax(dim=1) + ) + + # Value estimation head (A2C) + self.value_head = torch.nn.Sequential( + torch.nn.Linear(self.n_hidden, 1), + ) -SEED = 567 -GAMMA = 0.99 -RENDER = False -V_WEIGHT = 0.5 - -random.seed(SEED) -np.random.seed(SEED) -th.manual_seed(SEED) - - -class 
ActorCriticNet(nn.Module): - def __init__(self, env): - super(ActorCriticNet, self).__init__() - self.affine1 = nn.Linear(env.state_size, 128) - self.action_head = nn.Linear(128, env.action_size) - self.value_head = nn.Linear(128, 1) - self.distribution = distributions.ActionDistribution(env, - use_probs=True) def forward(self, x): - x = F.relu(self.affine1(x)) - action_scores = self.action_head(x) - action_mass = self.distribution(F.softmax(action_scores, dim=1)) - value = self.value_head(x) - return action_mass, value - - -def update(replay, optimizer): - policy_loss = [] - value_loss = [] - - # Discount and normalize rewards - rewards = discount(GAMMA, replay.reward(), replay.done()) - rewards = normalize(rewards) - - # Compute losses - for sars, reward in zip(replay, rewards): - log_prob = sars.log_prob - value = sars.value - policy_loss.append(-log_prob * (reward - value.item())) - value_loss.append(F.mse_loss(value, reward.detach())) - - # Take optimization step - optimizer.zero_grad() - loss = th.stack(policy_loss).sum() + V_WEIGHT * th.stack(value_loss).sum() - loss.backward() - optimizer.step() - - -def get_action_value(state, policy): - mass, value = policy(state) - action = mass.sample() - info = { - 'log_prob': mass.log_prob(action), # Cache log_prob for later - 'value': value - } - return action, info - + # Return both the action probabilities and the value estimations + return self.action_head(self.net(x)), self.value_head(self.net(x)) if __name__ == '__main__': - env = gym.vector.make('CartPole-v0', num_envs=1) - env = envs.Logger(env, interval=1000) - env = envs.Torch(env) - env = envs.Runner(env) + env = gym.make('CartPole-v0') + env = cherry.envs.Logger(env, interval=1000) + env = cherry.envs.Torch(env) + env = cherry.envs.Runner(env) env.seed(SEED) - policy = ActorCriticNet(env) - optimizer = optim.Adam(policy.parameters(), lr=1e-2) - running_reward = 10.0 - get_action = lambda state: get_action_value(state, policy) + policy = A2CPolicy(env.state_size, env.action_size) + optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3) + running_reward = 10 for episode in count(1): - # We use the Runner collector, but could've written our own - replay = env.run(get_action, episodes=1) - - # Update policy - update(replay, optimizer) + replay = env.run(lambda state: policy.select_action(state), episodes=1) + policy.learn_step(replay, optimizer) - # Compute termination criterion - running_reward = running_reward * 0.99 + len(replay) * 0.01 - if episode % 10 == 0: - # Should start with 10.41, 12.21, 14.60, then 100:71.30, 200:135.74 - print(episode, running_reward) + running_reward = running_reward * 0.99 + replay.reward().sum() * 0.01 + if running_reward > 190.0: print('Solved! 
Running reward now {} and ' 'the last episode runs to {} time steps!'.format(running_reward, len(replay))) break + + while True: + env.run(lambda state: policy.select_action(state), episodes=1, render=True) From 819943eee88f35e2734e8328212887784e2ff3fe Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Wed, 19 Feb 2020 12:42:26 +0100 Subject: [PATCH 02/10] Removed rewards normalization --- examples/actor_critic_cartpole.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 4bf8a07..a84ab28 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -29,9 +29,8 @@ def learn_step(self, replay, optimizer): value_loss = [] entropy_loss = [] - # Discount and normalize rewards + # Discount rewards rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done()) - rewards = cherry.normalize(rewards) # Value function error (MSE) value_loss_fn = torch.nn.MSELoss() @@ -107,11 +106,11 @@ def forward(self, x): running_reward = 10 for episode in count(1): - replay = env.run(lambda state: policy.select_action(state), episodes=1) + replay = env.run(lambda state: policy.select_action(state), steps=5) policy.learn_step(replay, optimizer) running_reward = running_reward * 0.99 + replay.reward().sum() * 0.01 - + if episode % 10 == 0: print('Running reward: {}'.format(running_reward)) if running_reward > 190.0: print('Solved! Running reward now {} and ' 'the last episode runs to {} time steps!'.format(running_reward, From 85981d089287a8f382da9386dbb9ce6a11a2142e Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Wed, 19 Feb 2020 14:23:25 +0100 Subject: [PATCH 03/10] Use multiple environments --- examples/actor_critic_cartpole.py | 57 ++++++++++++++++--------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index a84ab28..0498ee5 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -5,16 +5,21 @@ import gym import numpy as np from itertools import count +import statistics -SEED = 42 +NUM_ENVS = 4 +STEPS = 5 +TRAIN_STEPS = int(1e4) class A2C(torch.nn.Module): - def __init__(self): + def __init__(self, num_envs): super(A2C, self).__init__() + self.num_envs = num_envs self.gamma = 0.99 - self.vf_coef = 0.5 + self.vf_coef = 0.25 self.ent_coef = 0.01 + self.max_clip_norm = 0.5 def select_action(self, state): probs, value = self(state) @@ -32,40 +37,43 @@ def learn_step(self, replay, optimizer): # Discount rewards rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done()) - # Value function error (MSE) - value_loss_fn = torch.nn.MSELoss() for sars, reward in zip(replay, rewards): - log_prob = sars.log_prob - value = sars.value - entropy = sars.entropy + log_prob = sars.log_prob.view(self.num_envs, -1) + value = sars.value.view(self.num_envs, -1) + entropy = sars.entropy.view(self.num_envs, -1) + reward = reward.view(self.num_envs, -1) # Compute advantage - advantage = reward - value.squeeze(0) + advantage = reward - value # Compute policy gradient loss # (advantage.detach() because you do not have to backward on the advantage path) policy_loss.append(-log_prob * advantage.detach()) # Compute value estimation loss - value_loss.append(value_loss_fn(value.squeeze(0), reward)) + value_loss.append((reward - value)**2) # Compute entropy loss entropy_loss.append(entropy) - # Compute means over the accumulated errors - policy_loss = 
torch.stack(policy_loss).mean() + + # Compute means over accumulated errors value_loss = torch.stack(value_loss).mean() + policy_loss = torch.stack(policy_loss).mean() entropy_loss = torch.stack(entropy_loss).mean() # Take an optimization step optimizer.zero_grad() loss = policy_loss + self.vf_coef * value_loss - self.ent_coef * entropy_loss loss.backward() + # Clip gradients + torch.nn.utils.clip_grad_norm_(self.parameters(), self.max_clip_norm) optimizer.step() + class A2CPolicy(A2C): - def __init__(self, state_size, action_size): - super(A2CPolicy, self).__init__() + def __init__(self, state_size, action_size, num_envs): + super(A2CPolicy, self).__init__(num_envs) self.state_size = state_size self.action_size = action_size self.n_hidden = 128 @@ -95,27 +103,20 @@ def forward(self, x): return self.action_head(self.net(x)), self.value_head(self.net(x)) if __name__ == '__main__': - env = gym.make('CartPole-v0') + env = gym.vector.make('CartPole-v0', num_envs=NUM_ENVS) env = cherry.envs.Logger(env, interval=1000) env = cherry.envs.Torch(env) env = cherry.envs.Runner(env) - env.seed(SEED) - policy = A2CPolicy(env.state_size, env.action_size) + policy = A2CPolicy(env.state_size, env.action_size, NUM_ENVS) optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3) - running_reward = 10 - for episode in count(1): - replay = env.run(lambda state: policy.select_action(state), steps=5) + for step in range(0, TRAIN_STEPS): + replay = env.run(lambda state: policy.select_action(state), steps=STEPS) policy.learn_step(replay, optimizer) - - running_reward = running_reward * 0.99 + replay.reward().sum() * 0.01 - if episode % 10 == 0: print('Running reward: {}'.format(running_reward)) - if running_reward > 190.0: - print('Solved! Running reward now {} and ' - 'the last episode runs to {} time steps!'.format(running_reward, - len(replay))) - break + env = gym.make('CartPole-v0') + env = cherry.envs.Torch(env) + env = cherry.envs.Runner(env) while True: env.run(lambda state: policy.select_action(state), episodes=1, render=True) From 93076adf672d36ff51107fd11aef1a1f0335f25f Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Wed, 19 Feb 2020 16:42:33 +0100 Subject: [PATCH 04/10] Without Runner wrapper and with RMSprop optimizer --- examples/actor_critic_cartpole.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 0498ee5..625aaa9 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -106,13 +106,19 @@ def forward(self, x): env = gym.vector.make('CartPole-v0', num_envs=NUM_ENVS) env = cherry.envs.Logger(env, interval=1000) env = cherry.envs.Torch(env) - env = cherry.envs.Runner(env) policy = A2CPolicy(env.state_size, env.action_size, NUM_ENVS) - optimizer = torch.optim.Adam(policy.parameters(), lr=1e-3) - - for step in range(0, TRAIN_STEPS): - replay = env.run(lambda state: policy.select_action(state), steps=STEPS) + optimizer = torch.optim.RMSprop(policy.parameters(), lr=7e-4, eps=1e-5, alpha=0.99) + + state = env.reset() + for train_step in range(0, TRAIN_STEPS): + replay = cherry.ExperienceReplay() + for step in range(0, STEPS): + action, info = policy.select_action(state) + new_state, reward, done, _ = env.step(action) + replay.append(state, action, reward, new_state, done, **info) + state = new_state + policy.learn_step(replay, optimizer) env = gym.make('CartPole-v0') From 1f7ce5218d4cf8c5c5a2f1e3dbbb98cd049a6823 Mon Sep 17 00:00:00 2001 From: Federico 
Galatolo Date: Thu, 20 Feb 2020 16:36:03 +0100 Subject: [PATCH 05/10] Bootstrapping rewards as shown in Algorithm S3 in the A3C paper --- examples/actor_critic_cartpole.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 625aaa9..86fad78 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -34,8 +34,12 @@ def learn_step(self, replay, optimizer): value_loss = [] entropy_loss = []
- # Discount rewards
- rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done())
+ # Discount rewards bootstrapping them from the last estimated value
+ last_action, last_value = self(replay.state()[0,:,:])
+ # Bootstrap from zero if it is a terminal state
+ last_value = last_value*(1 - replay.done()[0])
+
+ rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done(), last_value)
 for sars, reward in zip(replay, rewards): log_prob = sars.log_prob.view(self.num_envs, -1)

From 9ab728fd02ac12c3124511524da4a173d2c83def Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Thu, 20 Feb 2020 18:35:07 +0100 Subject: [PATCH 06/10] Bugfix: use the last state for rewards bootstrapping --- examples/actor_critic_cartpole.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 86fad78..74b3a02 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -35,9 +35,9 @@ def learn_step(self, replay, optimizer): entropy_loss = []
 # Discount rewards bootstrapping them from the last estimated value
- last_action, last_value = self(replay.state()[0,:,:])
+ last_action, last_value = self(replay.state()[-1,:,:])
 # Bootstrap from zero if it is a terminal state
- last_value = last_value*(1 - replay.done()[0])
+ last_value = last_value*(1 - replay.done()[-1])
 rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done(), last_value)

From 0663ef0d7b4bf32a63883ba9021c2267c4d06f94 Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Fri, 21 Feb 2020 13:22:43 +0100 Subject: [PATCH 07/10] Bugfix: do not backward through the bootstrap path --- examples/actor_critic_cartpole.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 74b3a02..5ad762e 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -37,7 +37,7 @@ def learn_step(self, replay, optimizer): # Discount rewards bootstrapping them from the last estimated value last_action, last_value = self(replay.state()[-1,:,:]) # Bootstrap from zero if it is a terminal state
- last_value = last_value*(1 - replay.done()[-1])
+ last_value = (last_value*(1 - replay.done()[-1])).detach()
 rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done(), last_value)
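Note on patches 05-07: the n-step return is seeded with the critic's value of the last collected state and then detached, so no gradient flows back through the bootstrap term; patch 09 below reverts the detach, following Algorithm S3 of the A3C paper. The sketch below only illustrates that difference in plain PyTorch; the tensor shapes and variable names are invented for the example and are not cherry's API.

```python
import torch

# Toy critic head; shapes are made up (num_envs=6, state_size=4), not cherry's.
value_head = torch.nn.Linear(4, 1)
last_state = torch.randn(6, 4)
last_done = torch.tensor([0., 1., 0., 0., 1., 0.])

last_value = value_head(last_state)[:, 0]   # V(s_last), shape (num_envs,)
bootstrap = last_value * (1 - last_done)    # zero where the episode terminated

# Patch 07: detach() cuts the graph, so the n-step return carries no gradient
# back into the critic through the bootstrap term.
returns_detached = torch.ones(6) + 0.99 * bootstrap.detach()

# Patch 09's revert keeps the graph, so the return also backpropagates
# into V(s_last), as in Algorithm S3 of the A3C paper.
returns_attached = torch.ones(6) + 0.99 * bootstrap

print(returns_detached.requires_grad, returns_attached.requires_grad)  # False True
```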
From 04651329abc2781b47d4b2836ab2ae9bc949c76e Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Fri, 21 Feb 2020 14:22:09 +0100 Subject: [PATCH 08/10] Bootstrap tensor shape fix --- examples/actor_critic_cartpole.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 5ad762e..76678a9 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -7,7 +7,7 @@ from itertools import count import statistics
-NUM_ENVS = 4
+NUM_ENVS = 6
 STEPS = 5 TRAIN_STEPS = int(1e4)
@@ -37,10 +37,9 @@ def learn_step(self, replay, optimizer): # Discount rewards bootstrapping them from the last estimated value last_action, last_value = self(replay.state()[-1,:,:]) # Bootstrap from zero if it is a terminal state
- last_value = (last_value*(1 - replay.done()[-1])).detach()
+ last_value = (last_value[:, 0]*(1 - replay.done()[-1])).detach()
 rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done(), last_value)
-
 for sars, reward in zip(replay, rewards): log_prob = sars.log_prob.view(self.num_envs, -1) value = sars.value.view(self.num_envs, -1)

From 540b338922a616164adf6bef0c18ec237b2c66e9 Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Fri, 21 Feb 2020 14:53:42 +0100 Subject: [PATCH 09/10] Revert: according to the A3C paper Appendix Algorithm S3 we should backward through the bootstrap path --- cherry/td.py | 2 +- examples/actor_critic_cartpole.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/cherry/td.py b/cherry/td.py index 03338d9..c62a64c 100644 --- a/cherry/td.py +++ b/cherry/td.py @@ -52,7 +52,7 @@ def discount(gamma, rewards, dones, bootstrap=0.0): msg = 'dones and rewards must have equal length.' assert rewards.size(0) == dones.size(0), msg
- R = th.zeros_like(rewards[0]) + bootstrap
+ R = th.zeros_like(rewards) + bootstrap
 discounted = th.zeros_like(rewards) length = discounted.size(0) for t in reversed(range(length)):
diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index 76678a9..b316b34 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -37,7 +37,7 @@ def learn_step(self, replay, optimizer): # Discount rewards bootstrapping them from the last estimated value last_action, last_value = self(replay.state()[-1,:,:]) # Bootstrap from zero if it is a terminal state
- last_value = (last_value[:, 0]*(1 - replay.done()[-1])).detach()
+ last_value = (last_value[:, 0]*(1 - replay.done()[-1]))
 rewards = cherry.td.discount(self.gamma, replay.reward(), replay.done(), last_value) for sars, reward in zip(replay, rewards):

From 1eda7c59c93f859a47da08e6d0a6e94949c766b5 Mon Sep 17 00:00:00 2001 From: Federico Galatolo Date: Fri, 21 Feb 2020 16:25:17 +0100 Subject: [PATCH 10/10] Using the last next state's estimated V as bootstrap --- examples/actor_critic_cartpole.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/actor_critic_cartpole.py b/examples/actor_critic_cartpole.py index b316b34..4696ffb 100644 --- a/examples/actor_critic_cartpole.py +++ b/examples/actor_critic_cartpole.py @@ -34,8 +34,8 @@ def learn_step(self, replay, optimizer): value_loss = [] entropy_loss = []
- # Discount rewards bootstrapping them from the last estimated value
- last_action, last_value = self(replay.state()[-1,:,:])
+ # Discount rewards and bootstrap them with the estimation from the next state
+ last_action, last_value = self(replay.next_state()[-1,:,:])
 # Bootstrap from zero if it is a terminal state
 last_value = (last_value[:, 0]*(1 - replay.done()[-1]))
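The series ends with the discounted rewards bootstrapped from V of the last transition's next_state, zeroed on terminal environments and left attached to the graph. For reference, the following is a minimal, self-contained sketch of that bootstrapped n-step return; the function name, the (steps, num_envs) shapes and the numbers in the check are assumptions for illustration, not the cherry.td.discount implementation.

```python
import torch

def bootstrapped_returns(gamma, rewards, dones, bootstrap):
    # R_t = r_t + gamma * (1 - done_t) * R_{t+1}, seeded with R_T = bootstrap.
    # rewards, dones: (steps, num_envs); bootstrap: (num_envs,).
    R = bootstrap
    returns = torch.zeros_like(rewards)
    for t in reversed(range(rewards.size(0))):
        R = rewards[t] + gamma * (1 - dones[t]) * R
        returns[t] = R
    return returns

# Tiny check (steps=2, num_envs=1): gamma=0.5, rewards=[1, 1], no terminal,
# bootstrap V=4  ->  R_1 = 1 + 0.5*4 = 3,  R_0 = 1 + 0.5*3 = 2.5
rewards = torch.tensor([[1.0], [1.0]])
dones = torch.zeros(2, 1)
print(bootstrapped_returns(0.5, rewards, dones, torch.tensor([4.0])))  # [[2.5], [3.0]]
```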