
Commit b2c05c9

first commit
0 parents  commit b2c05c9

16 files changed: +731 -0 lines changed

.DS_Store (10 KB)

Binary file not shown.

LICENSE (+21)

MIT License

Copyright (c) 2023 Aymen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md (+38)

# Robot arm control with Reinforcement Learning

![anim](https://github.com/kaymen99/Robot-arm-control-with-RL/assets/83681204/224cf960-43d8-4bdc-83be-ac8fe37e5be9)

This project focuses on controlling a 7 DOF robot arm provided by the [panda-gym](https://github.com/qgallouedec/panda-gym) Reacher environment using two continuous reinforcement learning algorithms: DDPG (Deep Deterministic Policy Gradients) and TD3 (Twin Delayed Deep Deterministic Policy Gradients). The Hindsight Experience Replay (HER) technique is used to enhance the learning process of both algorithms.

## Continuous RL Algorithms

<p align="justify">
Continuous reinforcement learning deals with environments where actions are continuous, such as the precise control of robot arm joints or the throttle of an autonomous vehicle. The primary objective is to find policies that effectively map observed states to continuous actions while maximizing the expected cumulative reward. Several algorithms have been developed specifically to address this challenge, including DDPG, TD3, SAC, and PPO.
</p>
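
For instance, the action space of such an environment is a bounded box of real numbers rather than a discrete set of choices. The snippet below is illustrative only; the exact environment id and action dimension depend on the panda-gym version installed:

```python
import gymnasium as gym
import panda_gym  # registers the Panda environments on import

env = gym.make("PandaReach-v3")
print(env.action_space)          # a continuous Box space, e.g. Box(-1.0, 1.0, (n,), float32)
print(env.action_space.sample()) # a real-valued action vector, not a discrete index
```
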

### 1- DDPG (Deep Deterministic Policy Gradients)

<p align="justify">
DDPG is an actor-critic algorithm designed for continuous action spaces. It combines the strengths of policy gradients and Q-learning: an actor network learns the policy, while a critic network approximates the action-value function (Q-function). The actor network directly outputs continuous actions, which are evaluated by the critic network, allowing for fine-grained control.
</p>
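
As a rough sketch of the update this describes (simplified from the full implementation in `agents/ddpg.py`; `mu`, `q`, `mu_target`, `q_target` are placeholder names for the actor, critic, and their target copies, assumed to be callable Keras models, and `dones` is assumed to be a float tensor):

```python
import tensorflow as tf

def ddpg_losses(batch, mu, q, mu_target, q_target, gamma=0.99):
    """Sketch of the DDPG critic and actor objectives."""
    states, actions, rewards, next_states, dones = batch

    # Critic target: r + gamma * Q'(s', mu'(s')), with no bootstrapping on terminal states
    next_q = tf.squeeze(q_target(next_states, mu_target(next_states)), 1)
    y = rewards + gamma * next_q * (1.0 - dones)
    critic_loss = tf.reduce_mean(tf.square(y - tf.squeeze(q(states, actions), 1)))

    # Actor objective: maximize Q(s, mu(s)), i.e. minimize its negative
    actor_loss = -tf.reduce_mean(q(states, mu(states)))
    return critic_loss, actor_loss
```
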

### 2- TD3 (Twin Delayed Deep Deterministic Policy Gradients)

<p align="justify">
TD3 is an enhancement of DDPG that addresses issues such as overestimation bias. It introduces the concept of "twin" critics to estimate the Q-value (two critic networks instead of the single one used in DDPG, with the smaller of the two estimates used for the target), and it delays the actor and target network updates to stabilize training. TD3 is known for its robustness and improved performance over DDPG.
</p>
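
A minimal sketch of the target computation that differs from DDPG is shown below; `mu_target`, `q1_target` and `q2_target` are placeholder names for the target actor and the twin target critics, not the exact classes used in this repo. Besides the twin critics, TD3 also smooths the target action with clipped noise, and the actor (and target networks) are updated only every few critic updates, which is the "delayed" part of the name.

```python
import tensorflow as tf

def td3_critic_target(batch, mu_target, q1_target, q2_target,
                      gamma=0.99, noise_std=0.2, noise_clip=0.5, max_action=1.0):
    """Sketch of the TD3 target: clipped noise on the target action + min of the twin critics."""
    _, _, rewards, next_states, dones = batch

    # Target policy smoothing: perturb the target action with clipped Gaussian noise
    next_actions = mu_target(next_states)
    noise = tf.clip_by_value(tf.random.normal(tf.shape(next_actions), stddev=noise_std),
                             -noise_clip, noise_clip)
    next_actions = tf.clip_by_value(next_actions + noise, -max_action, max_action)

    # Clipped double Q-learning: take the smaller of the two critic estimates
    next_q = tf.minimum(
        tf.squeeze(q1_target(next_states, next_actions), 1),
        tf.squeeze(q2_target(next_states, next_actions), 1))
    return rewards + gamma * next_q * (1.0 - dones)
```
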

## Hindsight Experience Replay

<p align="justify">
Hindsight Experience Replay (HER) is a technique developed to address the challenge of sparse, binary rewards in RL environments. In many robotic tasks, achieving the desired goal is rare, and traditional RL algorithms struggle to learn from such feedback: the agent gets a zero reward unless the robot successfully completes the task, so it cannot tell whether the steps it took were useful or not.
</p>

<p align="justify">
HER tackles this issue by reusing past experiences for learning, even when they did not lead to the desired goal. It relabels experiences (replacing the original goal with one that was actually achieved) before storing them in the replay buffer, allowing the agent to learn from both successful and failed attempts, which significantly accelerates the learning process.
</p>
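
As an illustration, here is a simplified sketch of HER relabeling with the "final" goal-selection strategy; `compute_reward` stands for the environment's goal-conditioned reward function, and the dictionary keys are placeholders, not the exact replay buffer layout used in this repo:

```python
def her_relabel_final(episode, compute_reward):
    """Replay each transition as if the goal had been the state reached at the end of the episode."""
    final_achieved_goal = episode[-1]["achieved_goal"]
    relabeled = []
    for t in episode:
        relabeled.append({
            "obs": t["obs"],
            "action": t["action"],
            # Reward is recomputed with respect to the substituted goal
            "reward": compute_reward(t["achieved_goal"], final_achieved_goal),
            "next_obs": t["next_obs"],
            # Pretend this was the desired goal all along
            "goal": final_achieved_goal,
            "done": t["done"],
        })
    return relabeled
```
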

- You can train a given model simply by running one of the files in the `training` folder.

- You can change the hyperparameters of both algorithms (learning rates alpha/beta, discount factor gamma, ...) directly in each agent class in the `agents` folder. The architecture of the actor/critic networks can be modified in the `networks.py` file.

Link to the HER paper: https://arxiv.org/pdf/1707.01495.pdf

agents/ddpg.py (+144)

import tensorflow as tf
import tensorflow.keras as keras
from replay_memory.ReplayBuffer import ReplayBuffer
from utils.networks import ActorNetwork, CriticNetwork

## Actor-critic network parameters

# actor learning rate
alpha = 0.001

# critic learning rate
beta = 0.002

## DDPG algorithm parameters

# discount factor
gamma = 0.99

# target networks soft update factor
tau = 0.005

# replay buffer max memory size
max_size = 10**6

# exploration noise factor
noise_factor = 0.1

# training batch size
batch_size = 64

## DDPG agent class
class DDPGAgent:
    def __init__(self, env, input_dims):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.noise_factor = noise_factor

        self.env = env
        self.n_actions = env.action_space.shape[0]
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]

        self.memory = ReplayBuffer(max_size, input_dims, self.n_actions)

        self._initialize_networks(self.n_actions)
        # tau=1 hard-copies the online weights into the target networks
        self.update_parameters(tau=1)

    # Choose an action based on the actor network,
    # adding exploration noise if in training mode
    def choose_action(self, state, evaluate=False):
        state = tf.convert_to_tensor([state], dtype=tf.float32)
        actions = self.actor(state)
        if not evaluate:
            actions += tf.random.normal(shape=[self.n_actions], mean=0, stddev=self.noise_factor)
        actions = tf.clip_by_value(actions, self.min_action, self.max_action)
        return actions[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    # Main DDPG learning step
    def learn(self):
        if self.memory.counter < self.batch_size:
            return

        # Sample a batch of experiences from the replay buffer
        states, actions, rewards, new_states, dones = self.memory.sample(self.batch_size)
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.float32)
        rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
        new_states = tf.convert_to_tensor(new_states, dtype=tf.float32)

        # Calculate the critic network loss
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(new_states)
            new_critic_value = tf.squeeze(self.target_critic(new_states, target_actions), 1)
            critic_value = tf.squeeze(self.critic(states, actions), 1)
            # No bootstrapping on terminal transitions
            target = rewards + self.gamma * new_critic_value * (1 - dones)
            critic_loss = tf.keras.losses.MSE(target, critic_value)

        # Apply gradient descent with the calculated critic loss
        critic_network_gradient = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(
            critic_network_gradient, self.critic.trainable_variables
        ))

        # Calculate the actor network loss
        with tf.GradientTape() as tape:
            new_actions = self.actor(states)
            actor_loss = -self.critic(states, new_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

        # Apply gradient descent with the calculated actor loss
        actor_network_gradient = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(
            actor_network_gradient, self.actor.trainable_variables
        ))

        # Update the actor/critic target networks
        self.update_parameters()

    # Update the actor/critic target network parameters with the soft update rule
    def update_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        weights = []
        targets = self.target_actor.weights
        for i, weight in enumerate(self.actor.weights):
            weights.append(tau * weight + (1 - tau) * targets[i])
        self.target_actor.set_weights(weights)

        weights = []
        targets = self.target_critic.weights
        for i, weight in enumerate(self.critic.weights):
            weights.append(tau * weight + (1 - tau) * targets[i])
        self.target_critic.set_weights(weights)

    def save_models(self):
        print("---- saving models ----")
        self.actor.save_weights(self.actor.checkpoints_file)
        self.critic.save_weights(self.critic.checkpoints_file)
        self.target_actor.save_weights(self.target_actor.checkpoints_file)
        self.target_critic.save_weights(self.target_critic.checkpoints_file)

    def load_models(self):
        print("---- loading models ----")
        self.actor.load_weights(self.actor.checkpoints_file)
        self.critic.load_weights(self.critic.checkpoints_file)
        self.target_actor.load_weights(self.target_actor.checkpoints_file)
        self.target_critic.load_weights(self.target_critic.checkpoints_file)

    def _initialize_networks(self, n_actions):
        model = "ddpg"
        self.actor = ActorNetwork(n_actions, name="actor", model=model)
        self.critic = CriticNetwork(name="critic", model=model)
        self.target_actor = ActorNetwork(n_actions, name="target_actor", model=model)
        self.target_critic = CriticNetwork(name="target_critic", model=model)

        self.actor.compile(keras.optimizers.Adam(learning_rate=alpha))
        self.critic.compile(keras.optimizers.Adam(learning_rate=beta))
        self.target_actor.compile(keras.optimizers.Adam(learning_rate=alpha))
        self.target_critic.compile(keras.optimizers.Adam(learning_rate=beta))
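
For context, a minimal training loop around this agent might look like the sketch below; the environment id, the dict-observation flattening, and the `input_dims` format are assumptions, HER relabeling is omitted, and the scripts in the `training` folder remain the reference.

import gymnasium as gym
import panda_gym  # registers the Panda* environments on import
import numpy as np
from agents.ddpg import DDPGAgent

def flatten(obs):
    # Goal-conditioned observations are dicts; feed the policy the observation + desired goal
    return np.concatenate([obs["observation"], obs["desired_goal"]])

# The exact environment id may differ between panda-gym versions
env = gym.make("PandaReach-v3")
obs, _ = env.reset()

# input_dims is assumed here to be the flattened observation shape expected by ReplayBuffer
agent = DDPGAgent(env, input_dims=flatten(obs).shape)

for episode in range(1000):
    obs, _ = env.reset()
    state, done = flatten(obs), False
    while not done:
        action = agent.choose_action(state)
        obs, reward, terminated, truncated, _ = env.step(np.array(action))
        done = terminated or truncated
        next_state = flatten(obs)
        agent.remember(state, action, reward, next_state, done)
        agent.learn()
        state = next_state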
