From 4cfc3eb0edb893f2645c374ae346e942ffcfa6ac Mon Sep 17 00:00:00 2001
From: Muhammad Anas Raza <63569834+anas-rz@users.noreply.github.com>
Date: Wed, 19 Jul 2023 16:03:02 -0400
Subject: [PATCH] port actor_critic_cartpole to keras core (#542)

---
 .../tensorflow/rl/actor_critic_cartpole.py | 189 ++++++++++++++++++
 1 file changed, 189 insertions(+)
 create mode 100644 examples/keras_io/tensorflow/rl/actor_critic_cartpole.py

diff --git a/examples/keras_io/tensorflow/rl/actor_critic_cartpole.py b/examples/keras_io/tensorflow/rl/actor_critic_cartpole.py
new file mode 100644
index 000000000..929e8a847
--- /dev/null
+++ b/examples/keras_io/tensorflow/rl/actor_critic_cartpole.py
@@ -0,0 +1,189 @@
"""
Title: Actor Critic Method
Author: [Apoorv Nandan](https://twitter.com/NandanApoorv)
Converted to Keras Core by: [Muhammad Anas Raza](https://anasrz.com)
Date created: 2020/05/13
Last modified: 2023/07/19
Description: Implement the Actor Critic method in the CartPole environment.
Accelerator: NONE
"""
"""
## Introduction

This script shows an implementation of the Actor Critic method on the CartPole-v1 environment.

### Actor Critic Method

As an agent takes actions and moves through an environment, it learns to map
the observed state of the environment to two possible outputs:

1. Recommended action: A probability value for each action in the action space.
   The part of the agent responsible for this output is called the **actor**.
2. Estimated rewards in the future: The sum of all rewards it expects to receive in
   the future. The part of the agent responsible for this output is the **critic**.

The agent and the critic learn to perform their tasks such that the recommended
actions from the actor maximize the rewards.

### CartPole-v1

A pole is attached to a cart placed on a frictionless track. The agent has to apply
force to move the cart. It is rewarded for every time step the pole
remains upright. The agent, therefore, must learn to keep the pole from falling over.

### References

- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
- [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)
"""
"""
## Setup
"""

import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras_core as keras
from keras_core import layers

import gym
import numpy as np
import tensorflow as tf


# Configuration parameters for the whole setup
seed = 42
gamma = 0.99  # Discount factor for past rewards
max_steps_per_episode = 10000
env = gym.make("CartPole-v1", new_step_api=True)  # Create the environment
env.reset(seed=seed)
eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0

"""
## Implement the Actor Critic network

This network learns two functions:

1. Actor: This takes as input the state of our environment and returns a
probability value for each action in its action space.
2. Critic: This takes as input the state of our environment and returns
an estimate of the total rewards in the future.

In our implementation, they share the initial layer.
"""

num_inputs = 4
num_actions = 2
num_hidden = 128

inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden, activation="relu")(inputs)
action = layers.Dense(num_actions, activation="softmax")(common)
critic = layers.Dense(1)(common)

model = keras.Model(inputs=inputs, outputs=[action, critic])

"""
## Train
"""

optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0

while True:  # Run until solved
    state = env.reset()
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            # env.render()  # Adding this line would show the attempts
            # of the agent in a pop-up window.

            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)

            # Predict action probabilities and estimated future rewards
            # from the environment state
            action_probs, critic_value = model(state)
            critic_value_history.append(critic_value[0, 0])

            # Sample an action from the predicted action probability distribution
            action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            action_probs_history.append(tf.math.log(action_probs[0, action]))

            # Apply the sampled action in our environment
            state, reward, done, _, _ = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break

        # Update the running reward to check the condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Calculate the expected value from rewards
        # - At each timestep, what was the total reward received after that timestep?
        # - Rewards in the past are discounted by multiplying them with gamma
        # - These are the labels for our critic
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        # Calculate the loss values to update our network
        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            # At this point in history, the critic estimated that we would get a
            # total reward = `value` in the future. We took an action with log
            # probability `log_prob` and ended up receiving a total reward = `ret`.
            # The actor must be updated so that it predicts an action that leads to
            # high rewards (compared to the critic's estimate) with high probability.
            diff = ret - value
            actor_losses.append(-log_prob * diff)  # actor loss

            # The critic must be updated so that it predicts a better estimate of
            # the future rewards.
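            # The Huber loss (quadratic for small errors, linear for large ones)
            # makes this update less sensitive to occasional outlier returns than
            # a plain squared error would be.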
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        # Backpropagation
        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))

    # Condition to consider the task solved (195 is the CartPole-v0 threshold;
    # CartPole-v1 officially uses 475)
    if running_reward > 195:
        print("Solved at episode {}!".format(episode_count))
        break
"""
## Visualizations

In early stages of training:

![Imgur](https://i.imgur.com/5gCs5kH.gif)

In later stages of training:

![Imgur](https://i.imgur.com/5ziiZUD.gif)
"""
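"""
## Watching the trained agent

As an optional final step, the short sketch below shows one way you might roll the
trained policy out greedily, reusing the `env`, `model`, and `max_steps_per_episode`
objects defined above. It simply picks the most probable action at each step; whether
`env.render()` opens a window depends on your `gym` version and display setup, so it
is left commented out.
"""

state = env.reset()
for _ in range(max_steps_per_episode):
    # env.render()  # Uncomment to visualize, if your setup supports it
    state = tf.convert_to_tensor(state)
    state = tf.expand_dims(state, 0)

    # Take the most probable action instead of sampling
    action_probs, _ = model(state)
    action = int(np.argmax(np.squeeze(action_probs)))

    state, reward, done, _, _ = env.step(action)
    if done:
        break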