From 4cfc3eb0edb893f2645c374ae346e942ffcfa6ac Mon Sep 17 00:00:00 2001
From: Muhammad Anas Raza <63569834+anas-rz@users.noreply.github.com>
Date: Wed, 19 Jul 2023 16:03:02 -0400
Subject: [PATCH] port actor_critic_cartpole to keras core (#542)

---
 .../tensorflow/rl/actor_critic_cartpole.py | 189 ++++++++++++++++++
 1 file changed, 189 insertions(+)
 create mode 100644 examples/keras_io/tensorflow/rl/actor_critic_cartpole.py

diff --git a/examples/keras_io/tensorflow/rl/actor_critic_cartpole.py b/examples/keras_io/tensorflow/rl/actor_critic_cartpole.py
new file mode 100644
index 000000000..929e8a847
--- /dev/null
+++ b/examples/keras_io/tensorflow/rl/actor_critic_cartpole.py
@@ -0,0 +1,189 @@
"""
Title: Actor Critic Method
Author: [Apoorv Nandan](https://twitter.com/NandanApoorv)
Converted to Keras Core by: [Muhammad Anas Raza](https://anasrz.com)
Date created: 2020/05/13
Last modified: 2023/07/19
Description: Implement the Actor Critic method in the CartPole environment.
Accelerator: NONE
"""
"""
## Introduction

This script shows an implementation of the Actor Critic method on the CartPole-v1 environment.

### Actor Critic Method

As an agent takes actions and moves through an environment, it learns to map
the observed state of the environment to two possible outputs:

1. Recommended action: A probability value for each action in the action space.
   The part of the agent responsible for this output is called the **actor**.
2. Estimated rewards in the future: The sum of all rewards it expects to receive in
   the future. The part of the agent responsible for this output is the **critic**.

The agent and the critic learn to perform their tasks such that the recommended
actions from the actor maximize the rewards.

### CartPole-v1

A pole is attached to a cart placed on a frictionless track. The agent has to apply
force to move the cart. It is rewarded for every time step the pole
remains upright. The agent, therefore, must learn to keep the pole from falling over.

### References

- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
- [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)
"""
"""
## Setup
"""

import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras_core as keras
from keras_core import layers

import gym
import numpy as np
import tensorflow as tf


# Configuration parameters for the whole setup
seed = 42
gamma = 0.99  # Discount factor for past rewards
max_steps_per_episode = 10000
env = gym.make("CartPole-v1", new_step_api=True)  # Create the environment
env.reset(seed=seed)
eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0

"""
## Implement the Actor Critic network

This network learns two functions:

1. Actor: This takes as input the state of our environment and returns a
probability value for each action in its action space.
2. Critic: This takes as input the state of our environment and returns
an estimate of the total rewards in the future.

In our implementation, they share the initial layer.
"""

num_inputs = 4
num_actions = 2
num_hidden = 128

inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden, activation="relu")(inputs)
action = layers.Dense(num_actions, activation="softmax")(common)
critic = layers.Dense(1)(common)

model = keras.Model(inputs=inputs, outputs=[action, critic])

"""
## Train
"""

optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0

while True:  # Run until solved
    state = env.reset()
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(1, max_steps_per_episode):
            # env.render()  # Adding this line would show the attempts
            # of the agent in a pop-up window.

            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)

            # Predict action probabilities and estimated future rewards
            # from the environment state
            action_probs, critic_value = model(state)
            critic_value_history.append(critic_value[0, 0])

            # Sample an action from the predicted action probability distribution
            action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            action_probs_history.append(tf.math.log(action_probs[0, action]))

            # Apply the sampled action in our environment
            state, reward, done, _, _ = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break

        # Update the running reward to check the condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Calculate the expected value from rewards
        # - At each timestep, what was the total reward received after that timestep?
        # - Rewards in the past are discounted by multiplying them with gamma
        # - These are the labels for our critic
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + gamma * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        # Calculate the loss values to update our network
        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            # At this point in history, the critic estimated that we would get a
            # total reward = `value` in the future. We took an action with log
            # probability `log_prob` and ended up receiving a total reward = `ret`.
            # The actor must be updated so that it predicts an action that leads to
            # high rewards (compared to the critic's estimate) with high probability.
            diff = ret - value
            actor_losses.append(-log_prob * diff)  # actor loss

            # The critic must be updated so that it predicts a better estimate of
            # the future rewards.
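            # The Huber loss (quadratic for small errors, linear for large ones)
            # makes this update less sensitive to occasional outlier returns than
            # a plain squared error would be.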
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        # Backpropagation
        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count % 10 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(template.format(running_reward, episode_count))

    # Condition to consider the task solved (195 is the CartPole-v0 threshold;
    # CartPole-v1 officially uses 475)
    if running_reward > 195:
        print("Solved at episode {}!".format(episode_count))
        break
"""
## Visualizations

In early stages of training:

![Imgur](https://i.imgur.com/5gCs5kH.gif)

In later stages of training:

![Imgur](https://i.imgur.com/5ziiZUD.gif)
"""
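"""
## Watching the trained agent

As an optional final step, the short sketch below shows one way you might roll the
trained policy out greedily, reusing the `env`, `model`, and `max_steps_per_episode`
objects defined above. It simply picks the most probable action at each step; whether
`env.render()` opens a window depends on your `gym` version and display setup, so it
is left commented out.
"""

state = env.reset()
for _ in range(max_steps_per_episode):
    # env.render()  # Uncomment to visualize, if your setup supports it
    state = tf.convert_to_tensor(state)
    state = tf.expand_dims(state, 0)

    # Take the most probable action instead of sampling
    action_probs, _ = model(state)
    action = int(np.argmax(np.squeeze(action_probs)))

    state, reward, done, _, _ = env.step(action)
    if done:
        break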