From 379afd7c6730b9729fb9d388be90d0b64143551f Mon Sep 17 00:00:00 2001
From: Anastasia Psarou <97515093+AnastasiaPsarou@users.noreply.github.com>
Date: Fri, 8 Nov 2024 15:37:00 +0100
Subject: [PATCH] params
---
tutorials/MLinPL/iql_mutation.ipynb | 9 +-
tutorials/MLinPL/mappo_ippo_mutation.ipynb | 1156 ++++++++++++++++++++
tutorials/MLinPL/qmix_mutation.ipynb | 9 +-
tutorials/MLinPL/vdn_mutation.ipynb | 9 +-
tutorials/PettingZooEnv/params_main.json | 15 +-
5 files changed, 1169 insertions(+), 29 deletions(-)
create mode 100644 tutorials/MLinPL/mappo_ippo_mutation.ipynb
diff --git a/tutorials/MLinPL/iql_mutation.ipynb b/tutorials/MLinPL/iql_mutation.ipynb
index 1ba0f1e63..ac7aa98d6 100644
--- a/tutorials/MLinPL/iql_mutation.ipynb
+++ b/tutorials/MLinPL/iql_mutation.ipynb
@@ -103,7 +103,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
@@ -139,12 +139,7 @@
"max_grad_norm = 1.0 # Maximum norm for the gradients\n",
"memory_size = 1000 # Size of the replay buffer\n",
"tau = 0.005\n",
- "\n",
- "# PPO\n",
- "clip_epsilon = 0.2 # clip value for PPO loss\n",
- "gamma = 0.99 # discount factor\n",
- "lmbda = 0.9 # lambda for generalised advantage estimation\n",
- "entropy_eps = 1e-4 # coefficient of the entropy term in the PPO loss"
+ "gamma = 0.99 # discount factor"
]
},
{
diff --git a/tutorials/MLinPL/mappo_ippo_mutation.ipynb b/tutorials/MLinPL/mappo_ippo_mutation.ipynb
new file mode 100644
index 000000000..cf4281e8d
--- /dev/null
+++ b/tutorials/MLinPL/mappo_ippo_mutation.ipynb
@@ -0,0 +1,1156 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# MAPPO - IPPO algorithms implementation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> In this notebook, we implement two state-of-the-art Multi Agent Reinforcement Leaning (MARL) algorithms **Multi-Agent Proximal Policy Optimization [MAPPO](https://arxiv.org/pdf/2103.01955)** and **Independent Proximal Policy Optimization [IPPO](https://arxiv.org/pdf/2011.09533)** in our environment. \n",
+ "\n",
+ "\n",
+ "> Tutorial based on [Multi-Agent Reinforcement Learning (PPO) with TorchRL Tutorial](https://pytorch.org/rl/stable/tutorials/multiagent_ppo.html)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Simulation overview"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> We simulate our environment with an initial population of **20 human agents**. These agents navigate the environment and eventually converge on the fastest path. After this convergence, we will transition **10 of these human agents** into **machine agents**, specifically autonomous vehicles (AVs), which will then employ either the MAPPO or IPPO reinforcement learning algorithms to further refine their learning."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "![Alt text](../../docs/img/env.png)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Imported libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import sys\n",
+ "import torch\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "from tensordict.nn import TensorDictModule\n",
+ "from torchrl.collectors import SyncDataCollector\n",
+ "from torch.distributions import Categorical\n",
+ "from torchrl.envs.libs.pettingzoo import PettingZooWrapper\n",
+ "from torchrl.envs.transforms import TransformedEnv, RewardSum\n",
+ "from torchrl.envs.utils import check_env_specs\n",
+ "from torchrl.data.replay_buffers import ReplayBuffer\n",
+ "from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement\n",
+ "from torchrl.data.replay_buffers.storages import LazyTensorStorage\n",
+ "from torchrl.modules import MultiAgentMLP, ProbabilisticActor\n",
+ "from torchrl.objectives.value import GAE\n",
+ "from torchrl.objectives import ClipPPOLoss, ValueEstimators\n",
+ "\n",
+ "\n",
+ "sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../')))\n",
+ "\n",
+ "from RouteRL.keychain import Keychain as kc\n",
+ "from RouteRL.environment.environment import TrafficEnvironment\n",
+ "from RouteRL.services.plotter import Plotter\n",
+ "from RouteRL.utilities import get_params\n",
+ "\n",
+ "os.environ[\"KMP_DUPLICATE_LIB_OK\"]=\"TRUE\"\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Hyperparameters setting"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "params = get_params(\"params.json\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "device is: cpu\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Devices\n",
+ "device = (\n",
+ " torch.device(0)\n",
+ " if torch.cuda.is_available()\n",
+ " else torch.device(\"cpu\")\n",
+ ")\n",
+ "\n",
+ "print(\"device is: \", device)\n",
+ "vmas_device = device # The device where the simulator is run\n",
+ "\n",
+ "machine_agents = params[\"agent_generation_parameters\"][\"new_machines_after_mutation\"]\n",
+ "\n",
+ "# Sampling\n",
+ "frames_per_batch = machine_agents * 4 # Number of team frames collected per training iteration\n",
+ "n_iters = 30 # Number of sampling and training iterations - the episodes the plotter plots\n",
+ "total_frames = frames_per_batch * n_iters\n",
+ "\n",
+ "# Training\n",
+ "num_epochs = 10 # Number of optimization steps per training iteration\n",
+ "minibatch_size = 2 # Size of the mini-batches in each optimization step\n",
+ "lr = 3e-4 # Learning rate\n",
+ "max_grad_norm = 1.0 # Maximum norm for the gradients\n",
+ "\n",
+ "# PPO\n",
+ "clip_epsilon = 0.2 # clip value for PPO loss\n",
+ "gamma = 0.99 # discount factor\n",
+ "lmbda = 0.9 # lambda for generalised advantage estimation\n",
+ "entropy_eps = 1e-4 # coefficient of the entropy term in the PPO loss"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Environment initialization"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> In this example, the environment initially contains only human agents."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[CONFIRMED] Environment variable exists: SUMO_HOME\n",
+ "[SUCCESS] Added module directory: C:\\Program Files (x86)\\Eclipse\\Sumo\\tools\n"
+ ]
+ }
+ ],
+ "source": [
+ "env = TrafficEnvironment(params[kc.RUNNER], params[kc.ENVIRONMENT], params[kc.SIMULATOR], params[kc.AGENT_GEN], params[kc.AGENTS], params[kc.PLOTTER])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of total agents is: 20 \n",
+ "\n",
+ "Agents are: [Human 0, Human 1, Human 2, Human 3, Human 4, Human 5, Human 6, Human 7, Human 8, Human 9, Human 10, Human 11, Human 12, Human 13, Human 14, Human 15, Human 16, Human 17, Human 18, Human 19] \n",
+ "\n",
+ "Number of human agents is: 20 \n",
+ "\n",
+ "Number of machine agents (autonomous vehicles) is: 0 \n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Number of total agents is: \", len(env.all_agents), \"\\n\")\n",
+ "print(\"Agents are: \", env.all_agents, \"\\n\")\n",
+ "print(\"Number of human agents is: \", len(env.human_agents), \"\\n\")\n",
+ "print(\"Number of machine agents (autonomous vehicles) is: \", len(env.machine_agents), \"\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> Reset the environment and the connection with SUMO"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "({}, {})"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "env.start()\n",
+ "env.reset()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Human learning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "num_episodes = 100\n",
+ "\n",
+ "for episode in range(num_episodes):\n",
+ " env.step()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Mutation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> **Mutation**: a portion of human agents are converted into machine agents (autonomous vehicles). You can adjust the number of agents to be mutated in the /params.json
file."
+ ]
+ },
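+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> Optional sanity check (illustration only): print the mutation-related entries of the loaded `params`. The key names below follow `agent_generation_parameters` as used later in this notebook and in this repo's params files; your `params.json` may differ."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustration only: inspect the mutation-related parameters.\n",
+    "# Key names are assumptions based on this repo's params files.\n",
+    "agent_gen = params[\"agent_generation_parameters\"]\n",
+    "print(\"new_machines_after_mutation:\", agent_gen.get(\"new_machines_after_mutation\"))\n",
+    "print(\"ratio_mutating:\", agent_gen.get(\"ratio_mutating\"))"
+   ]
+  },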
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "env.mutation()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of total agents is: 20 \n",
+ "\n",
+ "Agents are: [Machine 13, Machine 11, Machine 10, Machine 6, Machine 19, Machine 8, Machine 12, Machine 18, Machine 7, Machine 15, Human 0, Human 1, Human 2, Human 3, Human 4, Human 5, Human 9, Human 14, Human 16, Human 17] \n",
+ "\n",
+ "Number of human agents is: 10 \n",
+ "\n",
+ "Number of machine agents (autonomous vehicles) is: 10 \n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Number of total agents is: \", len(env.all_agents), \"\\n\")\n",
+ "print(\"Agents are: \", env.all_agents, \"\\n\")\n",
+ "print(\"Number of human agents is: \", len(env.human_agents), \"\\n\")\n",
+ "print(\"Number of machine agents (autonomous vehicles) is: \", len(env.machine_agents), \"\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> Create a group that contains all the machine (RL) agents.\n",
+ "\n",
+ "> **Hint:** the agents aren't competely independent in this example."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "machine_list = []\n",
+ "for machines in env.machine_agents:\n",
+ " machine_list.append(str(machines.id))\n",
+ " \n",
+ "group = {'agents': machine_list}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### PettingZoo environment wrapper"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "env = PettingZooWrapper(\n",
+ " env=env,\n",
+ " use_mask=True,\n",
+ " categorical_actions=True,\n",
+ " done_on_any = False,\n",
+ " group_map=group,\n",
+ " device=device\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> The environment is defined by a series of metadata that describe what can be expected during its execution. \n",
+ "\n",
+ "There are four specs to look at:\n",
+ "\n",
+ "- action_spec
defines the action space;\n",
+ "\n",
+ "- reward_spec
defines the reward domain;\n",
+ "\n",
+ "- done_spec
defines the done domain;\n",
+ "\n",
+ "- observation_spec
which defines the domain of all other outputs from environment steps;"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "action_spec: CompositeSpec(\n",
+ " agents: CompositeSpec(\n",
+ " action: DiscreteTensorSpec(\n",
+ " shape=torch.Size([10]),\n",
+ " space=DiscreteBox(n=2),\n",
+ " device=cpu,\n",
+ " dtype=torch.int64,\n",
+ " domain=discrete), device=cpu, shape=torch.Size([10])), device=cpu, shape=torch.Size([])) \n",
+ "\n",
+ "\n",
+ "reward_spec: CompositeSpec(\n",
+ " agents: CompositeSpec(\n",
+ " reward: UnboundedContinuousTensorSpec(\n",
+ " shape=torch.Size([10, 1]),\n",
+ " space=None,\n",
+ " device=cpu,\n",
+ " dtype=torch.float32,\n",
+ " domain=continuous), device=cpu, shape=torch.Size([10])), device=cpu, shape=torch.Size([])) \n",
+ "\n",
+ "\n",
+ "done_spec: CompositeSpec(\n",
+ " done: DiscreteTensorSpec(\n",
+ " shape=torch.Size([1]),\n",
+ " space=DiscreteBox(n=2),\n",
+ " device=cpu,\n",
+ " dtype=torch.bool,\n",
+ " domain=discrete),\n",
+ " terminated: DiscreteTensorSpec(\n",
+ " shape=torch.Size([1]),\n",
+ " space=DiscreteBox(n=2),\n",
+ " device=cpu,\n",
+ " dtype=torch.bool,\n",
+ " domain=discrete),\n",
+ " truncated: DiscreteTensorSpec(\n",
+ " shape=torch.Size([1]),\n",
+ " space=DiscreteBox(n=2),\n",
+ " device=cpu,\n",
+ " dtype=torch.bool,\n",
+ " domain=discrete),\n",
+ " agents: CompositeSpec(\n",
+ " done: DiscreteTensorSpec(\n",
+ " shape=torch.Size([10, 1]),\n",
+ " space=DiscreteBox(n=2),\n",
+ " device=cpu,\n",
+ " dtype=torch.bool,\n",
+ " domain=discrete),\n",
+ " terminated: DiscreteTensorSpec(\n",
+ " shape=torch.Size([10, 1]),\n",
+ " space=DiscreteBox(n=2),\n",
+ " device=cpu,\n",
+ " dtype=torch.bool,\n",
+ " domain=discrete),\n",
+ " truncated: DiscreteTensorSpec(\n",
+ " shape=torch.Size([10, 1]),\n",
+ " space=DiscreteBox(n=2),\n",
+ " device=cpu,\n",
+ " dtype=torch.bool,\n",
+ " domain=discrete), device=cpu, shape=torch.Size([10])), device=cpu, shape=torch.Size([])) \n",
+ "\n",
+ "\n",
+ "observation_spec: CompositeSpec(\n",
+ " agents: CompositeSpec(\n",
+ " observation: BoundedTensorSpec(\n",
+ " shape=torch.Size([10, 2]),\n",
+ " space=ContinuousBox(\n",
+ " low=Tensor(shape=torch.Size([10, 2]), device=cpu, dtype=torch.float32, contiguous=True),\n",
+ " high=Tensor(shape=torch.Size([10, 2]), device=cpu, dtype=torch.float32, contiguous=True)),\n",
+ " device=cpu,\n",
+ " dtype=torch.float32,\n",
+ " domain=continuous),\n",
+ " mask: DiscreteTensorSpec(\n",
+ " shape=torch.Size([10]),\n",
+ " space=DiscreteBox(n=2),\n",
+ " device=cpu,\n",
+ " dtype=torch.bool,\n",
+ " domain=discrete), device=cpu, shape=torch.Size([10])), device=cpu, shape=torch.Size([])) \n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"action_spec:\", env.full_action_spec, \"\\n\\n\")\n",
+ "print(\"reward_spec:\", env.full_reward_spec, \"\\n\\n\")\n",
+ "print(\"done_spec:\", env.full_done_spec, \"\\n\\n\")\n",
+ "print(\"observation_spec:\", env.observation_spec, \"\\n\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> Agent group mapping"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "env.group is: {'agents': ['13', '11', '10', '6', '19', '8', '12', '18', '7', '15']} \n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"env.group is: \", env.group_map, \"\\n\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Transforms"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> We can append any TorchRL transform we need to our environment. These will modify its input/output in some desired way. In multi-agent contexts, it is paramount to provide explicitly the keys to modify.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here we instatiate a RewardSum
transformer that will sum rewards over episode."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "env = TransformedEnv(\n",
+ " env,\n",
+ " RewardSum(in_keys=[env.reward_key], out_keys=[(\"agents\", \"episode_reward\")]),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The check_env_specs()
function runs a small rollout and compared it output against the environment specs. It will raise an error if the specs aren't properly defined."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "2024-11-08 15:31:34,578 [torchrl][INFO] check_env_specs succeeded!\n"
+ ]
+ }
+ ],
+ "source": [
+ "check_env_specs(env)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "reset_td = env.reset()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Rollout"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "With env.rollout(n_steps)
we can get an overview of what the environment inputs and outputs look like."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "##### Observation and Action Availability in TorchRL\n",
+ "\n",
+ "In the TorchRL framework, the observations are available in both the root and next states, as they can be accessed during both the reset phase and after executing a step.\n",
+ "\n",
+ "##### Action Availability\n",
+ "Actions are exclusively available in the root state, as there are no actions generated as a result of a step.\n",
+ "\n",
+ "##### Reward Availability\n",
+ "Rewards are provided only in the next state, as no rewards are issued at the reset time.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "rollout of three steps: TensorDict(\n",
+ " fields={\n",
+ " agents: TensorDict(\n",
+ " fields={\n",
+ " action: Tensor(shape=torch.Size([5, 10]), device=cpu, dtype=torch.int64, is_shared=False),\n",
+ " done: Tensor(shape=torch.Size([5, 10, 1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " episode_reward: Tensor(shape=torch.Size([5, 10, 1]), device=cpu, dtype=torch.float32, is_shared=False),\n",
+ " mask: Tensor(shape=torch.Size([5, 10]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " observation: Tensor(shape=torch.Size([5, 10, 2]), device=cpu, dtype=torch.float32, is_shared=False),\n",
+ " terminated: Tensor(shape=torch.Size([5, 10, 1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " truncated: Tensor(shape=torch.Size([5, 10, 1]), device=cpu, dtype=torch.bool, is_shared=False)},\n",
+ " batch_size=torch.Size([5, 10]),\n",
+ " device=cpu,\n",
+ " is_shared=False),\n",
+ " done: Tensor(shape=torch.Size([5, 1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " next: TensorDict(\n",
+ " fields={\n",
+ " agents: TensorDict(\n",
+ " fields={\n",
+ " done: Tensor(shape=torch.Size([5, 10, 1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " episode_reward: Tensor(shape=torch.Size([5, 10, 1]), device=cpu, dtype=torch.float32, is_shared=False),\n",
+ " mask: Tensor(shape=torch.Size([5, 10]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " observation: Tensor(shape=torch.Size([5, 10, 2]), device=cpu, dtype=torch.float32, is_shared=False),\n",
+ " reward: Tensor(shape=torch.Size([5, 10, 1]), device=cpu, dtype=torch.float32, is_shared=False),\n",
+ " terminated: Tensor(shape=torch.Size([5, 10, 1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " truncated: Tensor(shape=torch.Size([5, 10, 1]), device=cpu, dtype=torch.bool, is_shared=False)},\n",
+ " batch_size=torch.Size([5, 10]),\n",
+ " device=cpu,\n",
+ " is_shared=False),\n",
+ " done: Tensor(shape=torch.Size([5, 1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " terminated: Tensor(shape=torch.Size([5, 1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " truncated: Tensor(shape=torch.Size([5, 1]), device=cpu, dtype=torch.bool, is_shared=False)},\n",
+ " batch_size=torch.Size([5]),\n",
+ " device=cpu,\n",
+ " is_shared=False),\n",
+ " terminated: Tensor(shape=torch.Size([5, 1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " truncated: Tensor(shape=torch.Size([5, 1]), device=cpu, dtype=torch.bool, is_shared=False)},\n",
+ " batch_size=torch.Size([5]),\n",
+ " device=cpu,\n",
+ " is_shared=False) \n",
+ "\n",
+ "\n",
+ "\n",
+ "Shape of the rollout TensorDict: torch.Size([5]) \n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "n_rollout_steps = 5\n",
+ "rollout = env.rollout(n_rollout_steps)\n",
+ "print(\"rollout of three steps:\", rollout, \"\\n\\n\\n\")\n",
+ "print(\"Shape of the rollout TensorDict:\", rollout.batch_size, \"\\n\\n\\n\")"
+ ]
+ },
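+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> A short indexing sketch of the availability rules above, using the `rollout` tensordict from the previous cell (illustration only, not part of the training pipeline)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Observations live both at the root and under \"next\";\n",
+    "# actions only at the root; rewards only under \"next\".\n",
+    "obs_root = rollout[\"agents\", \"observation\"]\n",
+    "obs_next = rollout[\"next\", \"agents\", \"observation\"]\n",
+    "actions = rollout[\"agents\", \"action\"]\n",
+    "rewards = rollout[\"next\", \"agents\", \"reward\"]\n",
+    "print(obs_root.shape, obs_next.shape, actions.shape, rewards.shape)"
+   ]
+  },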
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Policy/Actor network"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The Proximal Policy Optimization (PPO) algorithm employs a **stochastic policy** to effectively handle exploration during training. Instead of directly outputting a single action, the neural network generates the parameters of a probability distribution over the action space. Actions are then sampled from this distribution.\n",
+ "\n",
+ "For discrete action spaces, the neural network outputs the **logits**, which represent the unnormalized scores for each possible action. These logits are subsequently transformed into a probability distribution using a softmax function. The agent samples an action from this distribution, ensuring a balance between exploration and exploitation during learning.\n",
+ "\n",
+ "This stochastic approach is essential for robust exploration, as it prevents the policy from becoming deterministic too early, which could lead to suboptimal solutions in complex environments."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "share_parameters_policy = False \n",
+ "\n",
+ "policy_net = torch.nn.Sequential(\n",
+ " MultiAgentMLP(\n",
+ " n_agent_inputs = env.observation_spec[\"agents\", \"observation\"].shape[-1],\n",
+ " n_agent_outputs = env.action_spec.space.n,\n",
+ " n_agents = env.n_agents,\n",
+ " centralised=False,\n",
+ " share_params=share_parameters_policy,\n",
+ " device=device,\n",
+ " depth=3,\n",
+ " num_cells=64,\n",
+ " activation_class=torch.nn.Tanh,\n",
+ " ),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> The neural network is wrapped in a `TensorDictModule`, which is responsible for managing the input and output interactions with the tensordict. Specifically, the module reads from the specified `in_keys`, processes the inputs through the neural network, and writes the resulting outputs to the defined `out_keys`. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "policy_module = TensorDictModule(\n",
+ " policy_net,\n",
+ " in_keys=[(\"agents\", \"observation\")],\n",
+ " out_keys=[(\"agents\", \"logits\")],\n",
+ ") "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> The `ProbabilisticActor` takes as input the logits which are used to parameterize a probability distribution. Actions are then sampled from this distribution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "policy = ProbabilisticActor(\n",
+ " module=policy_module,\n",
+ " spec=env.action_spec,\n",
+ " in_keys=[(\"agents\", \"logits\")],\n",
+ " out_keys=[env.action_key],\n",
+ " distribution_class=Categorical,\n",
+ " return_log_prob=True,\n",
+ " log_prob_key=(\"agents\", \"sample_log_prob\"),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Critic network"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> The critic reads the observations and returns the corresponding value estimates."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In **Multi-Agent Proximal Policy Optimization (MAPPO)**, the critic is centralized and has access to the full state of the environment, providing **full observability**. This centralized critic enables better coordination between agents by evaluating the global state rather than just individual observations.\n",
+ "\n",
+ "In contrast, **Independent Proximal Policy Optimization (IPPO)** uses a **local, decentralized critic**. Similar to the policy, the critic in IPPO is based solely on the agent's local observations, making it more scalable and applicable to fully decentralized environments where agents do not have access to the full global state.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "share_parameters_critic = True\n",
+ "mappo = False # IPPO if False\n",
+ "\n",
+ "critic_net = MultiAgentMLP(\n",
+ " n_agent_inputs=env.observation_spec[\"agents\", \"observation\"].shape[-1],\n",
+ " n_agent_outputs=1, \n",
+ " n_agents=env.n_agents,\n",
+ " centralised=mappo,\n",
+ " share_params=share_parameters_critic,\n",
+ " device=device,\n",
+ " depth=4,\n",
+ " num_cells=64,\n",
+ " activation_class=torch.nn.ReLU,\n",
+ ")\n",
+ "\n",
+ "critic = TensorDictModule(\n",
+ " module=critic_net,\n",
+ " in_keys=[(\"agents\", \"observation\")],\n",
+ " out_keys=[(\"agents\", \"state_value\")],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> Let's try our policy and critic modules."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Running policy: TensorDict(\n",
+ " fields={\n",
+ " agents: TensorDict(\n",
+ " fields={\n",
+ " action: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.int64, is_shared=False),\n",
+ " done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " episode_reward: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),\n",
+ " logits: Tensor(shape=torch.Size([10, 2]), device=cpu, dtype=torch.float32, is_shared=False),\n",
+ " mask: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " observation: Tensor(shape=torch.Size([10, 2]), device=cpu, dtype=torch.float32, is_shared=False),\n",
+ " sample_log_prob: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.float32, is_shared=False),\n",
+ " terminated: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " truncated: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False)},\n",
+ " batch_size=torch.Size([10]),\n",
+ " device=cpu,\n",
+ " is_shared=False),\n",
+ " done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " terminated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " truncated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False)},\n",
+ " batch_size=torch.Size([]),\n",
+ " device=cpu,\n",
+ " is_shared=False)\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Running policy:\", policy(env.reset()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Running value: TensorDict(\n",
+ " fields={\n",
+ " agents: TensorDict(\n",
+ " fields={\n",
+ " done: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " episode_reward: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),\n",
+ " mask: Tensor(shape=torch.Size([10]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " observation: Tensor(shape=torch.Size([10, 2]), device=cpu, dtype=torch.float32, is_shared=False),\n",
+ " state_value: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.float32, is_shared=False),\n",
+ " terminated: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " truncated: Tensor(shape=torch.Size([10, 1]), device=cpu, dtype=torch.bool, is_shared=False)},\n",
+ " batch_size=torch.Size([10]),\n",
+ " device=cpu,\n",
+ " is_shared=False),\n",
+ " done: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " terminated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False),\n",
+ " truncated: Tensor(shape=torch.Size([1]), device=cpu, dtype=torch.bool, is_shared=False)},\n",
+ " batch_size=torch.Size([]),\n",
+ " device=cpu,\n",
+ " is_shared=False)\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"Running value:\", critic(env.reset()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Collector"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Collectors perform the following operations:\n",
+ "\n",
+ "1. **Reset Environment**: Initialize the environment.\n",
+ "2. **Compute Action**: Determine the next action using the policy and the latest observation.\n",
+ "3. **Execute Step**: Step through the environment with the computed action.\n",
+ "\n",
+ "These operations repeat until the environment signals to stop."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "collector = SyncDataCollector(\n",
+ " env,\n",
+ " policy,\n",
+ " device=device,\n",
+ " storing_device=device,\n",
+ " frames_per_batch=frames_per_batch,\n",
+ " total_frames=total_frames,\n",
+ ") "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Replay buffer"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In an on-policy setting, the replay buffer is refilled each time a batch of data is collected. The data within this buffer is then utilized multiple times over a specified number of epochs."
+ ]
+ },
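+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> With the values set above (assuming the default 10 machine agents, so `frames_per_batch = 40`, `minibatch_size = 2`, `num_epochs = 10`), each collected batch is split into 40 / 2 = 20 minibatches per epoch, i.e. 200 gradient steps per training iteration."
+   ]
+  },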
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "replay_buffer = ReplayBuffer(\n",
+ " storage=LazyTensorStorage(\n",
+ " frames_per_batch, device=device\n",
+ " ), \n",
+ " sampler=SamplerWithoutReplacement(),\n",
+ " batch_size=minibatch_size,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### PPO loss function"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "An advantage estimation needs to be computed. An advantage is a value that reflects an expectancy over the return value while dealing with the bias/variance tradeoff. To compute the advantage we need to build the advantage module and pass each batch of data through it before each epoch. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "loss_module = ClipPPOLoss(\n",
+ " actor_network=policy,\n",
+ " critic_network=critic,\n",
+ " clip_epsilon=clip_epsilon,\n",
+ " entropy_coef=entropy_eps,\n",
+ " normalize_advantage=False,\n",
+ ")\n",
+ "loss_module.set_keys( \n",
+ " reward=env.reward_key, \n",
+ " action=env.action_key, \n",
+ " sample_log_prob=(\"agents\", \"sample_log_prob\"),\n",
+ " value=(\"agents\", \"state_value\"),\n",
+ " done=(\"agents\", \"done\"),\n",
+ " terminated=(\"agents\", \"terminated\"),\n",
+ ")\n",
+ "\n",
+ "loss_module.make_value_estimator(\n",
+ " ValueEstimators.GAE, gamma=gamma, lmbda=lmbda\n",
+ ") \n",
+ "\n",
+ "GAE = loss_module.value_estimator\n",
+ "\n",
+ "optim = torch.optim.Adam(loss_module.parameters(), lr)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Training loop"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "episode_reward_mean = -0.7358333468437195: 100%|██████████| 30/30 [02:56<00:00, 5.96s/it]"
+ ]
+ }
+ ],
+ "source": [
+ "pbar = tqdm(total=n_iters, desc=\"episode_reward_mean = 0\")\n",
+ "\n",
+ "episode_reward_mean_list = []\n",
+ "loss_values = []\n",
+ "loss_entropy = []\n",
+ "loss_objective = []\n",
+ "loss_critic = []\n",
+ "\n",
+ "for tensordict_data in collector: ##loops over frame_per_batch\n",
+ "\n",
+ " ## Generate the rollouts\n",
+ " tensordict_data.set(\n",
+ " (\"next\", \"agents\", \"done\"),\n",
+ " tensordict_data.get((\"next\", \"done\"))\n",
+ " .unsqueeze(-1)\n",
+ " .expand(tensordict_data.get_item_shape((\"next\", env.reward_key))), # Adjust index to start from 0\n",
+ " )\n",
+ " tensordict_data.set(\n",
+ " (\"next\", \"agents\", \"terminated\"),\n",
+ " tensordict_data.get((\"next\", \"terminated\"))\n",
+ " .unsqueeze(-1)\n",
+ " .expand(tensordict_data.get_item_shape((\"next\", env.reward_key))), # Adjust index to start from 0\n",
+ " )\n",
+ "\n",
+ " # Compute GAE for all agents\n",
+ " with torch.no_grad():\n",
+ " GAE(\n",
+ " tensordict_data,\n",
+ " params=loss_module.critic_network_params,\n",
+ " target_params=loss_module.target_critic_network_params,\n",
+ " )\n",
+ "\n",
+ " data_view = tensordict_data.reshape(-1) \n",
+ " replay_buffer.extend(data_view)\n",
+ "\n",
+ " ## Update the policies of the learning agents\n",
+ " for _ in range(num_epochs):\n",
+ " for _ in range(frames_per_batch // minibatch_size):\n",
+ " subdata = replay_buffer.sample()\n",
+ " loss_vals = loss_module(subdata)\n",
+ "\n",
+ " loss_value = (\n",
+ " loss_vals[\"loss_objective\"]\n",
+ " + loss_vals[\"loss_critic\"]\n",
+ " + loss_vals[\"loss_entropy\"]\n",
+ " )\n",
+ "\n",
+ " loss_value.backward()\n",
+ "\n",
+ " torch.nn.utils.clip_grad_norm_(\n",
+ " loss_module.parameters(), max_grad_norm\n",
+ " ) \n",
+ "\n",
+ " optim.step()\n",
+ " optim.zero_grad()\n",
+ "\n",
+ " loss_values.append(loss_value.item())\n",
+ "\n",
+ " loss_entropy.append(loss_vals[\"loss_entropy\"].item())\n",
+ "\n",
+ " loss_objective.append(loss_vals[\"loss_objective\"].item())\n",
+ "\n",
+ " loss_critic.append(loss_vals[\"loss_critic\"].item())\n",
+ "\n",
+ "\n",
+ " \n",
+ " collector.update_policy_weights_()\n",
+ " \n",
+ " # Logging\n",
+ " done = tensordict_data.get((\"next\", \"agents\", \"done\")) # Get done status for the group\n",
+ "\n",
+ " episode_reward_mean = (\n",
+ " tensordict_data.get((\"next\", \"agents\", \"episode_reward\"))[done].mean().item()\n",
+ " )\n",
+ " episode_reward_mean_list.append(episode_reward_mean)\n",
+ "\n",
+ "\n",
+ " pbar.set_description(f\"episode_reward_mean = {episode_reward_mean}\", refresh=False)\n",
+ " pbar.update()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> Check `\\plots` directory to find the plots created from this experiment."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n",
+ "WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from RouteRL.services import plotter\n",
+ "plotter(params[kc.PLOTTER])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "env.stop()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "> We can also plot the loss values to get an insight on whether the algorithm converged."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "torchrl",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/tutorials/MLinPL/qmix_mutation.ipynb b/tutorials/MLinPL/qmix_mutation.ipynb
index a300d6f2e..e9793a20f 100644
--- a/tutorials/MLinPL/qmix_mutation.ipynb
+++ b/tutorials/MLinPL/qmix_mutation.ipynb
@@ -117,7 +117,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
@@ -153,12 +153,7 @@
"max_grad_norm = 1.0 # Maximum norm for the gradients\n",
"memory_size = 1000 # Size of the replay buffer\n",
"tau = 0.005\n",
- "\n",
- "# PPO\n",
- "clip_epsilon = 0.2 # clip value for PPO loss\n",
- "gamma = 0.99 # discount factor\n",
- "lmbda = 0.9 # lambda for generalised advantage estimation\n",
- "entropy_eps = 1e-4 # coefficient of the entropy term in the PPO loss"
+ "gamma = 0.99 # discount factor"
]
},
{
diff --git a/tutorials/MLinPL/vdn_mutation.ipynb b/tutorials/MLinPL/vdn_mutation.ipynb
index 3f1397763..b0374e38c 100644
--- a/tutorials/MLinPL/vdn_mutation.ipynb
+++ b/tutorials/MLinPL/vdn_mutation.ipynb
@@ -122,7 +122,7 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": null,
"metadata": {},
"outputs": [
{
@@ -158,12 +158,7 @@
"max_grad_norm = 1.0 # Maximum norm for the gradients\n",
"memory_size = 1000 # Size of the replay buffer\n",
"tau = 0.005\n",
- "\n",
- "# PPO\n",
- "clip_epsilon = 0.2 # clip value for PPO loss\n",
- "gamma = 0.99 # discount factor\n",
- "lmbda = 0.9 # lambda for generalised advantage estimation\n",
- "entropy_eps = 1e-4 # coefficient of the entropy term in the PPO loss"
+ "gamma = 0.99 # discount factor"
]
},
{
diff --git a/tutorials/PettingZooEnv/params_main.json b/tutorials/PettingZooEnv/params_main.json
index b1bca046b..af018a8b1 100644
--- a/tutorials/PettingZooEnv/params_main.json
+++ b/tutorials/PettingZooEnv/params_main.json
@@ -10,10 +10,10 @@
"runner_parameters":
{
- "num_episodes" : 200,
+ "num_episodes" : 1600,
"episode_length": 1,
"phases" : [0, 100],
- "phase_names" : ["Human learning", "Mutation"],
+ "phase_names" : ["Human learning", "Mutation - Machine learning"],
"remember_every" : 1,
"frequent_progressbar_update" : false
},
@@ -21,7 +21,7 @@
"phase_parameters":
{
"number_of_phases": 2,
- "number_episodes_each_phase": [100, 100]
+ "number_episodes_each_phase": [100, 1000]
},
@@ -52,9 +52,9 @@
"learning_phases" : [0, 2],
"model" : "gawron",
"alpha" : 0.2,
- "alpha_sigma": 0.1,
+ "alpha_sigma": 0.2,
"alpha_zero": 0.2,
- "beta" : 0.5,
+ "beta" : 0.01,
"beta_randomness" : 0.3,
"appearance_phase" : "No need for this, by default appears in 0",
"behavior" : "No need for this, by default selfish"
@@ -85,7 +85,6 @@
},
-
"plotter_parameters":
{
"smooth_by" : 50,
@@ -106,7 +105,7 @@
"losses_log_file_name": "losses.txt",
"detector_logs_folder": "detector",
"paths_csv_file_name": "paths.csv",
- "free_flow_times_csv_file_name": "free_flow_times.csv",
+ "free_flow_times_csv_file_name": "../../path_generation/free_flow_times.csv",
"plots_folder": "plots",
"reward_plot_file_name": "rewards.png",
@@ -126,7 +125,7 @@
{
"num_agents" : 20,
"new_machines_after_mutation": 10,
- "ratio_mutating" : 0.33,
+ "ratio_mutating" : 0.5,
"agent_attributes" : ["id", "origin", "destination", "start_time", "kind"],
"simulation_timesteps" : "${simulator_parameters.simulation_timesteps}",
"origins" : "${path_generation_parameters.origins}",