From f64079783fb3ec3a0bf1e23e6f857580595e4751 Mon Sep 17 00:00:00 2001
From: Mihai-Alexandru Hutan <88674929+hutanmihai@users.noreply.github.com>
Date: Sat, 6 Jan 2024 10:51:39 +0200
Subject: [PATCH] Initial commit

---
 .gitignore                 | 15 +++++++
 README.md                  | 64 +++++++++++++++++++++++++++++
 src/dqn/agent.py           | 82 ++++++++++++++++++++++++++++++++++++++
 src/dqn/constants.py       | 25 ++++++++++++
 src/dqn/dqn.py             | 37 +++++++++++++++++
 src/dqn/main.py            | 82 ++++++++++++++++++++++++++++++++++++++
 src/dqn/replay_memory.py   | 45 +++++++++++++++++++++
 src/ppo/main.py            | 27 +++++++++++++
 src/utils/helpers.py       | 24 +++++++++++
 src/utils/preprocessing.py | 48 ++++++++++++++++++++++
 10 files changed, 449 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 src/dqn/agent.py
 create mode 100644 src/dqn/constants.py
 create mode 100644 src/dqn/dqn.py
 create mode 100644 src/dqn/main.py
 create mode 100644 src/dqn/replay_memory.py
 create mode 100644 src/ppo/main.py
 create mode 100644 src/utils/helpers.py
 create mode 100644 src/utils/preprocessing.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8923abe
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,15 @@
+# IDEs
+.idea/
+.vscode/
+
+# OS & miscellaneous
+.DS_Store
+out/
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+models/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b085db6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,64 @@
+# reinforcement-learning-skiing
+
+## Required libraries
+
+```bash
+conda create -n rl-skiing python=3.11.5
+conda activate rl-skiing
+pip install numpy matplotlib jupyter opencv-python gymnasium
+pip install "gymnasium[accept-rom-license, atari]"
+```
+
+- For Windows (GPU):
+
+```bash
+pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
+```
+
+- For Windows (CPU) / macOS (CPU) / Linux (GPU):
+
+```bash
+pip install torch torchvision
+```
+
+- For Linux (CPU):
+
+```bash
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+```
+
+## How to run the project
+
+### 1. Set the PYTHONPATH
+
+- Windows - PowerShell:
+
+```bash
+$env:PYTHONPATH='.'
+```
+
+- Windows - CMD:
+
+```bash
+set PYTHONPATH=.
+```
+
+- Linux / macOS:
+
+```bash
+export PYTHONPATH=.
+```
+
+### 2. Run the project
+
+- Run DQN:
+
+```bash
+python src/dqn/main.py
+```
+
+- Run PPO:
+
+```bash
+python src/ppo/main.py
+```
\ No newline at end of file
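Once the dependencies and `PYTHONPATH` are set up as described in the README above, a quick sanity check that the Atari ROMs were installed correctly can save debugging later. The sketch below is illustrative only and uses nothing beyond the standard `gymnasium` API:

```python
# Illustrative sanity check: confirm ALE/Skiing-v5 is available after install.
import gymnasium

env = gymnasium.make("ALE/Skiing-v5")
obs, info = env.reset()
print(env.action_space)      # Discrete(3): NOOP, RIGHT, LEFT
print(obs.shape, obs.dtype)  # (210, 160, 3) uint8 RGB frame
env.close()
```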
diff --git a/src/dqn/agent.py b/src/dqn/agent.py
new file mode 100644
index 0000000..ec3813e
--- /dev/null
+++ b/src/dqn/agent.py
@@ -0,0 +1,82 @@
+import torch
+from matplotlib import pyplot as plt
+
+from src.dqn.dqn import DQN
+from src.dqn.replay_memory import ReplayMemory
+from random import random
+from src.dqn.constants import (
+    POLICY_NET_PATH,
+    TARGET_NET_PATH,
+    MODELS_PATH,
+    EPSILON_MAX,
+    EPSILON_MIN,
+    EPSILON_DECAY,
+    GAMMA,
+    BATCH_SIZE,
+    DEVICE,
+)
+import numpy as np
+
+from src.utils.helpers import check_if_dirs_exist
+
+
+class Agent:
+    def __init__(self, action_space):
+        self.action_space = action_space
+        self.gamma: float = GAMMA
+        self.device = DEVICE
+
+        self.epsilon: float = EPSILON_MAX
+        self.epsilon_min: float = EPSILON_MIN
+        self.epsilon_decay: float = EPSILON_DECAY
+
+        self.replay_memory: ReplayMemory = ReplayMemory()
+
+        self.policy_net: DQN = DQN().to(self.device)
+        self.target_net: DQN = DQN().to(self.device)
+        self.update_target_net()
+
+    def update_target_net(self):
+        self.target_net.load_state_dict(self.policy_net.state_dict())
+
+    def select_action(self, state):
+        if random() < self.epsilon:
+            return self.action_space.sample()
+
+        if not torch.is_tensor(state):
+            state = torch.from_numpy(np.array(state)).float().unsqueeze(0).to(self.device)
+
+        with torch.no_grad():
+            action = torch.argmax(self.policy_net(state))
+
+        return action.item()
+
+    def decay_epsilon(self):
+        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
+
+    def please_learn(self):
+        if len(self.replay_memory) < BATCH_SIZE:
+            return
+
+        states, actions, rewards, dones, next_states = self.replay_memory.sample(self.device)
+
+        predicted_qs = self.policy_net(states).gather(1, actions)  # Q-values of the actions actually taken
+        target_qs = self.target_net(next_states)
+        target_qs = torch.max(target_qs, dim=1).values.reshape(-1, 1)
+        target_qs[dones] = 0.0
+        target_qs = rewards + (self.gamma * target_qs)
+
+        loss = self.policy_net.loss(predicted_qs, target_qs)
+        self.policy_net.optimizer.zero_grad()
+        loss.backward()
+        self.policy_net.optimizer.step()
+
+    def save(self):
+        check_if_dirs_exist([MODELS_PATH])
+        torch.save(self.policy_net.state_dict(), POLICY_NET_PATH)
+        torch.save(self.target_net.state_dict(), TARGET_NET_PATH)
+
+    def load(self):
+        self.policy_net.load_state_dict(torch.load(POLICY_NET_PATH))
+        self.target_net.load_state_dict(torch.load(TARGET_NET_PATH))
+        self.target_net.eval()
diff --git a/src/dqn/constants.py b/src/dqn/constants.py
new file mode 100644
index 0000000..65000d7
--- /dev/null
+++ b/src/dqn/constants.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+import torch
+
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+NUM_ACTIONS = 3
+INPUT_SHAPE = (1, 75, 70)  # PyTorch uses (channels, height, width) format
+
+# TODO: Fine tuning
+LEARNING_RATE = 1e-2
+MEMORY_CAPACITY = 32
+NUM_EPISODES = 10
+BATCH_SIZE = 16
+UPDATE_FREQUENCY = 20
+
+# These might be good
+GAMMA = 0.99
+EPSILON_MAX = 1.0
+EPSILON_MIN = 0.01
+EPSILON_DECAY = 0.99
+
+MODELS_PATH = Path("models")
+POLICY_NET_PATH = MODELS_PATH / "policy_net.pth"
+TARGET_NET_PATH = MODELS_PATH / "target_net.pth"
+MODEL_PATH = MODELS_PATH / "model.pth"
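The exploration schedule implied by the constants above is worth a quick back-of-the-envelope check while they are still marked `TODO: Fine tuning`: `decay_epsilon()` multiplies epsilon by `EPSILON_DECAY` once per episode, so with only 10 episodes epsilon barely decays. An illustrative calculation (constants copied from `constants.py`):

```python
# Illustrative only: how fast epsilon decays with the constants above.
import math

EPSILON_MAX, EPSILON_MIN, EPSILON_DECAY = 1.0, 0.01, 0.99
NUM_EPISODES = 10

episodes_to_min = math.ceil(math.log(EPSILON_MIN / EPSILON_MAX) / math.log(EPSILON_DECAY))
print(episodes_to_min)                              # 459 episodes until epsilon hits EPSILON_MIN
print(EPSILON_MAX * EPSILON_DECAY ** NUM_EPISODES)  # ~0.904 after the current 10 episodes
```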
diff --git a/src/dqn/dqn.py b/src/dqn/dqn.py
new file mode 100644
index 0000000..ebae7b8
--- /dev/null
+++ b/src/dqn/dqn.py
@@ -0,0 +1,37 @@
+import torch
+import torch.nn as nn
+from torch import optim
+from src.dqn.constants import INPUT_SHAPE, NUM_ACTIONS, LEARNING_RATE
+
+
+class DQN(nn.Module):
+    def __init__(self):
+        super(DQN, self).__init__()
+        self.conv1 = nn.Conv2d(INPUT_SHAPE[0], 32, kernel_size=8, stride=4)
+        self.norm1 = nn.BatchNorm2d(32)
+        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
+        self.norm2 = nn.BatchNorm2d(64)
+        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
+        self.norm3 = nn.BatchNorm2d(64)
+
+        self.flatten = nn.Flatten()
+
+        self.fc = nn.Linear(64 * 5 * 5, 512)
+        self.output = nn.Linear(512, NUM_ACTIONS)
+
+        # TODO: Maybe use RMSProp?
+        self.optimizer = optim.Adam(self.parameters(), lr=LEARNING_RATE)
+        self.loss = nn.MSELoss()
+
+    def _forward_features(self, x):
+        x = torch.relu(self.norm1(self.conv1(x)))
+        x = torch.relu(self.norm2(self.conv2(x)))
+        x = torch.relu(self.norm3(self.conv3(x)))
+        return x
+
+    def forward(self, x):
+        x = self._forward_features(x)
+        x = self.flatten(x)
+        x = torch.relu(self.fc(x))
+        x = self.output(x)
+        return x
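The hard-coded `64 * 5 * 5` input size of `self.fc` depends on the `(1, 75, 70)` frames defined by `INPUT_SHAPE`. A throwaway shape check (illustrative; it copies the three conv layers above and skips the batch norms, which do not change spatial shapes) confirms the flattened feature size:

```python
# Illustrative shape check: the DQN conv stack applied to a dummy (1, 75, 70) frame.
import torch
import torch.nn as nn

convs = nn.Sequential(
    nn.Conv2d(1, 32, kernel_size=8, stride=4),
    nn.Conv2d(32, 64, kernel_size=4, stride=2),
    nn.Conv2d(64, 64, kernel_size=3, stride=1),
)
dummy = torch.zeros(1, 1, 75, 70)  # (batch, channels, height, width)
print(convs(dummy).shape)          # torch.Size([1, 64, 5, 5]) -> 64 * 5 * 5 features
```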
diff --git a/src/dqn/main.py b/src/dqn/main.py
new file mode 100644
index 0000000..15fea4f
--- /dev/null
+++ b/src/dqn/main.py
@@ -0,0 +1,82 @@
+import numpy as np
+import torch
+from gymnasium import make, Env
+from src.dqn.agent import Agent
+from src.dqn.constants import MEMORY_CAPACITY, MODEL_PATH, NUM_EPISODES, MODELS_PATH, UPDATE_FREQUENCY, BATCH_SIZE
+from src.utils.helpers import show_image, check_if_dirs_exist
+from src.utils.preprocessing import preprocess
+
+
+def reset(env: Env):
+    state, _info = env.reset()
+    state = preprocess(state)
+    return state
+
+
+def step(env: Env, action: int):
+    next_state, reward, terminated, truncated, info = env.step(action)
+    next_state = preprocess(next_state)
+    done = terminated or truncated
+    return next_state, reward, done, info
+
+
+def fill_memory(env: Env, agent: Agent):
+    for _ in range(MEMORY_CAPACITY):
+        state = reset(env)
+        done = False
+        while not done:
+            action = agent.select_action(state)
+            next_state, reward, done, info = step(env, action)
+            agent.replay_memory.store(state, action, reward, done, next_state)
+            state = next_state
+
+
+def train(
+    env,
+    agent: Agent,
+):
+    fill_memory(env, agent)
+    print("Memory filled with random actions!")
+
+    counter = 0
+    reward_history = []
+    best_score = -np.inf
+
+    for episode in range(NUM_EPISODES):
+        state = reset(env)
+        done = False
+        episode_reward = 0
+
+        while not done:
+            action = agent.select_action(state)
+            next_state, reward, done, info = step(env, action)
+            agent.replay_memory.store(state, action, reward, done, next_state)
+            agent.please_learn()
+
+            if counter % UPDATE_FREQUENCY == 0:
+                agent.update_target_net()
+
+            state = next_state
+            episode_reward += reward
+            counter += 1
+
+        agent.decay_epsilon()
+        reward_history.append(episode_reward)
+
+        current_avg_score = np.mean(reward_history[-20:])  # moving average over last 20 episodes
+
+        print(
+            f"Episode: {episode + 1}, Reward: {episode_reward}, Avg. Reward: {current_avg_score}, Epsilon: {agent.epsilon}"
+        )
+
+        if current_avg_score > best_score:
+            best_score = current_avg_score
+            check_if_dirs_exist([MODELS_PATH])
+            torch.save(agent.policy_net.state_dict(), MODEL_PATH)
+
+
+if __name__ == "__main__":
+    env: Env = make("ALE/Skiing-v5", max_episode_steps=1000)
+    agent = Agent(action_space=env.action_space)
+    train(env, agent)
+    agent.save()
diff --git a/src/dqn/replay_memory.py b/src/dqn/replay_memory.py
new file mode 100644
index 0000000..1db0dd0
--- /dev/null
+++ b/src/dqn/replay_memory.py
@@ -0,0 +1,45 @@
+from random import sample
+import torch
+import numpy as np
+
+from src.dqn.constants import MEMORY_CAPACITY, BATCH_SIZE
+
+
+class ReplayMemory:
+    def __init__(self):
+        self.capacity = MEMORY_CAPACITY
+        self.states = []
+        self.actions = []
+        self.rewards = []
+        self.dones = []
+        self.next_states = []
+        self.index: int = 0
+
+    def store(self, state, action, reward, done, next_state):
+        if len(self.states) < self.capacity:
+            self.states.append(state)
+            self.actions.append(action)
+            self.rewards.append(reward)
+            self.dones.append(done)
+            self.next_states.append(next_state)
+        else:
+            self.states[self.index] = state
+            self.actions[self.index] = action
+            self.rewards[self.index] = reward
+            self.dones[self.index] = done
+            self.next_states[self.index] = next_state
+
+        self.index = (self.index + 1) % self.capacity
+
+    def sample(self, device):
+        indices_to_sample = sample(range(len(self)), BATCH_SIZE)
+        states = torch.from_numpy(np.array(self.states)[indices_to_sample]).float().to(device)
+        actions = torch.from_numpy(np.array(self.actions)[indices_to_sample]).to(device).reshape((-1, 1))
+        rewards = torch.from_numpy(np.array(self.rewards)[indices_to_sample]).float().to(device).reshape((-1, 1))
+        dones = torch.from_numpy(np.array(self.dones)[indices_to_sample]).to(device).reshape((-1, 1))
+        next_states = torch.from_numpy(np.array(self.next_states)[indices_to_sample]).float().to(device)
+
+        return states, actions, rewards, dones, next_states
+
+    def __len__(self):
+        return len(self.states)
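As a quick round-trip check of the buffer above, the sketch below stores one batch worth of dummy transitions and samples it back; it is illustrative only and assumes the repository root is on `PYTHONPATH` (as the README describes) so that `src.*` is importable:

```python
# Illustrative only: store BATCH_SIZE dummy transitions, then sample a batch.
import numpy as np

from src.dqn.constants import BATCH_SIZE, DEVICE
from src.dqn.replay_memory import ReplayMemory

memory = ReplayMemory()
for _ in range(BATCH_SIZE):
    frame = np.zeros((1, 75, 70), dtype=np.uint8)  # preprocessed frame shape
    memory.store(frame, action=0, reward=-1.0, done=False, next_state=frame)

states, actions, rewards, dones, next_states = memory.sample(DEVICE)
print(states.shape, actions.shape, rewards.shape)  # [16, 1, 75, 70], [16, 1], [16, 1]
```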
diff --git a/src/ppo/main.py b/src/ppo/main.py
new file mode 100644
index 0000000..2b2f3aa
--- /dev/null
+++ b/src/ppo/main.py
@@ -0,0 +1,27 @@
+import numpy as np
+import gymnasium
+
+env = gymnasium.make("ALE/Skiing-v5", render_mode="human")
+env.metadata["render_fps"] = 60
+
+
+def run_episode(env, policy, render=False, max_steps=10000):
+    """Run a single episode with the given policy"""
+    obs = env.reset()
+    obs = obs[0]
+    for _ in range(max_steps):
+        if render:
+            env.render()
+            # time.sleep(0.5)
+        action = policy(obs)
+        obs, reward, terminated, truncated, info = env.step(action)
+        if terminated or truncated:  # stop stepping once the episode ends
+            break
+
+
+def random_policy(obs):
+    """A random policy for the Skiing environment"""
+    return np.random.randint(0, 3)
+
+
+run_episode(env, random_policy, render=True, max_steps=10000)
diff --git a/src/utils/helpers.py b/src/utils/helpers.py
new file mode 100644
index 0000000..1995f1c
--- /dev/null
+++ b/src/utils/helpers.py
@@ -0,0 +1,24 @@
+import os
+from pathlib import Path
+
+import numpy as np
+import cv2 as cv
+
+
+def show_image(image: np.ndarray, title: str = "image") -> None:
+    """
+    Shows the image.
+    :param image: the image to show
+    :param title: the title of the window, default is "image"
+    :return:
+    """
+    cv.namedWindow(title, cv.WINDOW_KEEPRATIO)
+    cv.imshow(title, image)
+    cv.waitKey(0)
+    cv.destroyAllWindows()
+
+
+def check_if_dirs_exist(paths: list[Path | str]) -> None:
+    for path in paths:
+        if not os.path.exists(path):
+            os.makedirs(path)
diff --git a/src/utils/preprocessing.py b/src/utils/preprocessing.py
new file mode 100644
index 0000000..3f89f26
--- /dev/null
+++ b/src/utils/preprocessing.py
@@ -0,0 +1,48 @@
+import cv2
+import numpy as np
+
+
+def crop(state: np.ndarray) -> np.ndarray:
+    """
+    Crops the state image to the relevant part of the screen.
+    :param state: the state image
+    :return: the cropped image
+    """
+    # Exact crop [30:180, 8:152]
+    # Rounded crop [30:180, 10:150]
+    # Maybe try with both of them
+    return state[30:180, 10:150]
+
+
+def resize(state: np.ndarray, scale: int = 2) -> np.ndarray:
+    """
+    Downsamples the state image.
+    :param state: the state image
+    :param scale: the scale to downsample by
+    :return: the downsampled image
+    """
+    return state[::scale, ::scale]
+
+
+def rgb2gray(rgb: np.ndarray) -> np.ndarray:
+    """
+    Converts an RGB image array to a grayscale image array.
+
+    :param rgb: the RGB image array.
+    :return: the converted array.
+    """
+    grayscale = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
+    grayscale = grayscale[np.newaxis, :, :]  # (75, 70) -> (1, 75, 70) for PyTorch
+    return grayscale
+
+
+def preprocess(state: np.ndarray) -> np.ndarray:
+    """
+    Preprocesses the state image.
+    :param state: the state image
+    :return: the preprocessed image
+    """
+    state = crop(state)
+    state = resize(state)
+    state = rgb2gray(state)
+    return state
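For reference, the chain `crop -> resize -> rgb2gray` turns a raw 210x160 RGB Atari frame into the `(1, 75, 70)` grayscale array expected by `INPUT_SHAPE`. A minimal check, illustrative only and assuming `PYTHONPATH=.` as in the README:

```python
# Illustrative only: raw Atari frame -> network input shape.
import numpy as np

from src.utils.preprocessing import crop, preprocess, resize

frame = np.zeros((210, 160, 3), dtype=np.uint8)  # raw ALE/Skiing-v5 observation

print(crop(frame).shape)          # (150, 140, 3) after cropping the play area
print(resize(crop(frame)).shape)  # (75, 70, 3) after downsampling by 2
print(preprocess(frame).shape)    # (1, 75, 70) grayscale, channels-first
```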