From 548e0cc6905440183de5d8a1d7afc880481b22b0 Mon Sep 17 00:00:00 2001 From: matheusslr Date: Tue, 7 Feb 2023 16:54:05 -0300 Subject: [PATCH 1/2] fix: Updating Pytorch NN to use .pt as save/load file --- urnai/base/savable.py | 4 ++++ .../neural_network/pytorch.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/urnai/base/savable.py b/urnai/base/savable.py index a25eee0f..6e5c9d3a 100644 --- a/urnai/base/savable.py +++ b/urnai/base/savable.py @@ -95,6 +95,10 @@ def get_full_persistance_path(self, persist_path): """This method returns the default persistance path.""" return persist_path + os.path.sep + self.get_default_save_stamp() + def get_full_persistance_pytorch_path(self, persist_path): + """This method returns the default persistance pytorch path.""" + return persist_path + os.path.sep + self.get_default_save_stamp() + '.pt' + def save(self, savepath): """ This method saves pickle objects diff --git a/urnai/models/memory_representations/neural_network/pytorch.py b/urnai/models/memory_representations/neural_network/pytorch.py index 0a76f659..f4c19391 100644 --- a/urnai/models/memory_representations/neural_network/pytorch.py +++ b/urnai/models/memory_representations/neural_network/pytorch.py @@ -1,3 +1,5 @@ +import os + import numpy as np import torch import torch.nn as nn @@ -120,6 +122,22 @@ def create_base_model(self): model = self.SubDeepQNetwork() return model + def save_extra(self, persist_path): + torch.save( + self.model.model_layers.state_dict(), + self.get_full_persistance_pytorch_path(persist_path), + ) + + def load_extra(self, persist_path): + exists = os.path.isfile(self.get_full_persistance_pytorch_path(persist_path)) + + if exists: + self.__init__(self.action_output_size, self.state_input_shape, self.build_model, + self.gamma, self.alpha, self.seed, self.batch_size) + self.model.model_layers.load_state_dict(torch.load( + self.get_full_persistance_pytorch_path(persist_path), + )) + def copy_model_weights(self, model_to_copy): self.model.load_state_dict(model_to_copy.model.state_dict()) From 423f61e0480488d19f911e7fe72b8402ba6c603c Mon Sep 17 00:00:00 2001 From: matheusslr Date: Thu, 9 Feb 2023 09:51:35 -0300 Subject: [PATCH 2/2] feat: New Rewards and Actions Representations --- .../solve_simple64_sm_gridstate.py | 58 ++++++++++ urnai/agents/actions/sc2_wrapper.py | 102 +++++++++++++++++- urnai/agents/rewards/sc2.py | 63 +++++++++++ urnai/models/algorithms/ddql.py | 2 + urnai/models/algorithms/dql.py | 2 + urnai/models/base/abmodel.py | 5 +- 6 files changed, 226 insertions(+), 6 deletions(-) create mode 100644 experiments/solves/experiments/solve_simple64_sm_gridstate.py diff --git a/experiments/solves/experiments/solve_simple64_sm_gridstate.py b/experiments/solves/experiments/solve_simple64_sm_gridstate.py new file mode 100644 index 00000000..b6da3b12 --- /dev/null +++ b/experiments/solves/experiments/solve_simple64_sm_gridstate.py @@ -0,0 +1,58 @@ +import sys, pathlib + +sys.path.insert(0, str(pathlib.Path(__file__).parent.parent.parent.parent)) + +from absl import app + +from urnai.envs.sc2 import SC2Env +from urnai.trainers.trainer import Trainer +from urnai.agents.sc2_agent import SC2Agent +from urnai.agents.actions.sc2_wrapper import SimpleTerranWrapper, SimpleMarineWrapper +from urnai.agents.rewards.sc2 import KilledUnitsReward, KilledUnitsRewardImproved, TStarBotReward +from urnai.agents.states.sc2 import Simple64GridState +from urnai.models.model_builder import ModelBuilder +from urnai.models.algorithms.ddql import DoubleDeepQLearning + +def 
declare_trainer(): + state_builder = Simple64GridState(grid_size=4) + action_wrapper = SimpleMarineWrapper() + + env = SC2Env(map_name="Simple64", render=False, self_play=False, + step_mul=16, realtime=False, player_race="terran", + enemy_race="random", difficulty="very_easy") + + helper = ModelBuilder() + helper.add_input_layer(100) + helper.add_fullyconn_layer(50) + helper.add_output_layer() + + dq_network = DoubleDeepQLearning(lib="pytorch", action_wrapper=action_wrapper, + state_builder=state_builder, build_model=helper.get_model_layout(), + epsilon_start=0.8, epsilon_decay=0.999, epsilon_linear_decay=True, + per_episode_epsilon_decay=True, use_memory=True, + epsilon_min=0.05, epsilon_decay_ep_start=200, + learning_rate=0.00013, min_memory_size=50000, memory_maxlen=120000) + + agent = SC2Agent(dq_network, KilledUnitsRewardImproved()) + + trainer = Trainer(env, agent, save_path='urnai/models/saved/sm_killed_reward_improved/', + file_name="sm_killed_reward_vs_random_lr_04", + save_every=100, enable_save=False, relative_path=True, + max_training_episodes=5000, max_steps_training=1200, + max_test_episodes=200, max_steps_testing=1200) + + return trainer + + +def main(unused_argv): + try: + trainer = declare_trainer() + trainer.train() + trainer.play() + + except KeyboardInterrupt: + pass + + +if __name__ == '__main__': + app.run(main) diff --git a/urnai/agents/actions/sc2_wrapper.py b/urnai/agents/actions/sc2_wrapper.py index 3f8c8c90..b077fa09 100644 --- a/urnai/agents/actions/sc2_wrapper.py +++ b/urnai/agents/actions/sc2_wrapper.py @@ -4,15 +4,16 @@ import numpy as np from pysc2.env import sc2_env -from pysc2.lib import units +from pysc2.lib import features, units # importing our action set file so that we can use its constants import urnai.agents.actions.sc2 as sc2 from urnai.agents.actions.sc2 import attack_distribute_army, attack_target_point, \ attack_target_point_spatial, build_gas_structure_raw_unit, build_structure_raw, \ build_structure_raw_pt, build_structure_raw_pt_spatial, can_queue_unit_terran, effect_units, \ - get_all_idle_workers, get_free_supply, get_units_by_type, harvest_gather_gas, \ - harvest_gather_minerals, harvest_gather_minerals_idle, move_target_point_spatial, no_op, \ - organize_queue, research_upgrade, select_army, train_unit, unit_exists + get_all_idle_workers, get_euclidean_distance, get_free_supply, get_units_by_type, \ + harvest_gather_gas, harvest_gather_minerals, harvest_gather_minerals_idle, \ + move_target_point_spatial, no_op, organize_queue, research_upgrade, select_army, train_unit, \ + unit_exists from .base.abwrapper import ActionWrapper @@ -107,6 +108,8 @@ ACTION_ATTACK_MY_BASE = 'attackmybase' ACTION_ATTACK_MY_SECOND_BASE = 'attackmysecondbase' ACTION_ATTACK_DISTRIBUTE_ARMY = 'attackdistributearmy' +ACTION_ATTACK_NERBY = 'attacknerby' + # Selects random idle scv > sends him to harvest minerals ACTION_HARVEST_MINERALS_IDLE = 'harvestmineralsidle' ACTION_HARVEST_MINERALS_FROM_GAS = 'harvestmineralsfromgas' @@ -114,6 +117,7 @@ ACTION_ATTACK_POINT = 'attackpoint' ACTION_MOVE_TROOPS_POINT = 'movetroopspoint' +ACTION_SEND_SCOUT = 'sendscout' class SC2Wrapper(ActionWrapper): @@ -746,6 +750,17 @@ def harvestgasfromminerals_exclude(excluded_actions, gi): if not gi.has_scv or not gi.has_refinery: excluded_actions.append(ACTION_HARVEST_GAS_FROM_MINERALS) + def attacknerby_exclude(excluded_actions, gi): + if not gi.has_marinemarauder or not gi.has_barracks: + excluded_actions.append(ACTION_ATTACK_NERBY) + + def attackenemybase_exclude(excluded_actions, gi): + 
if not gi.has_marinemarauder or not gi.has_barracks: + excluded_actions.append(ACTION_ATTACK_ENEMY_BASE) + + def attackenemysecondbase_exclude(excluded_actions, gi): + if not gi.has_marinemarauder or not gi.has_barracks: + excluded_actions.append(ACTION_ATTACK_ENEMY_SECOND_BASE) # endregion # region LIST OF ACTIONS @@ -1236,6 +1251,29 @@ def attackenemysecondbase(self, obs): action, self.actions_queue = organize_queue(actions, self.actions_queue) return action + def sendscout(self, obs): + target = self.enemy_base_xy + if not self.base_top_left: + target = (63 - target[0] - 5, 63 - target[1] + 5) + idle_worker = get_all_idle_workers(obs, sc2_env.Race.terran) + if idle_worker != sc2._NO_UNITS: + actions = attack_target_point_spatial(idle_worker, target) + action, self.actions_queue = organize_queue(actions, self.actions_queue) + return action + return no_op() + + def attacknerby(self, obs): + targets = [unit for unit in obs.raw_units + if unit.alliance == features.PlayerRelative.ENEMY] + troops = select_army(obs, sc2_env.Race.terran) + for troop in troops: + for target in targets: + if get_euclidean_distance([troop.x, troop.y], [target.x, target.y]) < 10: + actions = attack_target_point_spatial(troops, [target.x, target.y]) + action, self.actions_queue = organize_queue(actions, self.actions_queue) + return action + return no_op() + # endregion # endregion @@ -1449,6 +1487,62 @@ def __init__(self, use_atk_grid=False, atk_grid_x=4, atk_grid_y=4): } +class SimpleMarineWrapper(TerranWrapper): + def __init__(self, use_atk_grid=False, atk_grid_x=4, atk_grid_y=4): + TerranWrapper.__init__(self) + + self.use_atk_grid = use_atk_grid + self.atk_grid_x = int(atk_grid_x) + self.atk_grid_y = int(atk_grid_y) + + self.named_actions = [ + ACTION_DO_NOTHING, + + # BUILDING + ACTION_BUILD_COMMAND_CENTER, + ACTION_BUILD_BARRACKS, + ACTION_BUILD_SUPPLY_DEPOT, + + # TRAINS + ACTION_TRAIN_SCV, + ACTION_TRAIN_MARINE, + + # SCOUT + ACTION_SEND_SCOUT, + + # HAVERST + ACTION_HARVEST_MINERALS_FROM_GAS, + ] + + if self.use_atk_grid: + xgridsize = 64/self.atk_grid_x + ygridsize = 64/self.atk_grid_y + + for i in range(self.atk_grid_x): + for j in range(self.atk_grid_y): + x = xgridsize * (i + 1) - (xgridsize / 2) + y = ygridsize*(j+1) - (ygridsize/2) + self.named_actions.append(ACTION_ATTACK_POINT + '_' + str(x) + '_' + str(y)) + else: + self.named_actions.append(ACTION_ATTACK_NERBY) + self.named_actions.append(ACTION_ATTACK_ENEMY_BASE) + self.named_actions.append(ACTION_ATTACK_ENEMY_SECOND_BASE) + + self.action_indices = [idx for idx in range(len(self.named_actions))] + + self.building_positions = { + 'command_center': [[19, 23], [41, 21]], + 'supply_depot': [[16, 27], [18, 27], [20, 27], [22, 27], [16, 29], [18, 29], [20, 29]], + 'barracks': [[25, 18], [24, 20], [30, 24]], + } + + self.building_amounts = { + 'command_center': 2, + 'supply_depot': 7, + 'barracks': 3, + } + + class ProtossWrapper(SC2Wrapper): def __init__(self): SC2Wrapper.__init__(self) # Imports self variables from SC2Wrapper diff --git a/urnai/agents/rewards/sc2.py b/urnai/agents/rewards/sc2.py index 5cc0b1e6..6c4f0037 100644 --- a/urnai/agents/rewards/sc2.py +++ b/urnai/agents/rewards/sc2.py @@ -184,6 +184,69 @@ def get_reward(self, obs, reward, done): return new_reward +class KilledUnitsRewardImproved(RewardBuilder): + def __init__(self): + + self.KILLED_UNIT_SCORE = 0.015 + self.KILLED_BUIDING_SCORE = 0.02 + self.PENALTY_DEAD_ALLY = -0.01 + + # Properties keep track of the change of values used in our reward system + 
self._previous_killed_unit_score = 0 + self._previous_killed_building_score = 0 + self._previous_army_count = 0 + + # When the episode is over, the values we use to compute our reward should be reset. + def reset(self): + self._previous_killed_unit_score = 0 + self._previous_killed_building_score = 0 + self._previous_army_count = 0 + + def get_reward(self, obs, reward, done): + new_reward = 0 + + # Rewards + if((obs.score_cumulative.killed_value_units - self._previous_killed_unit_score) > 0): + new_reward += self.KILLED_UNIT_SCORE + if((obs.score_cumulative.killed_value_structures - + self._previous_killed_building_score) > 0): + new_reward += self.KILLED_BUIDING_SCORE + # Penalties + if(obs.player.army_count < self._previous_army_count): + new_reward += self.PENALTY_DEAD_ALLY + + self._previous_killed_unit_score = obs.score_cumulative.killed_value_units + self._previous_killed_building_score = obs.score_cumulative.killed_value_structures + self._previous_army_count = obs.player.army_count + + if done: + self.reset() + + if reward == 1: + new_reward = 10 + if reward == -1: + new_reward = -10 + + return new_reward + + +class TStarBotReward(RewardBuilder): + """ + A sparse reward function based on the TStarbot article + see more in: https://arxiv.org/pdf/1809.07193.pdf + """ + + def get_reward(self, obs, reward, done): + new_reward = 0 + + if reward == 1: + new_reward = 1 + elif reward == -1: + new_reward = -1 + + return new_reward + + """ Ideas for new reward builders or improvements for current ones: diff --git a/urnai/models/algorithms/ddql.py b/urnai/models/algorithms/ddql.py index 4a7e7998..64e756d9 100644 --- a/urnai/models/algorithms/ddql.py +++ b/urnai/models/algorithms/ddql.py @@ -129,6 +129,7 @@ def __init__( neural_net_class=None, epsilon_linear_decay=False, lr_linear_decay=False, + epsilon_decay_ep_start=0, ): super().__init__( action_wrapper, @@ -154,6 +155,7 @@ def __init__( neural_net_class, epsilon_linear_decay, lr_linear_decay, + epsilon_decay_ep_start, ) self.target_update_counter = 0 diff --git a/urnai/models/algorithms/dql.py b/urnai/models/algorithms/dql.py index 864ad055..d7c70f11 100644 --- a/urnai/models/algorithms/dql.py +++ b/urnai/models/algorithms/dql.py @@ -122,6 +122,7 @@ def __init__( neural_net_class=None, epsilon_linear_decay=False, lr_linear_decay=False, + epsilon_decay_ep_start=0, ): super().__init__( action_wrapper, @@ -140,6 +141,7 @@ def __init__( cpu_only, epsilon_linear_decay, lr_linear_decay, + epsilon_decay_ep_start, ) self.batch_size = batch_size diff --git a/urnai/models/base/abmodel.py b/urnai/models/base/abmodel.py index 92148872..a518c37a 100644 --- a/urnai/models/base/abmodel.py +++ b/urnai/models/base/abmodel.py @@ -57,7 +57,7 @@ def __init__(self, action_wrapper: ActionWrapper, state_builder: StateBuilder, g epsilon_start, epsilon_min, epsilon_decay_rate, per_episode_epsilon_decay=False, learning_rate_decay_ep_cutoff=0, name=None, seed_value=None, cpu_only=False, epsilon_linear_decay=False, - lr_linear_decay=False): + lr_linear_decay=False, epsilon_decay_ep_start=0): super(LearningModel, self).__init__() self.seed_value = seed_value @@ -84,6 +84,7 @@ def __init__(self, action_wrapper: ActionWrapper, state_builder: StateBuilder, g self.epsilon_decay_rate = epsilon_decay_rate self.per_episode_epsilon_decay = per_episode_epsilon_decay self.epsilon_linear_decay = epsilon_linear_decay + self.epsilon_decay_ep_start = epsilon_decay_ep_start # self.tensorboard_callback_logdir = '' self.tensorboard_callback = None @@ -143,7 +144,7 @@ def 
ep_reset(self, episode=0): This method is mainly used to enact the decay_epsilon and decay_lr at the end of every episode. """ - if self.per_episode_epsilon_decay: + if self.per_episode_epsilon_decay and episode >= self.epsilon_decay_ep_start: self.decay_epsilon() if episode > self.learning_rate_decay_ep_cutoff and self.learning_rate_decay != 1:
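
The first commit boils down to the standard PyTorch persistence pattern: write the network's state_dict to a '.pt' file and restore it into a freshly built network on load. A minimal standalone sketch of that pattern follows; the Sequential model, the temporary directory, and the 'deepqnetwork.pt' file name are placeholders for illustration, not URNAI code.

    import os
    import tempfile

    import torch
    import torch.nn as nn


    def build_layers():
        # Stand-in for the layers a DeepQNetwork would build from ModelBuilder's layout.
        return nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))


    persist_path = tempfile.mkdtemp()
    pt_file = persist_path + os.path.sep + 'deepqnetwork.pt'  # hypothetical save stamp

    # save_extra equivalent: persist only the layer weights.
    layers = build_layers()
    torch.save(layers.state_dict(), pt_file)

    # load_extra equivalent: rebuild the network, then restore the saved weights.
    if os.path.isfile(pt_file):
        restored = build_layers()
        restored.load_state_dict(torch.load(pt_file))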
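
KilledUnitsRewardImproved shapes the sparse win/loss signal with small per-step bonuses and penalties driven by deltas between consecutive observations. A rough worked example of that arithmetic, using fake observation values rather than real pysc2 objects (the score constants mirror the patch; the numbers are made up):

    from types import SimpleNamespace

    KILLED_UNIT_SCORE = 0.015
    KILLED_BUILDING_SCORE = 0.02
    PENALTY_DEAD_ALLY = -0.01

    prev = SimpleNamespace(killed_value_units=0, killed_value_structures=0, army_count=12)
    curr = SimpleNamespace(killed_value_units=400, killed_value_structures=0, army_count=10)

    step_reward = 0.0
    if curr.killed_value_units > prev.killed_value_units:
        step_reward += KILLED_UNIT_SCORE        # killed at least one enemy unit this step
    if curr.killed_value_structures > prev.killed_value_structures:
        step_reward += KILLED_BUILDING_SCORE    # no structures destroyed in this example
    if curr.army_count < prev.army_count:
        step_reward += PENALTY_DEAD_ALLY        # lost own units this step

    print(step_reward)  # 0.015 - 0.01 = 0.005; a terminal win/loss still overrides with +10/-10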
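
The epsilon_decay_ep_start parameter threaded through dql.py, ddql.py, and abmodel.py simply delays per-episode exploration decay until a configured episode. A self-contained sketch of that guard is below; the class, its default values, and the multiplicative decay rule are illustrative (URNAI also supports linear decay), and only the attribute names mirror the patch:

    class EpsilonScheduleSketch:
        def __init__(self, epsilon_start=0.8, epsilon_min=0.05,
                     epsilon_decay_rate=0.999, epsilon_decay_ep_start=200):
            self.epsilon = epsilon_start
            self.epsilon_min = epsilon_min
            self.epsilon_decay_rate = epsilon_decay_rate
            self.epsilon_decay_ep_start = epsilon_decay_ep_start
            self.per_episode_epsilon_decay = True

        def decay_epsilon(self):
            # Multiplicative decay, clamped at epsilon_min.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay_rate)

        def ep_reset(self, episode=0):
            # Mirrors the guard added to LearningModel.ep_reset:
            # no decay before episode epsilon_decay_ep_start.
            if self.per_episode_epsilon_decay and episode >= self.epsilon_decay_ep_start:
                self.decay_epsilon()


    sched = EpsilonScheduleSketch()
    for ep in range(400):
        sched.ep_reset(episode=ep)
    print(round(sched.epsilon, 4))  # decayed only on episodes 200..399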