Merge pull request #49 from UFRN-URNAI/feat-agent-improvement
feat: Agent Improvement
matheusslr authored Feb 13, 2023
2 parents 2c6927d + 423f61e commit 80f99c0
Showing 8 changed files with 248 additions and 6 deletions.
58 changes: 58 additions & 0 deletions experiments/solves/experiments/solve_simple64_sm_gridstate.py
@@ -0,0 +1,58 @@
import sys, pathlib

sys.path.insert(0, str(pathlib.Path(__file__).parent.parent.parent.parent))

from absl import app

from urnai.envs.sc2 import SC2Env
from urnai.trainers.trainer import Trainer
from urnai.agents.sc2_agent import SC2Agent
from urnai.agents.actions.sc2_wrapper import SimpleTerranWrapper, SimpleMarineWrapper
from urnai.agents.rewards.sc2 import KilledUnitsReward, KilledUnitsRewardImproved, TStarBotReward
from urnai.agents.states.sc2 import Simple64GridState
from urnai.models.model_builder import ModelBuilder
from urnai.models.algorithms.ddql import DoubleDeepQLearning

def declare_trainer():
state_builder = Simple64GridState(grid_size=4)
action_wrapper = SimpleMarineWrapper()

env = SC2Env(map_name="Simple64", render=False, self_play=False,
step_mul=16, realtime=False, player_race="terran",
enemy_race="random", difficulty="very_easy")

helper = ModelBuilder()
helper.add_input_layer(100)
helper.add_fullyconn_layer(50)
helper.add_output_layer()

dq_network = DoubleDeepQLearning(lib="pytorch", action_wrapper=action_wrapper,
state_builder=state_builder, build_model=helper.get_model_layout(),
epsilon_start=0.8, epsilon_decay=0.999, epsilon_linear_decay=True,
per_episode_epsilon_decay=True, use_memory=True,
epsilon_min=0.05, epsilon_decay_ep_start=200,
learning_rate=0.00013, min_memory_size=50000, memory_maxlen=120000)

agent = SC2Agent(dq_network, KilledUnitsRewardImproved())

trainer = Trainer(env, agent, save_path='urnai/models/saved/sm_killed_reward_improved/',
file_name="sm_killed_reward_vs_random_lr_04",
save_every=100, enable_save=False, relative_path=True,
max_training_episodes=5000, max_steps_training=1200,
max_test_episodes=200, max_steps_testing=1200)

return trainer


def main(unused_argv):
try:
trainer = declare_trainer()
trainer.train()
trainer.play()

except KeyboardInterrupt:
pass


if __name__ == '__main__':
app.run(main)
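
The ModelBuilder calls above only describe layer sizes. As a rough mental model (not the ModelBuilder API itself), and assuming a ReLU activation and an output layer sized to the wrapper's action space, the resulting network looks roughly like this:

import torch.nn as nn

# 11 named actions in SimpleMarineWrapper when the attack grid is disabled (see the
# wrapper further down in this diff); the activation choice is an assumption here.
n_actions = 11

q_network = nn.Sequential(
    nn.Linear(100, 50),        # add_input_layer(100) -> add_fullyconn_layer(50)
    nn.ReLU(),
    nn.Linear(50, n_actions),  # add_output_layer(), one Q-value per action
)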
102 changes: 98 additions & 4 deletions urnai/agents/actions/sc2_wrapper.py
@@ -4,15 +4,16 @@

import numpy as np
from pysc2.env import sc2_env
from pysc2.lib import units
from pysc2.lib import features, units
# importing our action set file so that we can use its constants
import urnai.agents.actions.sc2 as sc2
from urnai.agents.actions.sc2 import attack_distribute_army, attack_target_point, \
attack_target_point_spatial, build_gas_structure_raw_unit, build_structure_raw, \
build_structure_raw_pt, build_structure_raw_pt_spatial, can_queue_unit_terran, effect_units, \
get_all_idle_workers, get_free_supply, get_units_by_type, harvest_gather_gas, \
harvest_gather_minerals, harvest_gather_minerals_idle, move_target_point_spatial, no_op, \
organize_queue, research_upgrade, select_army, train_unit, unit_exists
get_all_idle_workers, get_euclidean_distance, get_free_supply, get_units_by_type, \
harvest_gather_gas, harvest_gather_minerals, harvest_gather_minerals_idle, \
move_target_point_spatial, no_op, organize_queue, research_upgrade, select_army, train_unit, \
unit_exists

from .base.abwrapper import ActionWrapper

@@ -107,13 +108,16 @@
ACTION_ATTACK_MY_BASE = 'attackmybase'
ACTION_ATTACK_MY_SECOND_BASE = 'attackmysecondbase'
ACTION_ATTACK_DISTRIBUTE_ARMY = 'attackdistributearmy'
ACTION_ATTACK_NERBY = 'attacknerby'

# Selects random idle scv > sends him to harvest minerals
ACTION_HARVEST_MINERALS_IDLE = 'harvestmineralsidle'
ACTION_HARVEST_MINERALS_FROM_GAS = 'harvestmineralsfromgas'
ACTION_HARVEST_GAS_FROM_MINERALS = 'harvestgasfromminerals'

ACTION_ATTACK_POINT = 'attackpoint'
ACTION_MOVE_TROOPS_POINT = 'movetroopspoint'
ACTION_SEND_SCOUT = 'sendscout'


class SC2Wrapper(ActionWrapper):
@@ -746,6 +750,17 @@ def harvestgasfromminerals_exclude(excluded_actions, gi):
if not gi.has_scv or not gi.has_refinery:
excluded_actions.append(ACTION_HARVEST_GAS_FROM_MINERALS)

def attacknerby_exclude(excluded_actions, gi):
if not gi.has_marinemarauder or not gi.has_barracks:
excluded_actions.append(ACTION_ATTACK_NERBY)

def attackenemybase_exclude(excluded_actions, gi):
if not gi.has_marinemarauder or not gi.has_barracks:
excluded_actions.append(ACTION_ATTACK_ENEMY_BASE)

def attackenemysecondbase_exclude(excluded_actions, gi):
if not gi.has_marinemarauder or not gi.has_barracks:
excluded_actions.append(ACTION_ATTACK_ENEMY_SECOND_BASE)
# endregion

# region LIST OF ACTIONS
@@ -1236,6 +1251,29 @@ def attackenemysecondbase(self, obs):
action, self.actions_queue = organize_queue(actions, self.actions_queue)
return action

def sendscout(self, obs):
target = self.enemy_base_xy
if not self.base_top_left:
target = (63 - target[0] - 5, 63 - target[1] + 5)
idle_worker = get_all_idle_workers(obs, sc2_env.Race.terran)
if idle_worker != sc2._NO_UNITS:
actions = attack_target_point_spatial(idle_worker, target)
action, self.actions_queue = organize_queue(actions, self.actions_queue)
return action
return no_op()

def attacknerby(self, obs):
targets = [unit for unit in obs.raw_units
if unit.alliance == features.PlayerRelative.ENEMY]
troops = select_army(obs, sc2_env.Race.terran)
for troop in troops:
for target in targets:
if get_euclidean_distance([troop.x, troop.y], [target.x, target.y]) < 10:
actions = attack_target_point_spatial(troops, [target.x, target.y])
action, self.actions_queue = organize_queue(actions, self.actions_queue)
return action
return no_op()

# endregion
# endregion

@@ -1449,6 +1487,62 @@ def __init__(self, use_atk_grid=False, atk_grid_x=4, atk_grid_y=4):
}


class SimpleMarineWrapper(TerranWrapper):
def __init__(self, use_atk_grid=False, atk_grid_x=4, atk_grid_y=4):
TerranWrapper.__init__(self)

self.use_atk_grid = use_atk_grid
self.atk_grid_x = int(atk_grid_x)
self.atk_grid_y = int(atk_grid_y)

self.named_actions = [
ACTION_DO_NOTHING,

# BUILDING
ACTION_BUILD_COMMAND_CENTER,
ACTION_BUILD_BARRACKS,
ACTION_BUILD_SUPPLY_DEPOT,

# TRAINS
ACTION_TRAIN_SCV,
ACTION_TRAIN_MARINE,

# SCOUT
ACTION_SEND_SCOUT,

# HARVEST
ACTION_HARVEST_MINERALS_FROM_GAS,
]

if self.use_atk_grid:
xgridsize = 64/self.atk_grid_x
ygridsize = 64/self.atk_grid_y

for i in range(self.atk_grid_x):
for j in range(self.atk_grid_y):
x = xgridsize * (i + 1) - (xgridsize / 2)
y = ygridsize * (j + 1) - (ygridsize / 2)
self.named_actions.append(ACTION_ATTACK_POINT + '_' + str(x) + '_' + str(y))
else:
self.named_actions.append(ACTION_ATTACK_NERBY)
self.named_actions.append(ACTION_ATTACK_ENEMY_BASE)
self.named_actions.append(ACTION_ATTACK_ENEMY_SECOND_BASE)

self.action_indices = [idx for idx in range(len(self.named_actions))]

self.building_positions = {
'command_center': [[19, 23], [41, 21]],
'supply_depot': [[16, 27], [18, 27], [20, 27], [22, 27], [16, 29], [18, 29], [20, 29]],
'barracks': [[25, 18], [24, 20], [30, 24]],
}

self.building_amounts = {
'command_center': 2,
'supply_depot': 7,
'barracks': 3,
}


class ProtossWrapper(SC2Wrapper):
def __init__(self):
SC2Wrapper.__init__(self) # Imports self variables from SC2Wrapper
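
attacknerby depends on get_euclidean_distance, which is newly imported above but whose body is not part of this diff. A minimal sketch of the check it presumably performs, assuming the helper returns the straight-line distance between two [x, y] positions:

import math

def get_euclidean_distance(unit_xy, target_xy):
    # Assumed behaviour: straight-line distance between two [x, y] map positions.
    return math.hypot(unit_xy[0] - target_xy[0], unit_xy[1] - target_xy[1])

# attacknerby only queues an attack when an enemy is closer than 10 map cells:
print(get_euclidean_distance([10, 10], [16, 18]) < 10)  # 10.0 -> False, not "nearby"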
63 changes: 63 additions & 0 deletions urnai/agents/rewards/sc2.py
@@ -184,6 +184,69 @@ def get_reward(self, obs, reward, done):
return new_reward


class KilledUnitsRewardImproved(RewardBuilder):
def __init__(self):

self.KILLED_UNIT_SCORE = 0.015
self.KILLED_BUIDING_SCORE = 0.02
self.PENALTY_DEAD_ALLY = -0.01

# These properties keep track of changes in the values used by our reward system
self._previous_killed_unit_score = 0
self._previous_killed_building_score = 0
self._previous_army_count = 0

# When the episode is over, the values we use to compute our reward should be reset.
def reset(self):
self._previous_killed_unit_score = 0
self._previous_killed_building_score = 0
self._previous_army_count = 0

def get_reward(self, obs, reward, done):
new_reward = 0

# Rewards
if((obs.score_cumulative.killed_value_units - self._previous_killed_unit_score) > 0):
new_reward += self.KILLED_UNIT_SCORE
if((obs.score_cumulative.killed_value_structures -
self._previous_killed_building_score) > 0):
new_reward += self.KILLED_BUIDING_SCORE
# Penalties
if(obs.player.army_count < self._previous_army_count):
new_reward += self.PENALTY_DEAD_ALLY

self._previous_killed_unit_score = obs.score_cumulative.killed_value_units
self._previous_killed_building_score = obs.score_cumulative.killed_value_structures
self._previous_army_count = obs.player.army_count

if done:
self.reset()

if reward == 1:
new_reward = 10
if reward == -1:
new_reward = -10

return new_reward
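
As a rough usage sketch of the shaping above (using a fabricated observation object, not a real PySC2 one), a step in which one enemy unit is killed and one allied unit is lost nets 0.015 - 0.01 = 0.005, while a terminal win or loss overrides everything with ±10:

from types import SimpleNamespace

reward_builder = KilledUnitsRewardImproved()
reward_builder._previous_army_count = 12  # pretend we had 12 army units last step

fake_obs = SimpleNamespace(
    score_cumulative=SimpleNamespace(killed_value_units=100, killed_value_structures=0),
    player=SimpleNamespace(army_count=11),
)

step_reward = reward_builder.get_reward(fake_obs, reward=0, done=False)
# 0.015 (enemy unit killed) - 0.01 (allied unit lost) = 0.005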


class TStarBotReward(RewardBuilder):
"""
A sparse reward function based on the TStarBot article.
See more at: https://arxiv.org/pdf/1809.07193.pdf
"""

def get_reward(self, obs, reward, done):
new_reward = 0

if reward == 1:
new_reward = 1
elif reward == -1:
new_reward = -1

return new_reward


"""
Ideas for new reward builders or improvements for current ones:
4 changes: 4 additions & 0 deletions urnai/base/savable.py
@@ -95,6 +95,10 @@ def get_full_persistance_path(self, persist_path):
"""This method returns the default persistance path."""
return persist_path + os.path.sep + self.get_default_save_stamp()

def get_full_persistance_pytorch_path(self, persist_path):
"""This method returns the default persistance pytorch path."""
return persist_path + os.path.sep + self.get_default_save_stamp() + '.pt'

def save(self, savepath):
"""
This method saves pickle objects
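
For illustration, assuming a hypothetical save stamp, the new helper simply appends a .pt suffix to the usual persistence path:

import os

persist_path = 'urnai/models/saved/sm_killed_reward_improved'
save_stamp = 'DoubleDeepQLearning'  # hypothetical get_default_save_stamp() value

pytorch_path = persist_path + os.path.sep + save_stamp + '.pt'
# e.g. 'urnai/models/saved/sm_killed_reward_improved/DoubleDeepQLearning.pt'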
2 changes: 2 additions & 0 deletions urnai/models/algorithms/ddql.py
@@ -129,6 +129,7 @@ def __init__(
neural_net_class=None,
epsilon_linear_decay=False,
lr_linear_decay=False,
epsilon_decay_ep_start=0,
):
super().__init__(
action_wrapper,
@@ -154,6 +155,7 @@
neural_net_class,
epsilon_linear_decay,
lr_linear_decay,
epsilon_decay_ep_start,
)

self.target_update_counter = 0
2 changes: 2 additions & 0 deletions urnai/models/algorithms/dql.py
@@ -122,6 +122,7 @@ def __init__(
neural_net_class=None,
epsilon_linear_decay=False,
lr_linear_decay=False,
epsilon_decay_ep_start=0,
):
super().__init__(
action_wrapper,
@@ -140,6 +141,7 @@
cpu_only,
epsilon_linear_decay,
lr_linear_decay,
epsilon_decay_ep_start,
)

self.batch_size = batch_size
5 changes: 3 additions & 2 deletions urnai/models/base/abmodel.py
@@ -57,7 +57,7 @@ def __init__(self, action_wrapper: ActionWrapper, state_builder: StateBuilder, g
epsilon_start, epsilon_min, epsilon_decay_rate, per_episode_epsilon_decay=False,
learning_rate_decay_ep_cutoff=0,
name=None, seed_value=None, cpu_only=False, epsilon_linear_decay=False,
lr_linear_decay=False):
lr_linear_decay=False, epsilon_decay_ep_start=0):
super(LearningModel, self).__init__()

self.seed_value = seed_value
@@ -84,6 +84,7 @@ def __init__(self, action_wrapper: ActionWrapper, state_builder: StateBuilder, g
self.epsilon_decay_rate = epsilon_decay_rate
self.per_episode_epsilon_decay = per_episode_epsilon_decay
self.epsilon_linear_decay = epsilon_linear_decay
self.epsilon_decay_ep_start = epsilon_decay_ep_start

# self.tensorboard_callback_logdir = ''
self.tensorboard_callback = None
@@ -143,7 +144,7 @@ def ep_reset(self, episode=0):
This method is mainly used to enact the decay_epsilon and decay_lr
at the end of every episode.
"""
if self.per_episode_epsilon_decay:
if self.per_episode_epsilon_decay and episode >= self.epsilon_decay_ep_start:
self.decay_epsilon()

if episode > self.learning_rate_decay_ep_cutoff and self.learning_rate_decay != 1:
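
The new epsilon_decay_ep_start gate in ep_reset simply freezes exploration decay for the first N episodes. A minimal sketch of the resulting schedule, using the values from the new experiment script (epsilon_start=0.8, epsilon_min=0.05, epsilon_decay=0.999, epsilon_decay_ep_start=200) and a simple multiplicative decay for illustration (the script itself sets epsilon_linear_decay=True):

def epsilon_schedule(n_episodes, start=0.8, minimum=0.05, decay=0.999, decay_ep_start=200):
    """Per-episode epsilon values; decay only begins once decay_ep_start is reached."""
    eps, history = start, []
    for episode in range(n_episodes):
        history.append(eps)
        if episode >= decay_ep_start:
            eps = max(minimum, eps * decay)
    return history

schedule = epsilon_schedule(5000)
# schedule[0] == schedule[200] == 0.8; the value only starts shrinking afterwards.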
18 changes: 18 additions & 0 deletions urnai/models/memory_representations/neural_network/pytorch.py
@@ -1,3 +1,5 @@
import os

import numpy as np
import torch
import torch.nn as nn
@@ -120,6 +122,22 @@ def create_base_model(self):
model = self.SubDeepQNetwork()
return model

def save_extra(self, persist_path):
torch.save(
self.model.model_layers.state_dict(),
self.get_full_persistance_pytorch_path(persist_path),
)

def load_extra(self, persist_path):
exists = os.path.isfile(self.get_full_persistance_pytorch_path(persist_path))

if exists:
self.__init__(self.action_output_size, self.state_input_shape, self.build_model,
self.gamma, self.alpha, self.seed, self.batch_size)
self.model.model_layers.load_state_dict(torch.load(
self.get_full_persistance_pytorch_path(persist_path),
))

def copy_model_weights(self, model_to_copy):
self.model.load_state_dict(model_to_copy.model.state_dict())

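
save_extra and load_extra persist only the network weights through a state_dict. A standalone sketch of the same pattern (hypothetical path and layer sizes, not the URNAI wrappers themselves):

import torch
import torch.nn as nn

model_layers = nn.Sequential(nn.Linear(100, 50), nn.ReLU(), nn.Linear(50, 11))

# Saving: only the parameters are written to disk, mirroring save_extra above.
torch.save(model_layers.state_dict(), 'example_model.pt')

# Loading: the architecture has to be rebuilt first, then the weights restored,
# which is why load_extra re-runs __init__ before calling load_state_dict.
restored = nn.Sequential(nn.Linear(100, 50), nn.ReLU(), nn.Linear(50, 11))
restored.load_state_dict(torch.load('example_model.pt'))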
