Training #1

Open · wants to merge 3 commits into base: main
385 changes: 267 additions & 118 deletions FireMap.py

Large diffs are not rendered by default.

Binary file added PPO_Agent_Misc/Agent_weights_actor
Binary file not shown.
Binary file added PPO_Agent_Misc/Agent_weights_critic
Binary file not shown.
116 changes: 96 additions & 20 deletions PPO_Agent_Misc/PPOContinuous.py
@@ -6,10 +6,19 @@
import torch.optim as optim
from torch.distributions.categorical import Categorical
import time
import h5py

device = T.device("cuda" if T.cuda.is_available() else "cpu")
T.autograd.set_detect_anomaly(True)

class relu30(nn.Module):
def __init__(self):
super(relu30, self).__init__()

def forward(self, x):
return T.min(T.max(T.tensor(0), x), T.tensor(30))
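For reference, the capped activation above clips values to the range [0, 30]; a minimal standalone sketch (not part of this PR) showing the same behaviour with T.clamp:

import torch as T

x = T.tensor([-5.0, 0.5, 12.0, 42.0])     # hypothetical pre-activation values
capped = T.clamp(x, min=0.0, max=30.0)    # min(max(0, x), 30), i.e. a ReLU capped at 30
print(capped)                             # tensor([ 0.0000,  0.5000, 12.0000, 30.0000])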


class PPOMemory:
def __init__(self, batch_size):
# we keep memory in plain Python lists
@@ -45,6 +54,7 @@ def get_memory_batch(self):

# retrieves batch_size memories
states_T = T.stack(self.states[:self.batch_size]).to(device)
states_T = T.squeeze(states_T, dim=1)
act_logprob_tens = T.stack(self.logprobs[:self.batch_size]).to(device)
adv_tensor = T.tensor(self.adv[:self.batch_size]).to(device)
vals_tens = T.tensor(self.vals[:self.batch_size], dtype=T.float64).to(device)
@@ -76,29 +86,38 @@ class ActorModel(nn.Module):
this exact same thing. The code will look similar.

'''
def __init__(self, input_shape, n_actions,
def __init__(self, input_shape, n_actions, c2,
min_tens = T.tensor((-1, 1)), max_tens = T.tensor((-1, 1))):
super(ActorModel, self).__init__()

self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1)
self.conv2 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=2, padding=1)
self.flat = nn.Flatten()

# base model
self.fc1 = nn.Linear(in_features=input_shape, out_features=64).to(device)
self.fc2 = nn.Linear(in_features=64, out_features=64).to(device)
self.fc1 = nn.Linear(in_features=input_shape, out_features=265).to(device)
self.fc2 = nn.Linear(in_features=265, out_features=265).to(device)

# distributions
self.mean = nn.Linear(in_features=64, out_features=n_actions).to(device)
self.mean = nn.Linear(in_features=265, out_features=n_actions).to(device)

# Constant variance
self.c2 = c2
self.var = T.diag(T.ones(n_actions)).to(device)*0.5

# misc
self.min_tens = min_tens
self.max_tens = max_tens
self.relu30 = relu30()

def forward(self, x):
''' Compute the mean used to parameterize the action distribution. '''

x = F.tanh(self.conv1(x))
x = F.tanh(self.conv2(x))
x = self.flat(x)
# base computation
x = F.tanh(self.fc1(x)).to(device)
x = F.tanh(self.fc1(x).to(device))
x = F.tanh(self.fc2(x)).to(device)

# we note the mean value goes through a tanh as it corresponds with the
@@ -107,20 +126,25 @@ def forward(self, x):
# For our environment, depending on how we represent the raw actions,
# the values can be positive or negative (for example, if we center
# coordinates in the middle of the board).
mean = self.mean(x).to(device)
mean = F.tanh(self.mean(x)).to(device)
#activation = self.relu30(mean)

return mean
return T.squeeze(mean)

def get_action_logprob(self, x):
mean = self.forward(x)

calc_mean = mean.to(device)
#print(f'{calc_mean = }')

mean = T.clamp(mean, 0, 29)

# sample an action from the distribution
dist = T.distributions.MultivariateNormal(calc_mean, self.var)

action = dist.sample()
logprob = dist.log_prob(action)


return action.to(device), logprob, mean.to(device)
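As an illustration of the constant-variance Gaussian policy used by get_action_logprob, a self-contained sketch with made-up values (the real mean comes from the network):

import torch as T

n_actions = 2
mean = T.tensor([3.0, 7.5])                   # hypothetical network output
var = T.diag(T.ones(n_actions)) * 0.5         # fixed diagonal covariance, as in ActorModel
dist = T.distributions.MultivariateNormal(mean, var)
action = dist.sample()                        # one continuous action vector
logprob = dist.log_prob(action)               # scalar log-probability of the sampled action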

@@ -131,26 +155,46 @@ def calc_action_logprob_entropy(self, x, action):
dist = T.distributions.MultivariateNormal(calc_mean, self.var)

logprob = dist.log_prob(action)
ent = dist.entropy()

return logprob
return logprob, ent

def calc_c2(self):
new_c2 = self.c2*0.95
self.c2 = max(new_c2, 0.5)

class CriticModel(nn.Module):
def __init__(self, input_shape):
super(CriticModel, self).__init__()

self.fc1 = nn.Linear(input_shape, 64).to(device)
self.fc2 = nn.Linear(64, 64).to(device)
self.output = nn.Linear(64, 1).to(device)
self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1)
self.conv2 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=2, padding=1)
self.flat = nn.Flatten()

self.fc1 = nn.Linear(input_shape, 265).to(device)
self.fc2 = nn.Linear(265, 265).to(device)
self.output = nn.Linear(265, 1).to(device)

def forward(self, x):
'''
Overrides the module's forward pass.
'''
x = F.tanh(self.conv1(x))
x = F.tanh(self.conv2(x))
x = self.flat(x)
x = F.tanh(self.fc1(x)).to(device)
x = F.tanh(self.fc2(x)).to(device)
x = F.tanh(self.output(x)).to(device)
x = self.output(x).to(device)

return T.squeeze(x)

def save(self, save_to_path: str):
save_to_path += '_critic'
T.save(self.state_dict(), save_to_path)

return x
def load(self, load_to_path):
self.load_state_dict(T.load(load_to_path))

class Agent(nn.Module):
# An interesting note - implementations exist where actor and critic share
@@ -175,7 +219,7 @@ def __init__(self, n_actions, c1, c2, input_dims, action_min, action_max, gamma=
self.n_actions = n_actions

# --- Actor Critic ---
self.actor = ActorModel(input_dims, n_actions).float().to(device)
self.actor = ActorModel(input_dims, n_actions, 2).float().to(device)
self.optimizer_actor = T.optim.Adam(self.actor.parameters(), LR)

self.critic = CriticModel(input_dims).float().to(device)
@@ -188,8 +232,8 @@ def __init__(self, n_actions, c1, c2, input_dims, action_min, action_max, gamma=
self.criterion = nn.MSELoss()
self.annealing = annealing
if annealing == True:
self.anneal_lr_actor = T.optim.lr_scheduler.StepLR(self.optimizer_actor, buffer_size*5, gamma=0.3)
self.anneal_lr_critic = T.optim.lr_scheduler.StepLR(self.optimizer_critic, buffer_size*5, gamma=0.3)
self.anneal_lr_actor = T.optim.lr_scheduler.StepLR(self.optimizer_actor, self.buffer_size, gamma=0.95)
self.anneal_lr_critic = T.optim.lr_scheduler.StepLR(self.optimizer_critic, self.buffer_size, gamma=0.95)

self.device = T.device("cuda" if T.cuda.is_available() else "cpu")
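For context, the StepLR schedulers above multiply the learning rate by 0.95 once every buffer_size scheduler steps; a tiny standalone sketch with hypothetical sizes:

import torch as T

params = [T.nn.Parameter(T.zeros(1))]
opt = T.optim.Adam(params, lr=1e-3)
sched = T.optim.lr_scheduler.StepLR(opt, step_size=512, gamma=0.95)  # e.g. buffer_size = 512

for step in range(1024):
    opt.step()
    sched.step()  # after 512 steps lr = 1e-3 * 0.95, after 1024 steps lr = 1e-3 * 0.95**2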

@@ -262,6 +306,11 @@ def advantage_and_return(self, rewards, values, not_dones):

return advantages.clone().detach(), returns.clone().detach()

def calculate_boundary_penalty(self, action_position: T.tensor) -> T.tensor:
left = np.clip(action_position, a_min=-np.inf,a_max=0)
right = np.clip(action_position-30,a_min=0,a_max=np.inf)
return T.tensor(np.sum(np.max(np.vstack([abs(left),abs(right)]),axis=0))).to(device)
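A small worked example of the boundary penalty above, using a hypothetical raw action and the [0, 30] board range implied by the clipping constants:

import numpy as np

action_position = np.array([-2.0, 35.0, 10.0])                  # hypothetical raw action
left = np.clip(action_position, a_min=-np.inf, a_max=0)         # [-2.,  0.,  0.]  distance below 0
right = np.clip(action_position - 30, a_min=0, a_max=np.inf)    # [ 0.,  5.,  0.]  distance above 30
penalty = np.sum(np.max(np.vstack([abs(left), abs(right)]), axis=0))
print(penalty)                                                  # 7.0 = 2 (below range) + 5 (above range)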

def learn(self):
'''
Iterates over the entire buffer and trains on minibatches.
@@ -289,7 +338,9 @@ def learn(self):


# get logprob of action and entropy
new_logprobs = self.actor.calc_action_logprob_entropy(state_tens, act_tens)
new_logprobs, entropy = self.actor.calc_action_logprob_entropy(state_tens, act_tens)

entropy = T.mean(entropy)

# Get probability ratio
prob_ratios = T.exp(new_logprobs - logprob_tens).to(device)
@@ -329,7 +380,6 @@ def learn(self):

# --- Total Loss ---


# Apply Advantages to Policy Loss
policy_loss = adv_tensor*policy_loss
policy_loss = T.mean(policy_loss).to(device)
@@ -342,7 +392,7 @@ def learn(self):
#crit_loss = T.max(crit_loss, crit_loss_clipped).to(device)
crit_loss = self.c1*crit_loss.float().to(device)

loss = -policy_loss + crit_loss # - self.c2*entropy_loss
loss = -policy_loss + crit_loss #- self.c2*entropy


# --- Backpropagate ---
@@ -364,13 +414,39 @@ def learn(self):
total_pol_loss += policy_loss.detach().numpy()
total_critic_loss += crit_loss.detach().numpy()
total_loss += loss.detach().numpy()

self.actor.calc_c2()


return total_pol_loss, total_critic_loss, total_loss
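The visible lines weight the policy term by the advantages and negate it; the clipping of prob_ratios happens in a part of learn() this hunk does not show. For reference, a standalone sketch of the standard PPO clipped surrogate (illustrative tensors, not the PR's exact code):

import torch as T

policy_clip = 0.2
prob_ratios = T.tensor([0.8, 1.05, 1.4])      # pi_new(a|s) / pi_old(a|s), hypothetical
adv_tensor = T.tensor([1.0, -0.5, 2.0])       # advantages, hypothetical

unclipped = prob_ratios * adv_tensor
clipped = T.clamp(prob_ratios, 1 - policy_clip, 1 + policy_clip) * adv_tensor
policy_objective = T.mean(T.min(unclipped, clipped))
loss = -policy_objective                      # maximized objective becomes a minimized loss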

def save(self, save_to):
save_to_critic = save_to + '_critic'
T.save(self.critic.state_dict(), save_to_critic)

save_to_actor = save_to + '_actor'
T.save(self.actor.state_dict(), save_to_actor)

def load(self, load_actor, load_critic):
# Restore both networks from their saved state dicts.
self.actor.load_state_dict(T.load(load_actor))
self.critic.load_state_dict(T.load(load_critic))
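A brief usage sketch of the save/load hooks above; the path prefix is an assumption chosen to match the Agent_weights_actor / Agent_weights_critic files added in this PR, and PPO_Agent stands for an Agent instance like the one built in toy_env.py:

# Writes <prefix>_actor and <prefix>_critic (prefix is assumed here).
PPO_Agent.save('PPO_Agent_Misc/Agent_weights')

# Restores both networks from the two files written above.
PPO_Agent.load('PPO_Agent_Misc/Agent_weights_actor',
               'PPO_Agent_Misc/Agent_weights_critic')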






Binary file modified PPO_Agent_Misc/__pycache__/PPOContinuous.cpython-310.pyc
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data.pkl
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data/0
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data/1
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data/2
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data/3
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data/4
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data/5
Binary file not shown.
1 change: 1 addition & 0 deletions PPO_Agent_Misc/actor_weights/version
@@ -0,0 +1 @@
3
Binary file added PPO_Agent_Misc/critic_weights/data.pkl
Binary file not shown.
Binary file added PPO_Agent_Misc/critic_weights/data/0
Binary file not shown.
Binary file added PPO_Agent_Misc/critic_weights/data/1
Binary file not shown.
Binary file added PPO_Agent_Misc/critic_weights/data/2
Binary file not shown.
Binary file added PPO_Agent_Misc/critic_weights/data/3
Binary file not shown.
Binary file added PPO_Agent_Misc/critic_weights/data/4
Binary file not shown.
Binary file added PPO_Agent_Misc/critic_weights/data/5
Binary file not shown.
1 change: 1 addition & 0 deletions PPO_Agent_Misc/critic_weights/version
@@ -0,0 +1 @@
3
20 changes: 11 additions & 9 deletions PPO_Agent_Misc/toy_env.py
@@ -8,7 +8,7 @@
import tqdm


env = gym.make("LunarLander-v2", continuous=True)
env = gym.make("CarRacing-v2")

episode_seed = np.random.randint(0, 100)
observation, info = env.reset(seed=episode_seed)
@@ -17,30 +17,32 @@
EPOCHS = 25
NUM_MINIBATCHES = 32
MINIBATCH = 16
EPISODES = 100
EPISODES = 400
TS_PER_ITER = 2000

# Continuous Parameters
action_min = T.tensor((0.0, -1.0))
action_max = T.tensor((1.0, 1.0))


PPO_Agent = ContPPO(n_actions=2, c1=0.5, c2=0.1, input_dims=8, action_min=action_min, action_max=action_max,
PPO_Agent = ContPPO(n_actions=3, c1=0.5, c2=0.1, input_dims=9216, action_min=action_min, action_max=action_max,
gamma=0.99, gae_lambda=0.95, policy_clip=0.2, batch_size=MINIBATCH,
buffer_size=MINIBATCH*NUM_MINIBATCHES, n_epochs=EPISODES, LR=1e-3, annealing=False)

action = env.action_space.sample()
obs, rewards, dones, info, _ = env.step(action)
print(obs.shape)

full_episode_loss = []
avg_policy_loss = []
avg_crit_loss = []
episode_max_ratio = []
ep_mean_rewards = []

obs = T.tensor(obs).to(PPO_Agent.device)
obs = T.tensor(np.expand_dims(np.swapaxes(obs, 2, 0), axis=0)).float().to(PPO_Agent.device)
print(obs.shape)
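The axis manipulation above moves the channel axis of the gym image observation to the front and adds a batch dimension; a quick shape-only sketch, assuming a 96x96x3 CarRacing-v2 frame:

import numpy as np

frame = np.zeros((96, 96, 3), dtype=np.float32)   # (H, W, C) observation from CarRacing-v2
chw = np.swapaxes(frame, 2, 0)                    # (3, 96, 96): axes 0 and 2 exchanged
batched = np.expand_dims(chw, axis=0)             # (1, 3, 96, 96): batch dimension for the conv layers
print(batched.shape)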

for episode in tqdm.tqdm(range(50)):
for episode in tqdm.tqdm(range(EPISODES)):
episode_loss = 0.0
episode_policy_loss = 0.0
episode_crit_loss = 0.0
@@ -51,7 +53,7 @@
action, logprob, mean, prev_vf = PPO_Agent.get_action_and_vf(prev_obs)
obs, rewards, dones, info, _ = env.step(action.numpy())

obs = T.tensor(obs).to(PPO_Agent.device)
obs = T.tensor(np.expand_dims(np.swapaxes(obs, 2, 0), axis=0)).float().to(PPO_Agent.device)

next_vf = PPO_Agent.critic.forward(obs)
advantage = PPO_Agent.get_gae(rewards, prev_vf, next_vf)
@@ -90,20 +92,20 @@


# Render games at the end
env = gym.make("LunarLander-v2", continuous=True, render_mode="human")
env = gym.make("CarRacing-v2", render_mode="human")

env.reset()
action = env.action_space.sample()
obs, rewards, dones, info, _ = env.step(action)

obs = T.tensor(obs, dtype=T.float32)
obs = T.tensor(np.expand_dims(np.swapaxes(obs, 2, 0), axis=0)).float().to(PPO_Agent.device)

for e in range(TS_PER_ITER):
prev_obs = obs.clone().detach()
action, logprob, mean, prev_vf = PPO_Agent.get_action_and_vf(prev_obs)
print(action)
obs, rewards, dones, info, _ = env.step(np.array(action))
obs = T.tensor(obs).to(PPO_Agent.device)
obs = T.tensor(np.expand_dims(np.swapaxes(obs, 2, 0), axis=0)).float().to(PPO_Agent.device)

ep_tot_rewards += rewards

2 changes: 1 addition & 1 deletion Simulation.py
@@ -26,7 +26,7 @@ def step(self, action):
"""
self.state.next(action)
obs = self.state.state
reward = self.state.get_reward()
reward = self.state.get_reward(action)
done = self.state.get_done()
info = self.state.get_info()
return obs, reward, done, info
Binary file modified __pycache__/FireMap.cpython-310.pyc
Binary file not shown.
Binary file modified __pycache__/Simulation.cpython-310.pyc
Binary file not shown.