Training #1

Open · wants to merge 3 commits into base: main
385 changes: 267 additions & 118 deletions FireMap.py

Large diffs are not rendered by default.

Binary file added PPO_Agent_Misc/Agent_weights_actor
Binary file not shown.
Binary file added PPO_Agent_Misc/Agent_weights_critic
Binary file not shown.
116 changes: 96 additions & 20 deletions PPO_Agent_Misc/PPOContinuous.py
@@ -6,10 +6,19 @@
import torch.optim as optim
from torch.distributions.categorical import Categorical
import time
import h5py

device = T.device("cuda" if T.cuda.is_available() else "cpu")
T.autograd.set_detect_anomaly(True)

class relu30(nn.Module):
def __init__(self):
super(relu30, self).__init__()

def forward(self, x):
return T.min(T.max(T.tensor(0), x), T.tensor(30))
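For reference, the capped activation above clips values to the range [0, 30]; a minimal standalone sketch (not part of this PR) showing the same behaviour with T.clamp:

import torch as T

x = T.tensor([-5.0, 0.5, 12.0, 42.0])     # hypothetical pre-activation values
capped = T.clamp(x, min=0.0, max=30.0)    # min(max(0, x), 30), i.e. a ReLU capped at 30
print(capped)                             # tensor([ 0.0000,  0.5000, 12.0000, 30.0000])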


class PPOMemory:
def __init__(self, batch_size):
# we keep memory in plain Python lists
@@ -45,6 +54,7 @@ def get_memory_batch(self):

# retrieves batch_size memories
states_T = T.stack(self.states[:self.batch_size]).to(device)
states_T = T.squeeze(states_T, dim=1)
act_logprob_tens = T.stack(self.logprobs[:self.batch_size]).to(device)
adv_tensor = T.tensor(self.adv[:self.batch_size]).to(device)
vals_tens = T.tensor(self.vals[:self.batch_size], dtype=T.float64).to(device)
@@ -76,29 +86,38 @@ class ActorModel(nn.Module):
this exact same thing. The code will look similar.

'''
def __init__(self, input_shape, n_actions,
def __init__(self, input_shape, n_actions, c2,
min_tens = T.tensor((-1, 1)), max_tens = T.tensor((-1, 1))):
super(ActorModel, self).__init__()

self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1)
self.conv2 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=2, padding=1)
self.flat = nn.Flatten()

# base model
self.fc1 = nn.Linear(in_features=input_shape, out_features=64).to(device)
self.fc2 = nn.Linear(in_features=64, out_features=64).to(device)
self.fc1 = nn.Linear(in_features=input_shape, out_features=265).to(device)
self.fc2 = nn.Linear(in_features=265, out_features=265).to(device)

# distributions
self.mean = nn.Linear(in_features=64, out_features=n_actions).to(device)
self.mean = nn.Linear(in_features=265, out_features=n_actions).to(device)

# Constant variance
self.c2 = c2
self.var = T.diag(T.ones(n_actions)).to(device)*0.5

# misc
self.min_tens = min_tens
self.max_tens = max_tens
self.relu30 = relu30()

def forward(self, x):
''' Compute the mean used to parameterize the action distribution. '''

x = F.tanh(self.conv1(x))
x = F.tanh(self.conv2(x))
x = self.flat(x)
# base computation
x = F.tanh(self.fc1(x)).to(device)
x = F.tanh(self.fc1(x).to(device))
x = F.tanh(self.fc2(x)).to(device)

# we note the mean value goes through a tanh as it corresponds with the
@@ -107,20 +126,25 @@ def forward(self, x):
# For our environment, depending on how we represent the raw actions,
# the values can be positive or negative (for example, if we center
# coordinates in the middle of the board).
mean = self.mean(x).to(device)
mean = F.tanh(self.mean(x)).to(device)
#activation = self.relu30(mean)

return mean
return T.squeeze(mean)

def get_action_logprob(self, x):
mean = self.forward(x)

calc_mean = mean.to(device)
#print(f'{calc_mean = }')

mean = T.clamp(mean, 0, 29)

# sample an action from the distribution
dist = T.distributions.MultivariateNormal(calc_mean, self.var)

action = dist.sample()
logprob = dist.log_prob(action)


return action.to(device), logprob, mean.to(device)
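As an illustration of the constant-variance Gaussian policy used by get_action_logprob, a self-contained sketch with made-up values (the real mean comes from the network):

import torch as T

n_actions = 2
mean = T.tensor([3.0, 7.5])                   # hypothetical network output
var = T.diag(T.ones(n_actions)) * 0.5         # fixed diagonal covariance, as in ActorModel
dist = T.distributions.MultivariateNormal(mean, var)
action = dist.sample()                        # one continuous action vector
logprob = dist.log_prob(action)               # scalar log-probability of the sampled action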

@@ -131,26 +155,46 @@ def calc_action_logprob_entropy(self, x, action):
dist = T.distributions.MultivariateNormal(calc_mean, self.var)

logprob = dist.log_prob(action)
ent = dist.entropy()

return logprob
return logprob, ent

def calc_c2(self):
new_c2 = self.c2*0.95
self.c2 = max(new_c2, 0.5)

class CriticModel(nn.Module):
def __init__(self, input_shape):
super(CriticModel, self).__init__()

self.fc1 = nn.Linear(input_shape, 64).to(device)
self.fc2 = nn.Linear(64, 64).to(device)
self.output = nn.Linear(64, 1).to(device)
self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=2, padding=1)
self.conv2 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3, stride=2, padding=1)
self.flat = nn.Flatten()

self.fc1 = nn.Linear(input_shape, 265).to(device)
self.fc2 = nn.Linear(265, 265).to(device)
self.output = nn.Linear(265, 1).to(device)

def forward(self, x):
'''
Overrides the module's forward pass.
'''
x = F.tanh(self.conv1(x))
x = F.tanh(self.conv2(x))
x = self.flat(x)
x = F.tanh(self.fc1(x)).to(device)
x = F.tanh(self.fc2(x)).to(device)
x = F.tanh(self.output(x)).to(device)
x = self.output(x).to(device)

return T.squeeze(x)

def save(self, save_to_path: str):
save_to_path += '_critic'
T.save(self.state_dict(), save_to_path)

return x
def load(self, load_to_path):
self.load_state_dict(T.load(load_to_path))

class Agent(nn.Module):
# An interesting note - implementations exist where actor and critic share
@@ -175,7 +219,7 @@ def __init__(self, n_actions, c1, c2, input_dims, action_min, action_max, gamma=
self.n_actions = n_actions

# --- Actor Critic ---
self.actor = ActorModel(input_dims, n_actions).float().to(device)
self.actor = ActorModel(input_dims, n_actions, 2).float().to(device)
self.optimizer_actor = T.optim.Adam(self.actor.parameters(), LR)

self.critic = CriticModel(input_dims).float().to(device)
@@ -188,8 +232,8 @@ def __init__(self, n_actions, c1, c2, input_dims, action_min, action_max, gamma=
self.criterion = nn.MSELoss()
self.annealing = annealing
if annealing == True:
self.anneal_lr_actor = T.optim.lr_scheduler.StepLR(self.optimizer_actor, buffer_size*5, gamma=0.3)
self.anneal_lr_critic = T.optim.lr_scheduler.StepLR(self.optimizer_critic, buffer_size*5, gamma=0.3)
self.anneal_lr_actor = T.optim.lr_scheduler.StepLR(self.optimizer_actor, self.buffer_size, gamma=0.95)
self.anneal_lr_critic = T.optim.lr_scheduler.StepLR(self.optimizer_critic, self.buffer_size, gamma=0.95)

self.device = T.device("cuda" if T.cuda.is_available() else "cpu")
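For context, the StepLR schedulers above multiply the learning rate by 0.95 once every buffer_size scheduler steps; a tiny standalone sketch with hypothetical sizes:

import torch as T

params = [T.nn.Parameter(T.zeros(1))]
opt = T.optim.Adam(params, lr=1e-3)
sched = T.optim.lr_scheduler.StepLR(opt, step_size=512, gamma=0.95)  # e.g. buffer_size = 512

for step in range(1024):
    opt.step()
    sched.step()  # after 512 steps lr = 1e-3 * 0.95, after 1024 steps lr = 1e-3 * 0.95**2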

@@ -262,6 +306,11 @@ def advantage_and_return(self, rewards, values, not_dones):

return advantages.clone().detach(), returns.clone().detach()

def calculate_boundary_penalty(self, action_position: T.tensor) -> T.tensor:
left = np.clip(action_position, a_min=-np.inf,a_max=0)
right = np.clip(action_position-30,a_min=0,a_max=np.inf)
return T.tensor(np.sum(np.max(np.vstack([abs(left),abs(right)]),axis=0))).to(device)
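A small worked example of the boundary penalty above, using a hypothetical raw action and the [0, 30] board range implied by the clipping constants:

import numpy as np

action_position = np.array([-2.0, 35.0, 10.0])                  # hypothetical raw action
left = np.clip(action_position, a_min=-np.inf, a_max=0)         # [-2.,  0.,  0.]  distance below 0
right = np.clip(action_position - 30, a_min=0, a_max=np.inf)    # [ 0.,  5.,  0.]  distance above 30
penalty = np.sum(np.max(np.vstack([abs(left), abs(right)]), axis=0))
print(penalty)                                                  # 7.0 = 2 (below range) + 5 (above range)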

def learn(self):
'''
Iterates over the entire buffer and trains on minibatches.
@@ -289,7 +338,9 @@ def learn(self):


# get logprob of action and entropy
new_logprobs = self.actor.calc_action_logprob_entropy(state_tens, act_tens)
new_logprobs, entropy = self.actor.calc_action_logprob_entropy(state_tens, act_tens)

entropy = T.mean(entropy)

# Get probability ratio
prob_ratios = T.exp(new_logprobs - logprob_tens).to(device)
@@ -329,7 +380,6 @@ def learn(self):

# --- Total Loss ---


# Apply Advantages to Policy Loss
policy_loss = adv_tensor*policy_loss
policy_loss = T.mean(policy_loss).to(device)
@@ -342,7 +392,7 @@ def learn(self):
#crit_loss = T.max(crit_loss, crit_loss_clipped).to(device)
crit_loss = self.c1*crit_loss.float().to(device)

loss = -policy_loss + crit_loss # - self.c2*entropy_loss
loss = -policy_loss + crit_loss #- self.c2*entropy


# --- Backpropagate ---
@@ -364,13 +414,39 @@ def learn(self):
total_pol_loss += policy_loss.detach().numpy()
total_critic_loss += crit_loss.detach().numpy()
total_loss += loss.detach().numpy()

self.actor.calc_c2()


return total_pol_loss, total_critic_loss, total_loss
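The visible lines weight the policy term by the advantages and negate it; the clipping of prob_ratios happens in a part of learn() this hunk does not show. For reference, a standalone sketch of the standard PPO clipped surrogate (illustrative tensors, not the PR's exact code):

import torch as T

policy_clip = 0.2
prob_ratios = T.tensor([0.8, 1.05, 1.4])      # pi_new(a|s) / pi_old(a|s), hypothetical
adv_tensor = T.tensor([1.0, -0.5, 2.0])       # advantages, hypothetical

unclipped = prob_ratios * adv_tensor
clipped = T.clamp(prob_ratios, 1 - policy_clip, 1 + policy_clip) * adv_tensor
policy_objective = T.mean(T.min(unclipped, clipped))
loss = -policy_objective                      # maximized objective becomes a minimized loss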

def save(self, save_to):
save_to_critic = save_to + '_critic'
T.save(self.critic.state_dict(), save_to_critic)

save_to_actor = save_to + '_actor'
T.save(self.actor.state_dict(), save_to_actor)

def load(self, load_actor, load_critic):
# Restore both networks from their saved state dicts.
self.actor.load_state_dict(T.load(load_actor))
self.critic.load_state_dict(T.load(load_critic))
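A brief usage sketch of the save/load hooks above; the path prefix is an assumption chosen to match the Agent_weights_actor / Agent_weights_critic files added in this PR, and PPO_Agent stands for an Agent instance like the one built in toy_env.py:

# Writes <prefix>_actor and <prefix>_critic (prefix is assumed here).
PPO_Agent.save('PPO_Agent_Misc/Agent_weights')

# Restores both networks from the two files written above.
PPO_Agent.load('PPO_Agent_Misc/Agent_weights_actor',
               'PPO_Agent_Misc/Agent_weights_critic')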






Binary file modified PPO_Agent_Misc/__pycache__/PPOContinuous.cpython-310.pyc
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data.pkl
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data/0
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data/1
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data/2
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data/3
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data/4
Binary file not shown.
Binary file added PPO_Agent_Misc/actor_weights/data/5
Binary file not shown.
1 change: 1 addition & 0 deletions PPO_Agent_Misc/actor_weights/version
@@ -0,0 +1 @@
3
Binary file added PPO_Agent_Misc/critic_weights/data.pkl
Binary file not shown.
Binary file added PPO_Agent_Misc/critic_weights/data/0
Binary file not shown.
Binary file added PPO_Agent_Misc/critic_weights/data/1
Binary file not shown.
Binary file added PPO_Agent_Misc/critic_weights/data/2
Binary file not shown.
Binary file added PPO_Agent_Misc/critic_weights/data/3
Binary file not shown.
Binary file added PPO_Agent_Misc/critic_weights/data/4
Binary file not shown.
Binary file added PPO_Agent_Misc/critic_weights/data/5
Binary file not shown.
1 change: 1 addition & 0 deletions PPO_Agent_Misc/critic_weights/version
@@ -0,0 +1 @@
3
20 changes: 11 additions & 9 deletions PPO_Agent_Misc/toy_env.py
@@ -8,7 +8,7 @@
import tqdm


env = gym.make("LunarLander-v2", continuous=True)
env = gym.make("CarRacing-v2")

episode_seed = np.random.randint(0, 100)
observation, info = env.reset(seed=episode_seed)
@@ -17,30 +17,32 @@
EPOCHS = 25
NUM_MINIBATCHES = 32
MINIBATCH = 16
EPISODES = 100
EPISODES = 400
TS_PER_ITER = 2000

# Continuous Parameters
action_min = T.tensor((0.0, -1.0))
action_max = T.tensor((1.0, 1.0))


PPO_Agent = ContPPO(n_actions=2, c1=0.5, c2=0.1, input_dims=8, action_min=action_min, action_max=action_max,
PPO_Agent = ContPPO(n_actions=3, c1=0.5, c2=0.1, input_dims=9216, action_min=action_min, action_max=action_max,
gamma=0.99, gae_lambda=0.95, policy_clip=0.2, batch_size=MINIBATCH,
buffer_size=MINIBATCH*NUM_MINIBATCHES, n_epochs=EPISODES, LR=1e-3, annealing=False)

action = env.action_space.sample()
obs, rewards, dones, info, _ = env.step(action)
print(obs.shape)

full_episode_loss = []
avg_policy_loss = []
avg_crit_loss = []
episode_max_ratio = []
ep_mean_rewards = []

obs = T.tensor(obs).to(PPO_Agent.device)
obs = T.tensor(np.expand_dims(np.swapaxes(obs, 2, 0), axis=0)).float().to(PPO_Agent.device)
print(obs.shape)
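The axis manipulation above moves the channel axis of the gym image observation to the front and adds a batch dimension; a quick shape-only sketch, assuming a 96x96x3 CarRacing-v2 frame:

import numpy as np

frame = np.zeros((96, 96, 3), dtype=np.float32)   # (H, W, C) observation from CarRacing-v2
chw = np.swapaxes(frame, 2, 0)                    # (3, 96, 96): axes 0 and 2 exchanged
batched = np.expand_dims(chw, axis=0)             # (1, 3, 96, 96): batch dimension for the conv layers
print(batched.shape)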

for episode in tqdm.tqdm(range(50)):
for episode in tqdm.tqdm(range(EPISODES)):
episode_loss = 0.0
episode_policy_loss = 0.0
episode_crit_loss = 0.0
@@ -51,7 +53,7 @@
action, logprob, mean, prev_vf = PPO_Agent.get_action_and_vf(prev_obs)
obs, rewards, dones, info, _ = env.step(action.numpy())

obs = T.tensor(obs).to(PPO_Agent.device)
obs = T.tensor(np.expand_dims(np.swapaxes(obs, 2, 0), axis=0)).float().to(PPO_Agent.device)

next_vf = PPO_Agent.critic.forward(obs)
advantage = PPO_Agent.get_gae(rewards, prev_vf, next_vf)
@@ -90,20 +92,20 @@


# Render games at the end
env = gym.make("LunarLander-v2", continuous=True, render_mode="human")
env = gym.make("CarRacing-v2", render_mode="human")

env.reset()
action = env.action_space.sample()
obs, rewards, dones, info, _ = env.step(action)

obs = T.tensor(obs, dtype=T.float32)
obs = T.tensor(np.expand_dims(np.swapaxes(obs, 2, 0), axis=0)).float().to(PPO_Agent.device)

for e in range(TS_PER_ITER):
prev_obs = obs.clone().detach()
action, logprob, mean, prev_vf = PPO_Agent.get_action_and_vf(prev_obs)
print(action)
obs, rewards, dones, info, _ = env.step(np.array(action))
obs = T.tensor(obs).to(PPO_Agent.device)
obs = T.tensor(np.expand_dims(np.swapaxes(obs, 2, 0), axis=0)).float().to(PPO_Agent.device)

ep_tot_rewards += rewards

2 changes: 1 addition & 1 deletion Simulation.py
@@ -26,7 +26,7 @@ def step(self, action):
"""
self.state.next(action)
obs = self.state.state
reward = self.state.get_reward()
reward = self.state.get_reward(action)
done = self.state.get_done()
info = self.state.get_info()
return obs, reward, done, info
Binary file modified __pycache__/FireMap.cpython-310.pyc
Binary file not shown.
Binary file modified __pycache__/Simulation.cpython-310.pyc
Binary file not shown.