
Commit bc7a7e6: "puffer works"
Parent: 20517ee

14 files changed, +295 -246 lines

.vscode/launch.json  (+2 -1)

@@ -19,8 +19,9 @@
     // "sample_factory.aux_loss_coeff=0.1",

     "+framework.device=cpu",
-    "+framework.experiment=baseline.v0.4.1",
+    "framework.experiment=mac.puffer.2",
     "+framework.train_dir=./train_dir",
+    // "framework.wandb.track=True",

     // "sample_factory.wandb_project=mp_rws"
     // "+sample_factory.restart_behavior=restart",

agent/metta_agent.py  (+14 -36)

@@ -26,6 +26,7 @@ def __init__(
     ):
         super().__init__()
         cfg = OmegaConf.create(cfg)
+        self._cfg = cfg
         self._observation_space = obs_space
         self._action_space = action_space

@@ -35,44 +36,34 @@ def __init__(
             cfg.fc.layers,
             cfg.fc.output_dim
         )
-
         self._core = ModelCoreRNN(cfg.core, cfg.fc.output_dim)

         self._decoder = hydra.utils.instantiate(
             cfg.decoder,
             self._core.get_out_size())

-        decoder_out_size: int = self._decoder.get_out_size()
-
-        self._critic_linear = nn.Linear(decoder_out_size, 1)
-        self._action_parameterization = self.get_action_parameterization(decoder_out_size)
-        self._last_action_distribution = None
+        self._critic_linear = nn.Linear(self.decoder_out_size, 1)

         self.apply(self.initialize_weights)

+    @property
+    def decoder_out_size(self):
+        return self._decoder.get_out_size()
+
+    @property
+    def core_out_size(self):
+        return self._cfg.core.rnn_size
+
     def encode_observations(self, td: TensorDict):
-        td["encoded_"] = self._encoder(obs_dict)
-        return x
+        td["encoded_obs"] = self._encoder(td["obs"])

     def forward_core(self, head_output: Tensor, rnn_states):
         x, new_rnn_states = self._core(head_output, rnn_states)
         return x, new_rnn_states

-    def forward_tail(self, core_output, values_only: bool, sample_actions: bool) -> TensorDict:
-        decoder_output = self._decoder(core_output)
-        values = self._critic_linear(decoder_output).squeeze()
-
-        result = TensorDict({"values": values}, batch_size=values.size(0))
-        if values_only:
-            return result
-
-        action_distribution_params, self._last_action_distribution = self._action_parameterization(decoder_output)
-
-        # `action_logits` is not the best name here, better would be "action distribution parameters"
-        result["action_logits"] = action_distribution_params
-
-        self._maybe_sample_actions(sample_actions, result)
-        return result
+    def forward_tail(self, td: TensorDict):
+        td["decoder_output"] = self._decoder(td["core_output"])
+        td["values"] = self._critic_linear(td["decoder_output"]).squeeze()

     def forward(self, normalized_obs_dict, rnn_states, values_only=False) -> TensorDict:
         x = self.encode_observations(normalized_obs_dict)
@@ -98,16 +89,3 @@ def initialize_weights(self, layer):
         # I never noticed much difference between different initialization schemes, and here it seems safer to
         # go with default initialization,
         pass
-
-    def get_action_parameterization(self, decoder_output_size: int):
-        return ActionParameterizationDefault({}, decoder_output_size, self._action_space)
-
-    def _maybe_sample_actions(self, sample_actions: bool, result: TensorDict) -> None:
-        if sample_actions:
-            # for non-trivial action spaces it is faster to do these together
-            actions, result["log_prob_actions"] = sample_actions_log_probs(self._last_action_distribution)
-            assert actions.dim() == 2  # TODO: remove this once we test everything
-            result["actions"] = actions.squeeze(dim=1)
-
-    def action_distribution(self):
-        return self._last_action_distribution
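The net effect of this refactor is that the agent's stages now communicate through a single TensorDict that each method fills in place (`encoded_obs`, `core_output`, `decoder_output`, `values`), while the action parameterization and sampling helpers are dropped from the module. Below is a minimal sketch of that data flow using toy stand-in modules; the shapes, module choices, and single-step GRU call are assumptions for illustration, not the repo's actual encoder/core/decoder.

import torch
import torch.nn as nn
from tensordict import TensorDict

# Toy stand-ins for the configured encoder/core/decoder; the real modules are
# built from this repo's Hydra configs, so all sizes here are illustrative.
encoder = nn.Linear(16, 32)
core = nn.GRU(32, 512, batch_first=True)
decoder = nn.Linear(512, 64)
critic = nn.Linear(64, 1)

td = TensorDict({"obs": torch.randn(8, 16)}, batch_size=[8])

# encode_observations: write the encoded observation back into the same td
td["encoded_obs"] = encoder(td["obs"])

# forward_core: run the recurrent core (a single-step GRU here)
core_out, _ = core(td["encoded_obs"].unsqueeze(1))
td["core_output"] = core_out.squeeze(1)

# forward_tail: decoder output and value estimate are also stored in place
td["decoder_output"] = decoder(td["core_output"])
td["values"] = critic(td["decoder_output"]).squeeze()

print(td["values"].shape)  # torch.Size([8])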

configs/agent/agent.yaml  (-2)

@@ -56,8 +56,6 @@ decoder:


 core:
-  _target_: sample_factory.model.core.ModelCoreRNN
-
   rnn_type: gru
   rnn_num_layers: 1
   rnn_size: 512
(new file)  (+96)

@@ -0,0 +1,96 @@
+# @package framework
+
+_target_: rl_framework.pufferlib.pufferlib.PufferLibFramework
+
+package: metta
+
+
+baseline: False
+render_mode: auto
+vectorization: serial
+eval_mode_path: ""
+mode: train
+backend: clean_pufferl
+experiment: ???
+
+wandb:
+  track: False
+  project: metta
+  entity: platypus
+  group: platypus
+  name: ${framework.experiment}
+
+
+train:
+  exp_id: ${framework.experiment}
+  track: False
+  env: ${env.name}
+  seed: 1
+  torch_deterministic: True
+  cpu_offload: False
+  device: cpu
+  total_timesteps: 10_000_000
+  learning_rate: 2.5e-4
+  num_steps: 128
+  anneal_lr: True
+  gamma: 0.99
+  gae_lambda: 0.95
+  num_minibatches: 4
+  update_epochs: 4
+  norm_adv: True
+  clip_coef: 0.1
+  clip_vloss: True
+  ent_coef: 0.01
+  vf_coef: 0.5
+  max_grad_norm: 0.5
+  target_kl: null
+
+  num_envs: 8
+  num_workers: 8
+  env_batch_size: null
+  zero_copy: True
+  verbose: True
+  data_dir: experiments
+  checkpoint_interval: 200
+  batch_size: 1024
+  minibatch_size: 512
+  bptt_horizon: 16
+  vf_clip_coef: 0.1
+  compile: False
+  compile_mode: reduce-overhead
+
+sweep:
+  method: bayes
+  name: sweep
+  metric:
+    goal: maximize
+    name: environment/episode_return
+  # Nested parameters name required by WandB API
+  parameters:
+    train:
+      parameters:
+        ent_coef: {
+          'distribution': 'log_uniform_values',
+          'min': 1e-3,
+          'max': 5e-2,
+        }
+        gamma: {
+          'values': [0.90, 0.925, 0.95, 0.975, 0.99],
+        }
+        gae_lambda: {
+          'values': [0.90, 0.925, 0.95, 0.975, 0.99],
+        }
+        learning_rate: {
+          'distribution': 'log_uniform_values',
+          'min': 1e-4,
+          'max': 1e-1,
+        }
+        batch_size: {
+          'values': [4096, 8192, 16384, 32768, 65536],
+        }
+        minibatch_size: {
+          'values': [1024, 2048, 4096, 8192, 16384],
+        }
+        bptt_horizon: {
+          'values': [1, 2, 4, 8, 16],
+        }
(new file)  (+4)

@@ -0,0 +1,4 @@
+# @package framework
+
+defaults:
+  - defaults

configs/framework/pufferlib/train/sand.yaml  (+1)

@@ -20,3 +20,4 @@ train:
   minibatch_size: 1024
   compile: False
   anneal_lr: False
+  checkpoint_interval: 10

experiments/sweep_config.yaml  (-152)

This file was deleted.

metta.code-workspace  (+3)

@@ -83,6 +83,9 @@
       "__verbose_abort": "cpp",
       "execution": "cpp"
     },
+    "python.analysis.extraPaths": [
+      "./third_party/pufferlib"
+    ],

   },

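`python.analysis.extraPaths` is a Pylance/Pyright setting, so this change only affects editor import resolution: it lets the language server resolve `pufferlib` imports from what appears to be a vendored checkout under `third_party/pufferlib`, without requiring the package to be installed in the active environment.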
rl_framework/pufferlib/__init__.py  (-8)

This file was deleted.

rl_framework/pufferlib/clean_pufferl.py  (+1)

@@ -506,6 +506,7 @@ def run(self):
             else:
                 self.gpu_util.append(1)
                 self.gpu_mem.append(1)
+
             time.sleep(self.delay)

     def stop(self):
