Commit df7d500

…into main

PaulEcoffet committed Dec 4, 2020
2 parents 2022c2d + b67fda1 commit df7d500
Showing 4 changed files with 157 additions and 41 deletions.
27 changes: 20 additions & 7 deletions PartnerChoiceEnv.py
@@ -28,6 +28,8 @@ def payoff(xi, xj, *, a=5, b=5, temp=None):
 class PartnerChoiceFakeSites(MultiAgentEnv):
 
     def __init__(self, env_config):
+        if "env_config" in env_config:
+            raise ValueError("Do not provide the full config but only the env_config")
         if "bad_site_prob" in env_config:
            raise ValueError("deprecated key bad_site_prob")
 
@@ -37,12 +39,13 @@ def __init__(self, env_config):
         self.new_x_each_interaction = env_config.get("new_x_each_interaction", True)
         self.max_action = env_config.get("max_action", 15.0)
         self.good_site_prob = env_config.get("good_site_prob", 1)
+        self.eval_mode = env_config.get("eval_mode", False)
         self.iteration_count = 0
         self.true_site = [True for i in range(self.nb_agents)]
         self.agents_names = ['inv' + '{:02d}'.format(i) for i in range(self.nb_agents)]
         self.agents_names += ['choice' + '{:02d}'.format(i) for i in range(self.nb_agents)]
         self.inv = [0.0 for i in range(self.nb_agents)]
-        self.cur_opp = [-1 for i in range(self.nb_agents)]
+        self.cur_opp = [None for i in range(self.nb_agents)]
 
         self.cur_its = np.array([0 for i in range(self.nb_agents)], dtype=int)
 
@@ -51,16 +54,24 @@ def __init__(self, env_config):
 
         self.site_action = np.linspace(0, self.max_action, self.nb_sites)
         self.site_acceptance_threshold = np.copy(self.site_action)
+        self.force_opp = None
 
+        assert(not self.eval_mode or self.good_site_prob == 1)
 
-    def reset(self):
+    def reset(self, *, force_cur_opp=None):
+        if force_cur_opp is not None and not self.eval_mode:
+            raise ValueError("force config only allowed in eval mode")
+        self.force_opp = force_cur_opp
         self.cur_its = np.array([0 for i in range(self.nb_agents)], dtype=int)
-        self.cur_opp = [None for i in range(self.nb_agents)]
+        self.cur_opp = [force_cur_opp for i in range(self.nb_agents)]
         # Make all individuals make their investment choice
         return {
             f'inv{i:02}': np.array([0], dtype=np.float32) for i in range(self.nb_agents)
         }
 
     def _find_opp(self):
+        if self.force_opp is not None:
+            return self.force_opp, True
         true_site = True
         if np.random.rand() < 1 - self.good_site_prob:
             true_site = False
@@ -80,15 +91,16 @@ def step(self, action_dict):
             choice = f"choice{ind:02d}"
             # If we get an investment action
             if agent_name.startswith('inv'):
-                self.inv[ind] = action_dict[agent_name][0]
+                self.inv[ind] = np.asarray(action_dict[agent_name]).flatten()[0]
                 self.cur_opp[ind], self.true_site[ind] = self._find_opp()
                 obs[choice] = np.array([self.site_action[self.cur_opp[ind]], self.inv[ind]])
                 reward[choice] = 0  # dummy reward at init
             else:  # if it's a choice action
                 self.cur_its[ind] += 1
 
                 curopp = self.cur_opp[ind]
-                info[inv] = {'inv': self.inv[ind], 'other': self.site_acceptance_threshold[curopp],
+                assert(isinstance(curopp, int))
+                info[inv] = {'inv': self.inv[ind], 'other': self.site_action[curopp],
                              'accept': action_dict[choice]}
                 # if they both agree
                 if action_dict[choice] == 1 and self.inv[ind] >= self.site_acceptance_threshold[curopp]\
@@ -101,10 +113,11 @@ def step(self, action_dict):
                     done[choice] = True
                     done[inv] = True
                     obs[inv] = np.array([0], dtype=np.float32)
-                    self.cur_its[ind] = self.max_it
+                    if not self.eval_mode:
+                        self.cur_its[ind] = self.max_it  # force end of experiment
                 else:  # if at least one disagree or not a real site
                     done[choice] = False
-                    self.cur_opp[ind] = self._find_opp()
+                    self.cur_opp[ind], self.true_site[ind] = self._find_opp()
                     if self.new_x_each_interaction:
                         obs[inv] = np.array([0], dtype=np.float32)
                         reward[inv] = 0
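A minimal sketch of how the new eval-mode hooks fit together, mirroring the bench() loop added in evaluate_at_checkpoint.py below; the policy call is a placeholder:

    # Probe each site deterministically: eval_mode requires good_site_prob == 1
    # (see the assert above), and reset(force_cur_opp=i) pins every agent's
    # opponent to site i instead of drawing it at random.
    from PartnerChoiceEnv import PartnerChoiceFakeSites

    env = PartnerChoiceFakeSites({"good_site_prob": 1, "max_it": 100, "eval_mode": True})
    for i_opp in range(env.nb_sites):
        obs = env.reset(force_cur_opp=i_opp)   # every agent now faces site i_opp
        done = {"__all__": False}
        while not done["__all__"]:
            act = {key: my_policy(obs[key]) for key in obs}  # my_policy: placeholder
            obs, reward, done, info = env.step(act)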
7 changes: 5 additions & 2 deletions cma_test.py
@@ -10,6 +10,7 @@
 from gym.spaces import Discrete, Box
 from ray.rllib.evaluation import collect_metrics
 from ray.rllib.models import ModelCatalog
+from ray.rllib.policy.policy import clip_action
 from ray.rllib.policy import build_torch_policy
 from ray.rllib.utils.filter import get_filter
 from ray.tune import register_env
@@ -129,7 +130,8 @@ def train(config, reporter):
             timestep_total += 1
             act = {}
             for key in obs:
-                act[key] = policies[key].compute_actions([obs[key]])[0][0]
+                act[key] = clip_action(policies[key].compute_actions([obs[key]])[0],
+                                       policies[key].action_space_struct)[0]
             obs, reward, done, info = env.step(act)
             for key in reward:
                 totrewards[key] += reward[key]
@@ -185,7 +187,8 @@ def evaluate(best, env, i_episode, policies, reporter, timestep_total):
         stepcount += 1
         act = {}
         for key in obs:
-            act[key] = policies[key].compute_actions([obs[key]])[0][0]
+            act[key] = clip_action(policies[key].compute_actions([obs[key]])[0],
+                                   policies[key].action_space_struct)[0]
         if "inv00" in act:
             inv_through_eval.append(act["inv00"])
         obs, reward, done, info = env.step(act)
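For context: Policy.compute_actions() returns a tuple (action batch, RNN state outs, extra info) of raw model outputs, and for a Box action space the raw actions can fall outside the space's bounds. RLlib trainers normally clip sampled actions when "clip_actions" is enabled in the config, but a hand-rolled rollout loop like this one must clip explicitly. A condensed sketch of the pattern, with names as in the diff:

    # take the raw action batch, clip it to the policy's action space,
    # then unbatch the single action
    raw_batch = policies[key].compute_actions([obs[key]])[0]
    act[key] = clip_action(raw_batch, policies[key].action_space_struct)[0]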
160 changes: 129 additions & 31 deletions evaluate_at_checkpoint.py
@@ -1,45 +1,143 @@
-from ray.tune import register_env
+import glob
+import logging
+import re
+from copy import copy
+from pathlib import Path
+from typing import List
+from typing.re import Pattern
+import numpy as np
+import pickle
+import tqdm
+
+from ray.tune import register_env
+from ray.tune.logger import pretty_print
+from timer import timer
 from PartnerChoiceEnv import PartnerChoiceFakeSites
 import ray
+from cma_test import CMAESTorchPolicy
-from ray.rllib.agents.ppo import PPOTrainer
+from ray.rllib.agents.ppo import PPOTrainer, PPOTorchPolicy
+import pandas as pd
 from main_test import init_setup, select_policy, MyCallbacks
 
 
+logging.basicConfig(level=logging.DEBUG)
 
 policies = init_setup()

 config = {
-    "num_envs_per_worker": 16,
-    "multiagent": {
-        "policies": policies,
-        "policy_mapping_fn": select_policy,
-    },
-    "clip_actions": True,
-    "framework": "torch",
-    "no_done_at_end": True,
-    "gamma": 1,
-    "lr": 5e-3,
-    "num_sgd_iter": 10,
-    "callbacks": MyCallbacks,
-    "env": "partner_choice",
-    "env_config":
-        {
-            "good_site_prob": 1,
-            "max_it": 100
-        }
-}
-
-def main():
-    ray.init(local_mode=True)
-    register_env("partner_choice",
-                 lambda config: PartnerChoiceFakeSites(config))
+    "num_envs_per_worker": 16,
+    "num_workers": 0,
+    "multiagent": {
+        "policies": policies,
+        "policy_mapping_fn": select_policy,
+    },
+    "clip_actions": True,
+    "framework": "torch",
+    "no_done_at_end": True,
+    "gamma": 1,
+    "lr": 5e-3,
+    "num_sgd_iter": 10,
+    "callbacks": MyCallbacks,
+    "env": "partner_choice",
+    "env_config":
+        {
+            "good_site_prob": 1,
+            "max_it": 100
+        }
+}


+@timer
+def bench(path):
+    if "cma" in path:
+        agent = loadcma(path)
+    else:
+        agent = loadppo(path)
+    config["env_config"]["eval_mode"] = True
+    config["env_config"]["good_site_prob"] = 1
+    config["env_config"]["max_it"] = 100
+    env = PartnerChoiceFakeSites(config["env_config"])
+    logs = []
+    for i_opp in range(env.nb_sites):
+        obs = env.reset(force_cur_opp=i_opp)
+        done = {"__all__": False}
+        while not done["__all__"]:
+            act = {}
+            for key in obs:
+                act[key] = agent.get_policy(key).compute_actions(obs[key].reshape(1, -1))[0].flatten()[0]
+            obs, reward, done, info = env.step(act)
+            if "inv00" in info:
+                assert (isinstance(info["inv00"]["other"], float))
+                logs.append(copy(info["inv00"]))
+    df = pd.DataFrame(logs)
+    return df
+
+
+@timer
+def loadppo(path):
     agent = PPOTrainer(config)
-    agent.restore("/Users/paulecoffet/Documents/isir/These/data/RLCoopExp/logs/paperrun/e200000/ppobiglr/goodsiteprob_20201120-221749/PPO_partner_choice_d7c84_00023_23_good_site_prob=1.0,max_it=100.0_2020-11-21_00-43-53/checkpoint_594/checkpoint-594")
+    agent.load_checkpoint(path)
+    return agent

+@timer
+def loadcma(path):
+    # hackish lookalike
+    class FakeAgentDict(dict):
+        def get_policy(self, policy):
+            return self[policy]
+    agent = FakeAgentDict()
+    with open(path, "rb") as f:
+        bests = pickle.load(f)
+    for key, params in config["multiagent"]["policies"].items():
+        agent[key] = CMAESTorchPolicy(*params[1:])
+        agent[key].set_flat_weights(bests[key])
+    return agent
 
 
-    print(agent)
-    agent.compute_action()
+def get_highest(vals: List[str], *, pattern: Pattern = ""):
+    if vals is None or len(vals) == 0:
+        return None
+    m = np.argmax([float(re.search(pattern, val).group("target")) for val in vals])
+    return vals[m]


 if __name__ == "__main__":
-    main()
+    ray.init(local_mode=True)
+    register_env("partner_choice",
+                 lambda config: PartnerChoiceFakeSites(config))
+
+    main_path = Path("/Users/paulecoffet/Documents/isir/These/data/RLCoopExp/logs/paperrun2/e1000000/ppobiglr/")
+    glob_path = main_path
+    alldfs = []
+    with timer("glob"):
+        allpaths = list(main_path.rglob("**/*"))
+    for path in tqdm.tqdm(allpaths):
+        res = re.search(r"partner_choice_(?P<trialid>.*)_(?P<runid>\d+)_good_site_prob=(?P<prob>[0-9.]*),", str(path))
+        if not res:
+            continue
+        run_id = res.group("runid")
+        trial_id = res.group("trialid")
+        good_site_prob = res.group("prob")
+        if "cma" in str(path):
+            checkpoint_path = path / "checkpoint200000/best.pkl"
+        else:
+            checkpoint_path = get_highest([str(c) for c in path.glob("checkpoint*/*")
+                                           if "tune_metadata" not in str(c) and ".is_check" not in str(c)],
+                                          pattern=r"checkpoint[-_]?(?P<target>[-0-9]+)/")
+        if not checkpoint_path or not Path(checkpoint_path).exists():
+            print("no checkpoint for", path)
+            continue
+        try:
+            df = bench(str(checkpoint_path))
+        except Exception as e:
+            print(checkpoint_path)
+            print(type(e), e)
+        else:
+            df["run_id"] = run_id
+            df["trial_id"] = trial_id
+            df["good_site_prob"] = df["p"] = good_site_prob
+            df["checkpoint_path"] = checkpoint_path
+            alldfs.append(df)
+    with timer("saving"):
+        fuldf = pd.concat(alldfs)
+        fuldf.to_csv(main_path / "postmortem.csv.gz")
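A hypothetical example of how get_highest() picks the latest checkpoint (paths invented for illustration):

    paths = ["trial/checkpoint_10/checkpoint-10", "trial/checkpoint_594/checkpoint-594"]
    get_highest(paths, pattern=r"checkpoint[-_]?(?P<target>[-0-9]+)/")
    # -> "trial/checkpoint_594/checkpoint-594"  (argmax over the parsed numbers)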
4 changes: 3 additions & 1 deletion main_test.py
@@ -87,11 +87,13 @@ def init_setup():
     choicemodel_dict = {
         "model": {
             "fcnet_hiddens": [3],
+            "max_seq_len": 9999999
         }
     }
     investormodel_dict = {
         "model": {
-            "fcnet_hiddens": []
+            "fcnet_hiddens": [],
+            "max_seq_len": 9999999
         }
     }
     policies = {inv_id[i]: (None, inv_obs_space, inv_act_space, investormodel_dict) for i in range(nb_agents)}
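For reference, max_seq_len is an RLlib model option that caps how long the sequences are that experience gets chunked into for training (it mainly matters for recurrent models); setting it this high presumably keeps the long episodes used here from being chunked at all. The resulting investor model config, assembled for clarity:

    investormodel_dict = {
        "model": {
            "fcnet_hiddens": [],     # linear policy, no hidden layers
            "max_seq_len": 9999999   # effectively no sequence-length cap
        }
    }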
