diff --git a/PartnerChoiceEnv.py b/PartnerChoiceEnv.py
index d7af9dc..b88c02f 100644
--- a/PartnerChoiceEnv.py
+++ b/PartnerChoiceEnv.py
@@ -28,6 +28,8 @@ def payoff(xi, xj, *, a=5, b=5, temp=None):
 
 class PartnerChoiceFakeSites(MultiAgentEnv):
     def __init__(self, env_config):
+        if "env_config" in env_config:
+            raise ValueError("Do not provide the full config but only the env_config")
         if "bad_site_prob" in env_config:
             raise ValueError("deprecated key bad_site_prob")
 
@@ -37,12 +39,13 @@ def __init__(self, env_config):
         self.new_x_each_interaction = env_config.get("new_x_each_interaction", True)
         self.max_action = env_config.get("max_action", 15.0)
         self.good_site_prob = env_config.get("good_site_prob", 1)
+        self.eval_mode = env_config.get("eval_mode", False)
         self.iteration_count = 0
         self.true_site = [True for i in range(self.nb_agents)]
         self.agents_names = ['inv' + '{:02d}'.format(i) for i in range(self.nb_agents)]
         self.agents_names += ['choice' + '{:02d}'.format(i) for i in range(self.nb_agents)]
         self.inv = [0.0 for i in range(self.nb_agents)]
-        self.cur_opp = [-1 for i in range(self.nb_agents)]
+        self.cur_opp = [None for i in range(self.nb_agents)]
         self.cur_its = np.array([0 for i in range(self.nb_agents)], dtype=int)
@@ -51,16 +54,24 @@ def __init__(self, env_config):
         self.site_action = np.linspace(0, self.max_action, self.nb_sites)
         self.site_acceptance_threshold = np.copy(self.site_action)
+        self.force_opp = None
+
+        assert(not self.eval_mode or self.good_site_prob == 1)
 
-    def reset(self):
+    def reset(self, *, force_cur_opp=None):
+        if force_cur_opp is not None and not self.eval_mode:
+            raise ValueError("force config only allowed in eval mode")
+        self.force_opp = force_cur_opp
         self.cur_its = np.array([0 for i in range(self.nb_agents)], dtype=int)
-        self.cur_opp = [None for i in range(self.nb_agents)]
+        self.cur_opp = [force_cur_opp for i in range(self.nb_agents)]
         # Make all individuals make their investment choice
         return {
             f'inv{i:02}': np.array([0], dtype=np.float32) for i in range(self.nb_agents)
         }
 
     def _find_opp(self):
+        if self.force_opp is not None:
+            return self.force_opp, True
         true_site = True
         if np.random.rand() < 1 - self.good_site_prob:
             true_site = False
@@ -80,7 +91,7 @@ def step(self, action_dict):
             choice = f"choice{ind:02d}"
             # If we get a investment action
             if agent_name.startswith('inv'):
-                self.inv[ind] = action_dict[agent_name][0]
+                self.inv[ind] = np.asarray(action_dict[agent_name]).flatten()[0]
                 self.cur_opp[ind], self.true_site[ind] = self._find_opp()
                 obs[choice] = np.array([self.site_action[self.cur_opp[ind]], self.inv[ind]])
                 reward[choice] = 0  # dummy reward at init
@@ -88,7 +99,8 @@ def step(self, action_dict):
                 self.cur_its[ind] += 1
                 curopp = self.cur_opp[ind]
-                info[inv] = {'inv': self.inv[ind], 'other': self.site_acceptance_threshold[curopp],
+                assert(isinstance(curopp, int))
+                info[inv] = {'inv': self.inv[ind], 'other': self.site_action[curopp],
                              'accept': action_dict[choice]}
                 # if they both agree
                 if action_dict[choice] == 1 and self.inv[ind] >= self.site_acceptance_threshold[curopp]\
@@ -101,10 +113,11 @@ def step(self, action_dict):
                     done[choice] = True
                     done[inv] = True
                     obs[inv] = np.array([0], dtype=np.float32)
-                    self.cur_its[ind] = self.max_it
+                    if not self.eval_mode:
+                        self.cur_its[ind] = self.max_it  # force end of experiment
                 else:  # if at least one disagree or not a real site
                     done[choice] = False
-                    self.cur_opp[ind] = self._find_opp()
+                    self.cur_opp[ind], self.true_site[ind] = self._find_opp()
                     if self.new_x_each_interaction:
                         obs[inv] = np.array([0], dtype=np.float32)
                     reward[inv] = 0
diff --git a/cma_test.py b/cma_test.py
index 76f8299..5e2a85a 100644
--- a/cma_test.py
+++ b/cma_test.py
@@ -10,6 +10,7 @@
 from gym.spaces import Discrete, Box
 from ray.rllib.evaluation import collect_metrics
 from ray.rllib.models import ModelCatalog
+from ray.rllib.policy.policy import clip_action
 from ray.rllib.policy import build_torch_policy
 from ray.rllib.utils.filter import get_filter
 from ray.tune import register_env
@@ -129,7 +130,8 @@ def train(config, reporter):
         timestep_total += 1
         act = {}
         for key in obs:
-            act[key] = policies[key].compute_actions([obs[key]])[0][0]
+            act[key] = clip_action(policies[key].compute_actions([obs[key]])[0],
+                                   policies[key].action_space_struct)[0]
         obs, reward, done, info = env.step(act)
         for key in reward:
             totrewards[key] += reward[key]
@@ -185,7 +187,8 @@ def evaluate(best, env, i_episode, policies, reporter, timestep_total):
         stepcount += 1
         act = {}
         for key in obs:
-            act[key] = policies[key].compute_actions([obs[key]])[0][0]
+            act[key] = clip_action(policies[key].compute_actions([obs[key]])[0],
+                                   policies[key].action_space_struct)[0]
         if "inv00" in act:
             inv_through_eval.append(act["inv00"])
         obs, reward, done, info = env.step(act)
diff --git a/evaluate_at_checkpoint.py b/evaluate_at_checkpoint.py
index 6a487ea..1038409 100644
--- a/evaluate_at_checkpoint.py
+++ b/evaluate_at_checkpoint.py
@@ -1,45 +1,143 @@
-from ray.tune import register_env
+import glob
+import logging
+import re
+from copy import copy
+from pathlib import Path
+from typing import List
+from typing.re import Pattern
+import numpy as np
+import pickle
+import tqdm
+from ray.tune import register_env
+from ray.tune.logger import pretty_print
+from timer import timer
 from PartnerChoiceEnv import PartnerChoiceFakeSites
 import ray
 from cma_test import CMAESTorchPolicy
-from ray.rllib.agents.ppo import PPOTrainer
-
+from ray.rllib.agents.ppo import PPOTrainer, PPOTorchPolicy
+import pandas as pd
 from main_test import init_setup, select_policy, MyCallbacks
+
+logging.basicConfig(level=logging.DEBUG)
+
 policies = init_setup()
 config = {
-    "num_envs_per_worker": 16,
-    "multiagent": {
-        "policies": policies,
-        "policy_mapping_fn": select_policy,
-    },
-    "clip_actions": True,
-    "framework": "torch",
-    "no_done_at_end": True,
-    "gamma": 1,
-    "lr": 5e-3,
-    "num_sgd_iter": 10,
-    "callbacks": MyCallbacks,
-    "env": "partner_choice",
-    "env_config":
-        {
-            "good_site_prob": 1,
-            "max_it": 100
-        }
-    }
-
-def main():
-    ray.init(local_mode=True)
-    register_env("partner_choice",
-                 lambda config: PartnerChoiceFakeSites(config))
+    "num_envs_per_worker": 16,
+    "num_workers": 0,
+    "multiagent": {
+        "policies": policies,
+        "policy_mapping_fn": select_policy,
+    },
+    "clip_actions": True,
+    "framework": "torch",
+    "no_done_at_end": True,
+    "gamma": 1,
+    "lr": 5e-3,
+    "num_sgd_iter": 10,
+    "callbacks": MyCallbacks,
+    "env": "partner_choice",
+    "env_config":
+        {
+            "good_site_prob": 1,
+            "max_it": 100
+        }
+}
+
+
+@timer
+def bench(path):
+    if "cma" in path:
+        agent = loadcma(path)
+    else:
+        agent = loadppo(path)
+    config["env_config"]["eval_mode"] = True
+    config["env_config"]["good_site_prob"] = 1
+    config["env_config"]["max_it"] = 100
+    env = PartnerChoiceFakeSites(config["env_config"])
+    logs = []
+    for i_opp in range(env.nb_sites):
+        obs = env.reset(force_cur_opp=i_opp)
+        done = {"__all__": False}
+        while not done["__all__"]:
+            act = {}
+            for key in obs:
+                act[key] = agent.get_policy(key).compute_actions(obs[key].reshape(1, -1))[0].flatten()[0]
+            obs, reward, done, info = env.step(act)
+            if "inv00" in info:
+                assert (isinstance(info["inv00"]["other"], float))
+                logs.append(copy(info["inv00"]))
+    df = pd.DataFrame(logs)
+    return df
+
+
+@timer
+def loadppo(path):
     agent = PPOTrainer(config)
-    agent.restore("/Users/paulecoffet/Documents/isir/These/data/RLCoopExp/logs/paperrun/e200000/ppobiglr/goodsiteprob_20201120-221749/PPO_partner_choice_d7c84_00023_23_good_site_prob=1.0,max_it=100.0_2020-11-21_00-43-53/checkpoint_594/checkpoint-594")
+    agent.load_checkpoint(path)
+    return agent
+
+@timer
+def loadcma(path):
+    # hackish lookalike
+    class FakeAgentDict(dict):
+        def get_policy(self, policy):
+            return self[policy]
+    agent = FakeAgentDict()
+    with open(path, "rb") as f:
+        bests = pickle.load(f)
+    for key, params in config["multiagent"]["policies"].items():
+        agent[key] = CMAESTorchPolicy(*params[1:])
+        agent[key].set_flat_weights(bests[key])
+    return agent
+
-    print(agent)
-    agent.compute_action()
+def get_highest(vals: List[str], *, pattern: Pattern = ""):
+    if vals is None or len(vals) == 0:
+        return None
+    m = np.argmax([float(re.search(pattern, val).group("target")) for val in vals])
+    return vals[m]
 
 
 if __name__ == "__main__":
-    main()
+    ray.init(local_mode=True)
+    register_env("partner_choice",
+                 lambda config: PartnerChoiceFakeSites(config))
+
+    main_path = Path("/Users/paulecoffet/Documents/isir/These/data/RLCoopExp/logs/paperrun2/e1000000/ppobiglr/")
+    glob_path = main_path
+    alldfs = []
+    with timer("glob"):
+        allpaths = list(main_path.rglob("**/*"))
+    for path in tqdm.tqdm(allpaths):
+        res = re.search(r"partner_choice_(?P<runid>.*)_(?P<trialid>\d+)_good_site_prob=(?P<prob>[0-9.]*),", str(path))
+        if not res:
+            continue
+        run_id = res.group("runid")
+        trial_id = res.group("trialid")
+        good_site_prob = res.group("prob")
+        if "cma" in str(path):
+            checkpoint_path = path / "checkpoint200000/best.pkl"
+        else:
+            checkpoint_path = get_highest([str(c) for c in path.glob("checkpoint*/*")
+                                           if "tune_metadata" not in str(c) and ".is_check" not in str(c)],
+                                          pattern=r"checkpoint[-_]?(?P<target>[-0-9]+)/")
+        if not checkpoint_path or not Path(checkpoint_path).exists():
+            print("no checkpoint for", path)
+            continue
+        try:
+            df = bench(str(checkpoint_path))
+        except Exception as e:
+            print(checkpoint_path)
+            print(type(e), e)
+        else:
+            df["run_id"] = run_id
+            df["trial_id"] = trial_id
+            df["good_site_prob"] = df["p"] = good_site_prob
+            df["checkpoint_path"] = checkpoint_path
+            alldfs.append(df)
+    with timer("saving"):
+        fuldf = pd.concat(alldfs)
+        fuldf.to_csv(main_path / "postmortem.csv.gz")
diff --git a/main_test.py b/main_test.py
index 02cc916..3e12276 100644
--- a/main_test.py
+++ b/main_test.py
@@ -87,11 +87,13 @@ def init_setup():
     choicemodel_dict = {
         "model": {
             "fcnet_hiddens": [3],
+            "max_seq_len": 9999999
         }
     }
     investormodel_dict = {
         "model": {
-            "fcnet_hiddens": []
+            "fcnet_hiddens": [],
+            "max_seq_len": 9999999
         }
     }
     policies = {inv_id[i]: (None, inv_obs_space, inv_act_space, investormodel_dict) for i in range(nb_agents)}
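# --- Usage sketch (illustrative only; not part of the patch above) ----------
# A minimal example of how the eval-mode hooks added in PartnerChoiceEnv.py
# could be exercised, mirroring what bench() in evaluate_at_checkpoint.py does:
# pass "eval_mode": True in env_config and pin the opponent site with
# reset(force_cur_opp=...). The zero "dummy" actions below are an assumption
# for illustration and merely stand in for a trained policy.

from PartnerChoiceEnv import PartnerChoiceFakeSites

env = PartnerChoiceFakeSites({
    "eval_mode": True,       # required before reset(force_cur_opp=...) is allowed
    "good_site_prob": 1,     # eval_mode asserts good_site_prob == 1
    "max_it": 100,
})

for i_opp in range(env.nb_sites):
    obs = env.reset(force_cur_opp=i_opp)   # every interaction now faces site i_opp
    done = {"__all__": False}
    while not done["__all__"]:
        act = {key: 0 for key in obs}      # placeholder actions, not a real policy
        obs, reward, done, info = env.step(act)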