Commit df7d500

…into main

PaulEcoffet committed Dec 4, 2020
2 parents 2022c2d + b67fda1 commit df7d500
Showing 4 changed files with 157 additions and 41 deletions.
27 changes: 20 additions & 7 deletions PartnerChoiceEnv.py
@@ -28,6 +28,8 @@ def payoff(xi, xj, *, a=5, b=5, temp=None):
 class PartnerChoiceFakeSites(MultiAgentEnv):
 
     def __init__(self, env_config):
+        if "env_config" in env_config:
+            raise ValueError("Do not provide the full config but only the env_config")
         if "bad_site_prob" in env_config:
            raise ValueError("deprecated key bad_site_prob")
 
@@ -37,12 +39,13 @@ def __init__(self, env_config):
         self.new_x_each_interaction = env_config.get("new_x_each_interaction", True)
         self.max_action = env_config.get("max_action", 15.0)
         self.good_site_prob = env_config.get("good_site_prob", 1)
+        self.eval_mode = env_config.get("eval_mode", False)
         self.iteration_count = 0
         self.true_site = [True for i in range(self.nb_agents)]
         self.agents_names = ['inv' + '{:02d}'.format(i) for i in range(self.nb_agents)]
         self.agents_names += ['choice' + '{:02d}'.format(i) for i in range(self.nb_agents)]
         self.inv = [0.0 for i in range(self.nb_agents)]
-        self.cur_opp = [-1 for i in range(self.nb_agents)]
+        self.cur_opp = [None for i in range(self.nb_agents)]
 
         self.cur_its = np.array([0 for i in range(self.nb_agents)], dtype=int)
 
@@ -51,16 +54,24 @@ def __init__(self, env_config):
 
         self.site_action = np.linspace(0, self.max_action, self.nb_sites)
         self.site_acceptance_threshold = np.copy(self.site_action)
+        self.force_opp = None
 
+        assert(not self.eval_mode or self.good_site_prob == 1)
 
-    def reset(self):
+    def reset(self, *, force_cur_opp=None):
+        if force_cur_opp is not None and not self.eval_mode:
+            raise ValueError("force config only allowed in eval mode")
+        self.force_opp = force_cur_opp
         self.cur_its = np.array([0 for i in range(self.nb_agents)], dtype=int)
-        self.cur_opp = [None for i in range(self.nb_agents)]
+        self.cur_opp = [force_cur_opp for i in range(self.nb_agents)]
         # Make all individuals make their investment choice
         return {
             f'inv{i:02}': np.array([0], dtype=np.float32) for i in range(self.nb_agents)
         }
 
     def _find_opp(self):
+        if self.force_opp is not None:
+            return self.force_opp, True
         true_site = True
         if np.random.rand() < 1 - self.good_site_prob:
             true_site = False
@@ -80,15 +91,16 @@ def step(self, action_dict):
             choice = f"choice{ind:02d}"
             # If we get an investment action
             if agent_name.startswith('inv'):
-                self.inv[ind] = action_dict[agent_name][0]
+                self.inv[ind] = np.asarray(action_dict[agent_name]).flatten()[0]
                 self.cur_opp[ind], self.true_site[ind] = self._find_opp()
                 obs[choice] = np.array([self.site_action[self.cur_opp[ind]], self.inv[ind]])
                 reward[choice] = 0  # dummy reward at init
             else:  # if it's a choice action
                 self.cur_its[ind] += 1
 
                 curopp = self.cur_opp[ind]
-                info[inv] = {'inv': self.inv[ind], 'other': self.site_acceptance_threshold[curopp],
+                assert(isinstance(curopp, int))
+                info[inv] = {'inv': self.inv[ind], 'other': self.site_action[curopp],
                              'accept': action_dict[choice]}
                 # if they both agree
                 if action_dict[choice] == 1 and self.inv[ind] >= self.site_acceptance_threshold[curopp]\
@@ -101,10 +113,11 @@ def step(self, action_dict):
                     done[choice] = True
                     done[inv] = True
                     obs[inv] = np.array([0], dtype=np.float32)
-                    self.cur_its[ind] = self.max_it
+                    if not self.eval_mode:
+                        self.cur_its[ind] = self.max_it  # force end of experiment
                 else:  # if at least one disagree or not a real site
                     done[choice] = False
-                    self.cur_opp[ind] = self._find_opp()
+                    self.cur_opp[ind], self.true_site[ind] = self._find_opp()
                     if self.new_x_each_interaction:
                         obs[inv] = np.array([0], dtype=np.float32)
                         reward[inv] = 0
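A minimal sketch of how the new eval-mode hooks fit together, mirroring the bench() loop added in evaluate_at_checkpoint.py below; the policy call is a placeholder:

    # Probe each site deterministically: eval_mode requires good_site_prob == 1
    # (see the assert above), and reset(force_cur_opp=i) pins every agent's
    # opponent to site i instead of drawing it at random.
    from PartnerChoiceEnv import PartnerChoiceFakeSites

    env = PartnerChoiceFakeSites({"good_site_prob": 1, "max_it": 100, "eval_mode": True})
    for i_opp in range(env.nb_sites):
        obs = env.reset(force_cur_opp=i_opp)   # every agent now faces site i_opp
        done = {"__all__": False}
        while not done["__all__"]:
            act = {key: my_policy(obs[key]) for key in obs}  # my_policy: placeholder
            obs, reward, done, info = env.step(act)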
7 changes: 5 additions & 2 deletions cma_test.py
@@ -10,6 +10,7 @@
 from gym.spaces import Discrete, Box
 from ray.rllib.evaluation import collect_metrics
 from ray.rllib.models import ModelCatalog
+from ray.rllib.policy.policy import clip_action
 from ray.rllib.policy import build_torch_policy
 from ray.rllib.utils.filter import get_filter
 from ray.tune import register_env
@@ -129,7 +130,8 @@ def train(config, reporter):
             timestep_total += 1
             act = {}
             for key in obs:
-                act[key] = policies[key].compute_actions([obs[key]])[0][0]
+                act[key] = clip_action(policies[key].compute_actions([obs[key]])[0],
+                                       policies[key].action_space_struct)[0]
             obs, reward, done, info = env.step(act)
             for key in reward:
                 totrewards[key] += reward[key]
@@ -185,7 +187,8 @@ def evaluate(best, env, i_episode, policies, reporter, timestep_total):
         stepcount += 1
         act = {}
         for key in obs:
-            act[key] = policies[key].compute_actions([obs[key]])[0][0]
+            act[key] = clip_action(policies[key].compute_actions([obs[key]])[0],
+                                   policies[key].action_space_struct)[0]
         if "inv00" in act:
             inv_through_eval.append(act["inv00"])
         obs, reward, done, info = env.step(act)
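For context: Policy.compute_actions() returns a tuple (action batch, RNN state outs, extra info) of raw model outputs, and for a Box action space the raw actions can fall outside the space's bounds. RLlib trainers normally clip sampled actions when "clip_actions" is enabled in the config, but a hand-rolled rollout loop like this one must clip explicitly. A condensed sketch of the pattern, with names as in the diff:

    # take the raw action batch, clip it to the policy's action space,
    # then unbatch the single action
    raw_batch = policies[key].compute_actions([obs[key]])[0]
    act[key] = clip_action(raw_batch, policies[key].action_space_struct)[0]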
160 changes: 129 additions & 31 deletions evaluate_at_checkpoint.py
@@ -1,45 +1,143 @@
-from ray.tune import register_env
+import glob
+import logging
+import re
+from copy import copy
+from pathlib import Path
+from typing import List
+from typing.re import Pattern
+import numpy as np
+import pickle
+import tqdm
+
+from ray.tune import register_env
+from ray.tune.logger import pretty_print
+from timer import timer
 from PartnerChoiceEnv import PartnerChoiceFakeSites
 import ray
+from cma_test import CMAESTorchPolicy
-from ray.rllib.agents.ppo import PPOTrainer
+from ray.rllib.agents.ppo import PPOTrainer, PPOTorchPolicy
+import pandas as pd
 from main_test import init_setup, select_policy, MyCallbacks
 
 
+logging.basicConfig(level=logging.DEBUG)
 
 policies = init_setup()

 config = {
-    "num_envs_per_worker": 16,
-    "multiagent": {
-        "policies": policies,
-        "policy_mapping_fn": select_policy,
-    },
-    "clip_actions": True,
-    "framework": "torch",
-    "no_done_at_end": True,
-    "gamma": 1,
-    "lr": 5e-3,
-    "num_sgd_iter": 10,
-    "callbacks": MyCallbacks,
-    "env": "partner_choice",
-    "env_config":
-        {
-            "good_site_prob": 1,
-            "max_it": 100
-        }
-}
-
-def main():
-    ray.init(local_mode=True)
-    register_env("partner_choice",
-                 lambda config: PartnerChoiceFakeSites(config))
+    "num_envs_per_worker": 16,
+    "num_workers": 0,
+    "multiagent": {
+        "policies": policies,
+        "policy_mapping_fn": select_policy,
+    },
+    "clip_actions": True,
+    "framework": "torch",
+    "no_done_at_end": True,
+    "gamma": 1,
+    "lr": 5e-3,
+    "num_sgd_iter": 10,
+    "callbacks": MyCallbacks,
+    "env": "partner_choice",
+    "env_config":
+        {
+            "good_site_prob": 1,
+            "max_it": 100
+        }
+}


+@timer
+def bench(path):
+    if "cma" in path:
+        agent = loadcma(path)
+    else:
+        agent = loadppo(path)
+    config["env_config"]["eval_mode"] = True
+    config["env_config"]["good_site_prob"] = 1
+    config["env_config"]["max_it"] = 100
+    env = PartnerChoiceFakeSites(config["env_config"])
+    logs = []
+    for i_opp in range(env.nb_sites):
+        obs = env.reset(force_cur_opp=i_opp)
+        done = {"__all__": False}
+        while not done["__all__"]:
+            act = {}
+            for key in obs:
+                act[key] = agent.get_policy(key).compute_actions(obs[key].reshape(1, -1))[0].flatten()[0]
+            obs, reward, done, info = env.step(act)
+            if "inv00" in info:
+                assert (isinstance(info["inv00"]["other"], float))
+                logs.append(copy(info["inv00"]))
+    df = pd.DataFrame(logs)
+    return df
+
+
+@timer
+def loadppo(path):
     agent = PPOTrainer(config)
-    agent.restore("/Users/paulecoffet/Documents/isir/These/data/RLCoopExp/logs/paperrun/e200000/ppobiglr/goodsiteprob_20201120-221749/PPO_partner_choice_d7c84_00023_23_good_site_prob=1.0,max_it=100.0_2020-11-21_00-43-53/checkpoint_594/checkpoint-594")
+    agent.load_checkpoint(path)
+    return agent

+@timer
+def loadcma(path):
+    # hackish lookalike
+    class FakeAgentDict(dict):
+        def get_policy(self, policy):
+            return self[policy]
+    agent = FakeAgentDict()
+    with open(path, "rb") as f:
+        bests = pickle.load(f)
+    for key, params in config["multiagent"]["policies"].items():
+        agent[key] = CMAESTorchPolicy(*params[1:])
+        agent[key].set_flat_weights(bests[key])
+    return agent
 
 
-    print(agent)
-    agent.compute_action()
+def get_highest(vals: List[str], *, pattern: Pattern = ""):
+    if vals is None or len(vals) == 0:
+        return None
+    m = np.argmax([float(re.search(pattern, val).group("target")) for val in vals])
+    return vals[m]


 if __name__ == "__main__":
-    main()
+    ray.init(local_mode=True)
+    register_env("partner_choice",
+                 lambda config: PartnerChoiceFakeSites(config))
+
+    main_path = Path("/Users/paulecoffet/Documents/isir/These/data/RLCoopExp/logs/paperrun2/e1000000/ppobiglr/")
+    glob_path = main_path
+    alldfs = []
+    with timer("glob"):
+        allpaths = list(main_path.rglob("**/*"))
+    for path in tqdm.tqdm(allpaths):
+        res = re.search(r"partner_choice_(?P<trialid>.*)_(?P<runid>\d+)_good_site_prob=(?P<prob>[0-9.]*),", str(path))
+        if not res:
+            continue
+        run_id = res.group("runid")
+        trial_id = res.group("trialid")
+        good_site_prob = res.group("prob")
+        if "cma" in str(path):
+            checkpoint_path = path / "checkpoint200000/best.pkl"
+        else:
+            checkpoint_path = get_highest([str(c) for c in path.glob("checkpoint*/*")
+                                           if "tune_metadata" not in str(c) and ".is_check" not in str(c)],
+                                          pattern=r"checkpoint[-_]?(?P<target>[-0-9]+)/")
+        if not checkpoint_path or not Path(checkpoint_path).exists():
+            print("no checkpoint for", path)
+            continue
+        try:
+            df = bench(str(checkpoint_path))
+        except Exception as e:
+            print(checkpoint_path)
+            print(type(e), e)
+        else:
+            df["run_id"] = run_id
+            df["trial_id"] = trial_id
+            df["good_site_prob"] = df["p"] = good_site_prob
+            df["checkpoint_path"] = checkpoint_path
+            alldfs.append(df)
+    with timer("saving"):
+        fuldf = pd.concat(alldfs)
+        fuldf.to_csv(main_path / "postmortem.csv.gz")
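A hypothetical example of how get_highest() picks the latest checkpoint (paths invented for illustration):

    paths = ["trial/checkpoint_10/checkpoint-10", "trial/checkpoint_594/checkpoint-594"]
    get_highest(paths, pattern=r"checkpoint[-_]?(?P<target>[-0-9]+)/")
    # -> "trial/checkpoint_594/checkpoint-594"  (argmax over the parsed numbers)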
4 changes: 3 additions & 1 deletion main_test.py
@@ -87,11 +87,13 @@ def init_setup():
     choicemodel_dict = {
         "model": {
             "fcnet_hiddens": [3],
+            "max_seq_len": 9999999
         }
     }
     investormodel_dict = {
         "model": {
-            "fcnet_hiddens": []
+            "fcnet_hiddens": [],
+            "max_seq_len": 9999999
         }
     }
     policies = {inv_id[i]: (None, inv_obs_space, inv_act_space, investormodel_dict) for i in range(nb_agents)}
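For reference, max_seq_len is an RLlib model option that caps how long the sequences are that experience gets chunked into for training (it mainly matters for recurrent models); setting it this high presumably keeps the long episodes used here from being chunked at all. The resulting investor model config, assembled for clarity:

    investormodel_dict = {
        "model": {
            "fcnet_hiddens": [],     # linear policy, no hidden layers
            "max_seq_len": 9999999   # effectively no sequence-length cap
        }
    }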
