diff --git a/pomdp_py/algorithms/po_rollout.pxd b/pomdp_py/algorithms/po_rollout.pxd index 1c5523ae..8bbd180b 100644 --- a/pomdp_py/algorithms/po_rollout.pxd +++ b/pomdp_py/algorithms/po_rollout.pxd @@ -1,4 +1,4 @@ -from pomdp_py.framework.basics cimport Action, State, Observation, Agent +from pomdp_py.framework.basics cimport Action, State, Observation, Agent, Response from pomdp_py.framework.planner cimport Planner from pomdp_py.algorithms.po_uct cimport RolloutPolicy, ActionPrior @@ -11,7 +11,7 @@ cdef class PORollout(Planner): cdef float _discount_factor cdef bint _particles cdef Agent _agent - cdef float _last_best_reward + cdef Response _last_best_response cpdef _search(self) cpdef _rollout(self, State state, int depth) diff --git a/pomdp_py/algorithms/po_rollout.pyx b/pomdp_py/algorithms/po_rollout.pyx index 324cf3d2..d8f3cbe5 100644 --- a/pomdp_py/algorithms/po_rollout.pyx +++ b/pomdp_py/algorithms/po_rollout.pyx @@ -15,7 +15,7 @@ it will do the rollouts and action selection as described. from pomdp_py.framework.basics cimport Action, Agent, POMDP, State, Observation,\ ObservationModel, TransitionModel, GenerativeDistribution, PolicyModel,\ - sample_generative_model + sample_generative_model, Response from pomdp_py.framework.planner cimport Planner from pomdp_py.representations.distribution.particles cimport Particles from pomdp_py.representations.belief.particles cimport particle_reinvigoration @@ -46,58 +46,60 @@ cdef class PORollout(Planner): self._particles = particles self._agent = None - self._last_best_reward = float('-inf') + self._last_best_response = None @property - def last_best_reward(self): - return self._last_best_reward + def last_best_response(self): - return self._last_best_response cpdef public plan(self, Agent agent): self._agent = agent - best_action, best_reward = self._search() - self._last_best_reward = best_reward + best_action, best_response = self._search() + self._last_best_response = best_response return best_action cpdef _search(self): cdef Action best_action - cdef float best_reward, reward_avg, total_discounted_reward + cdef Response best_response + cdef Response response_avg + cdef Response total_discounted_response cdef set legal_actions - cdef list rewards + cdef list responses - best_action, best_reward = None, float("-inf") + best_action, best_response = None, Response(float("-inf")) legal_actions = self._agent.valid_actions(history=self._agent.history) for action in legal_actions: - rewards = [] + responses = [] for i in range(self._num_sims // len(legal_actions)): state = self._agent.belief.random() - total_discounted_reward = self._rollout(state, 0) - rewards.append(total_discounted_reward) - reward_avg = sum(rewards) / len(rewards) - if reward_avg > best_reward: + total_discounted_response = self._rollout(state, 0) + responses.append(total_discounted_response) + response_avg = sum(responses) * (1.0 / len(responses)) + if response_avg > best_response: best_action = action - best_reward = reward_avg - return best_action, best_reward + best_response = response_avg + return best_action, best_response cpdef _rollout(self, State state, int depth): # Rollout without a tree. 
cdef Action action cdef float discount = 1.0 - cdef float total_discounted_reward = 0 + cdef Response total_discounted_response = Response() cdef State next_state cdef Observation observation - cdef float reward + cdef Response response cdef int nsteps cdef tuple history = self._agent.history while depth <= self._max_depth: action = self._rollout_policy.rollout(state, history=history) - next_state, observation, reward, nsteps = sample_generative_model(self._agent, state, action) + next_state, observation, response, nsteps = sample_generative_model(self._agent, state, action) history = history + ((action, observation),) depth += 1 - total_discounted_reward += reward * discount + total_discounted_response = total_discounted_response + response * discount discount *= self._discount_factor state = next_state - return total_discounted_reward + return total_discounted_response cpdef update(self, Agent agent, Action real_action, Observation real_observation, state_transform_func=None): @@ -110,7 +112,7 @@ cdef class PORollout(Planner): if not isinstance(cur_belief, Particles): raise ValueError("Agent's belief is not in particles.") for state in cur_belief.particles: - next_state, observation, reward, nsteps = sample_generative_model(agent, state, + next_state, observation, response, nsteps = sample_generative_model(agent, state, real_action) if observation == real_observation: new_belief.add(next_state) @@ -128,8 +130,8 @@ cdef class PORollout(Planner): def clear_agent(self): """clear_agent(self)""" self._agent = None # forget about current agent so that can plan for another agent. - self._last_best_reward = float('-inf') - + self._last_best_response = Response(float('-inf')) + cpdef set_rollout_policy(self, RolloutPolicy rollout_policy): """ set_rollout_policy(self, RolloutPolicy rollout_policy) diff --git a/pomdp_py/algorithms/po_uct.pxd b/pomdp_py/algorithms/po_uct.pxd index 6f66fffd..3517d8d7 100644 --- a/pomdp_py/algorithms/po_uct.pxd +++ b/pomdp_py/algorithms/po_uct.pxd @@ -1,5 +1,5 @@ from pomdp_py.framework.planner cimport Planner -from pomdp_py.framework.basics cimport Agent, PolicyModel, Action, State, Observation +from pomdp_py.framework.basics cimport Agent, PolicyModel, Action, State, Observation, Response cdef class TreeNode: cdef public dict children @@ -7,10 +7,11 @@ cdef class TreeNode: cdef public float value cdef class QNode(TreeNode): - pass + cpdef void update(QNode self, Response response) cdef class VNode(TreeNode): cpdef argmax(VNode self) + cpdef void update(VNode self) cdef class RootVNode(VNode): cdef public tuple history diff --git a/pomdp_py/algorithms/po_uct.pyx b/pomdp_py/algorithms/po_uct.pyx index c0f02665..838360c4 100644 --- a/pomdp_py/algorithms/po_uct.pyx +++ b/pomdp_py/algorithms/po_uct.pyx @@ -35,7 +35,7 @@ the prior knowledge. 
from pomdp_py.framework.basics cimport Action, Agent, POMDP, State, Observation,\ ObservationModel, TransitionModel, GenerativeDistribution, PolicyModel,\ - sample_generative_model + sample_generative_model, Response from pomdp_py.framework.planner cimport Planner from pomdp_py.representations.distribution.particles cimport Particles from pomdp_py.utils import typ @@ -64,13 +64,20 @@ cdef class QNode(TreeNode): self.num_visits = num_visits self.value = value self.children = {} # o -> VNode + def __str__(self): return typ.red("QNode") + "(%.3f, %.3f | %s)" % (self.num_visits, self.value, str(self.children.keys())) + def __repr__(self): return self.__str__() + cpdef void update(QNode self, Response response): + self.num_visits += 1 + self.value = self.value + (response.reward - self.value) / self.num_visits + + cdef class VNode(TreeNode): def __init__(self, num_visits, **kwargs): self.num_visits = num_visits @@ -98,6 +105,9 @@ cdef class VNode(TreeNode): best_value = self[action].value return best_action + cpdef void update(VNode self): + self.num_visits += 1 + @property def value(self): best_action = max(self.children, key=lambda action: self.children[action].value) @@ -361,7 +371,7 @@ cdef class POUCT(Planner): State state, tuple history, VNode root, QNode parent, Observation observation, int depth): if depth > self._max_depth: - return 0 + return self._agent.response_model.create_response() if root is None: if self._agent.tree is None: root = self._VNode(root=True) @@ -373,46 +383,46 @@ cdef class POUCT(Planner): if parent is not None: parent[observation] = root self._expand_vnode(root, history, state=state) - rollout_reward = self._rollout(state, history, root, depth) - return rollout_reward + rollout_response = self._rollout(state, history, root, depth) + return rollout_response cdef int nsteps action = self._ucb(root) - next_state, observation, reward, nsteps = sample_generative_model(self._agent, state, action) + next_state, observation, response, nsteps = sample_generative_model(self._agent, state, action) if nsteps == 0: # This indicates the provided action didn't lead to transition # Perhaps the action is not allowed to be performed for the given state # (for example, the state is not in the initiation set of the option, # or the state is a terminal state) - return reward + return response - total_reward = reward + (self._discount_factor**nsteps)*self._simulate(next_state, + total_response = response + (self._discount_factor**nsteps)*self._simulate(next_state, history + ((action, observation),), root[action][observation], root[action], observation, depth+nsteps) - root.num_visits += 1 - root[action].num_visits += 1 - root[action].value = root[action].value + (total_reward - root[action].value) / (root[action].num_visits) - return total_reward + + root.update() + root[action].update(total_response) + return total_response cpdef _rollout(self, State state, tuple history, VNode root, int depth): cdef Action action cdef float discount = 1.0 - cdef float total_discounted_reward = 0 + cdef Response total_discounted_response = self._agent.response_model.create_response() cdef State next_state cdef Observation observation - cdef float reward + cdef Response response while depth < self._max_depth: action = self._rollout_policy.rollout(state, history) - next_state, observation, reward, nsteps = sample_generative_model(self._agent, state, action) + next_state, observation, response, nsteps = sample_generative_model(self._agent, state, action) history = history + ((action, observation),) depth 
+= nsteps - total_discounted_reward += reward * discount + total_discounted_response = total_discounted_response + response * discount discount *= (self._discount_factor**nsteps) state = next_state - return total_discounted_reward + return total_discounted_response cpdef Action _ucb(self, VNode root): """UCB1""" @@ -436,15 +446,15 @@ cdef class POUCT(Planner): ''' cdef State next_state cdef Observation observation - cdef float reward + cdef Response response if self._agent.transition_model is None: - next_state, observation, reward = self._agent.generative_model.sample(state, action) + next_state, observation, response = self._agent.generative_model.sample(state, action) else: next_state = self._agent.transition_model.sample(state, action) observation = self._agent.observation_model.sample(next_state, action) - reward = self._agent.reward_model.sample(state, action, next_state) - return next_state, observation, reward + response = self._agent.response_model.sample(state, action, next_state) + return next_state, observation, response def _VNode(self, root=False, **kwargs): """Returns a VNode with default values; The function naming makes it clear diff --git a/pomdp_py/algorithms/pomcp.pyx b/pomdp_py/algorithms/pomcp.pyx index 349b8127..52804cd6 100644 --- a/pomdp_py/algorithms/pomcp.pyx +++ b/pomdp_py/algorithms/pomcp.pyx @@ -128,10 +128,10 @@ cdef class POMCP(POUCT): cpdef _simulate(POMCP self, State state, tuple history, VNode root, QNode parent, Observation observation, int depth): - total_reward = POUCT._simulate(self, state, history, root, parent, observation, depth) + total_response = POUCT._simulate(self, state, history, root, parent, observation, depth) if depth == 1 and root is not None: root.belief.add(state) # belief update happens as simulation goes. 
- return total_reward + return total_response def _VNode(self, root=False, **kwargs): """Returns a VNode with default values; The function naming makes it clear diff --git a/pomdp_py/algorithms/value_iteration.pyx b/pomdp_py/algorithms/value_iteration.pyx index 680e083e..52a3f8ea 100644 --- a/pomdp_py/algorithms/value_iteration.pyx +++ b/pomdp_py/algorithms/value_iteration.pyx @@ -48,8 +48,8 @@ cdef class _PolicyTreeNode: subtree_value = self.children[o].values[sp] # corresponds to V_{oi(p)} in paper else: subtree_value = 0.0 - reward = self._agent.reward_model.sample(s, self.action, sp) - expected_future_value += trans_prob * obsrv_prob * (reward + discount_factor*subtree_value) + response = self._agent.response_model.sample(s, self.action, sp) + expected_future_value += trans_prob * obsrv_prob * (response.reward + discount_factor*subtree_value) values[s] = expected_future_value return values diff --git a/pomdp_py/framework/basics.pxd b/pomdp_py/framework/basics.pxd index b3824538..70f1a5ad 100644 --- a/pomdp_py/framework/basics.pxd +++ b/pomdp_py/framework/basics.pxd @@ -8,6 +8,12 @@ cdef class TransitionModel: pass cdef class PolicyModel: pass + +cdef class ResponseModel: + cdef dict _model_dict + cdef Response _response + cdef dict __dict__ + cdef class BlackboxModel: pass cdef class RewardModel: @@ -27,6 +33,12 @@ cdef class State: cdef class Observation: pass +cdef class Vector(list): + pass + +cdef class Response: + cdef float _reward + cdef class Agent: cdef GenerativeDistribution _init_belief cdef PolicyModel _policy_model @@ -41,7 +53,7 @@ cdef class Agent: cdef class Environment: cdef State _init_state cdef TransitionModel _transition_model - cdef RewardModel _reward_model + cdef ResponseModel _response_model cdef BlackboxModel _blackbox_model cdef State _cur_state @@ -49,4 +61,4 @@ cdef class Option(Action): pass cpdef sample_generative_model(Agent agent, State state, Action action, float discount_factor=*) -cpdef sample_explict_models(TransitionModel T, ObservationModel O, RewardModel R, State state, Action a, float discount_factor=*) +cpdef sample_explict_models(TransitionModel T, ObservationModel O, ResponseModel R, State state, Action a, float discount_factor=*) diff --git a/pomdp_py/framework/basics.pyx b/pomdp_py/framework/basics.pyx index d53c0b35..7da4af41 100644 --- a/pomdp_py/framework/basics.pyx +++ b/pomdp_py/framework/basics.pyx @@ -186,6 +186,100 @@ cdef class RewardModel: Returns the underlying distribution of the model""" raise NotImplementedError +cdef class ResponseModel: + """A ResponseModel returns a real or simulated response + after the agent interacts with the real or a simulated environment. + The implementation of this model contains a collection of more + specific models such as reward and cost models.""" + def __init__(self, response): + self._model_dict = dict() + self._response = response + + @staticmethod + def generate_response_model(model_dict, response=Response()): + """ + Generate a response model based on a dictionary of model attributes. This is a + convenience method to make it easier to build a Response model. + + Args: + model_dict (dict): A dictionary of models in the form {model_type: model} (e.g., {reward: reward_model}) + response (Response): A response that will be used to generate new responses. + + Returns: + The response model. + """ + # Do a sanity check to ensure the response model and response are compatible. 
+ for name in model_dict.keys(): + if not hasattr(response, name): + raise AttributeError(f"The response {type(response)} does not have the attribute {name}.") + + # Create the response model and add the models. + model = ResponseModel(response) + model.add_models(model_dict) + return model + + def add_attrs(self, attr_dict): + """ + Adds attributes to this object dynamically. + + Args: + attr_dict: A dictionary of attribute names and values. + """ + if not isinstance(attr_dict, dict): + raise TypeError(f"attr_dict must be type dict, but got {type(attr_dict)}.") + + for ak, av in attr_dict.items(): + if hasattr(self, ak): + raise KeyError(f"The attribute {ak} already exists.") + setattr(self, ak, av) + + def add_models(self, model_dict): + """ + Add models to the response. + + Args: + model_dict: A dictionary of models in the form {model_type: model} (e.g., {reward: reward_model}). + """ + if not isinstance(model_dict, dict): + raise TypeError(f"model_dict must be type dict, but got {type(model_dict)}.") + + for model_name, model in model_dict.items(): + # Perform a sanity check. + if not hasattr(model, "sample"): + raise AttributeError(f"The model {model_name} does not have a sample(...) function.") + + # Store the model name for quick access in sample(...) function. + self._model_dict[model_name] = model + + # Add the models to the response model. + self.add_attrs(model_dict) + + def sample(self, state, action, next_state, **kwargs): + """sample(self, state, action, next_state) + Returns a randomly sampled response according to the + distribution of the internal models. + + Args: + state (~pomdp_py.framework.basics.State): the state :math:`s` + action (~pomdp_py.framework.basics.Action): the action :math:`a` + next_state (State): the next state :math:`s'` + Returns: + Response: the response + """ + return self.create_response(**dict([ + (name, model.sample(state, action, next_state, **kwargs)) + for name, model in self._model_dict.items() + ])) + + def create_response(self, *args, **kwargs): + """ + Create a response with the given arguments. + + Returns: + An instance of :class:`Response` with the given parameters. + """ + return self._response.new(*args, **kwargs) + cdef class BlackboxModel: """ A BlackboxModel is the generative distribution :math:`G(s,a)` @@ -317,33 +411,132 @@ cdef class Observation: def __ne__(self, other): return not self.__eq__(other) +cdef class Vector(list): + """ + The Vector class. Provides an implementation of a vector for multi-valued response models. 
+ """ + def __init__(self, values=list()): + if not isinstance(values, list): + raise TypeError(f"values must be type list, but got {type(values)}.") + for v in values: + self.append(v) + + def __eq__(self, other): + if not isinstance(other, (Vector, list)): + raise TypeError(f"other must be type Vector or list, but got {type(other)}.") + return len(self) == len(other) and all(v0 == v1 for v0, v1 in zip(self, other)) + + def __add__(self, other): + if isinstance(other, (float, int)): + vec = [other] * len(self) + elif isinstance(other, Vector): + vec = other + else: + raise TypeError(f"other must be type Vector, float, or int, but got {type(other)}.") + return Vector([v0 + v1 for v0, v1 in zip(self, vec)]) + + def __radd__(self, other): + return self.__add__(other) + + def __mul__(self, other): + if not isinstance(other, (float, int)): + raise TypeError(f"other must be type float or int, but got {type(other)}.") + return Vector([v * other for v in self]) + + def __rmul__(self, other): + return self.__mul__(other) + + +cdef class Response: + """ + A Response class that only handles a scalar reward. Subclasses of Response can add + more (scalar or vector) variables. But the subclasses must implement how to handle + arithmetic and comparison operations. + """ + def __init__(self, reward=0.0): + super().__init__() + self._reward = reward + + @property + def reward(self): + return self._reward + + @classmethod + def new(cls, reward=0.0): + return cls(reward=reward) + + def _check_reward_compatibility(self, value): + if not isinstance(value, (float, int, Response)): + raise TypeError(f"other must be type Response, float, or int, but got {type(value)}.") + + def _get_value(self, value): + self._check_reward_compatibility(value) + if isinstance(value, Response): + value = value.reward + return value + + def __add__(self, other): + return Response(self._reward + self._get_value(other)) + + def __radd__(self, other): + return self.__add__(other) + + def __mul__(self, other): + if not isinstance(other, (float, int)): + raise TypeError("other must be type float or int.") + return Response(self._reward * other) + + def __rmul__(self, other): + return self.__mul__(other) + + def __eq__(self, other): + return self._reward == self._get_value(other) + + def __ne__(self, other): + return self._reward != self._get_value(other) + + def __lt__(self, other): + return self._reward < self._get_value(other) + + def __le__(self, other): + return self._reward <= self._get_value(other) + + def __gt__(self, other): + return self._reward > self._get_value(other) + + def __ge__(self, other): + return self._reward >= self._get_value(other) + + def __str__(self): + return f"reward={self._reward}" + cdef class Agent: """ An Agent operates in an environment by taking actions, receiving observations, and updating its belief. Taking actions is the job of a planner (:class:`Planner`), and the belief update is the job taken care of by the belief representation or the planner. But, the Agent supplies the - :class:`TransitionModel`, :class:`ObservationModel`, :class:`RewardModel`, + :class:`TransitionModel`, :class:`ObservationModel`, :class:`ResponseModel`, OR :class:`BlackboxModel` to the planner or the belief update algorithm. 
__init__(self, init_belief, policy_model, transition_model=None, observation_model=None, - reward_model=None, + response_model=None, blackbox_model=None) """ def __init__(self, init_belief, policy_model=None, transition_model=None, observation_model=None, - reward_model=None, + response_model=None, blackbox_model=None): self._init_belief = init_belief self._policy_model = policy_model self._transition_model = transition_model self._observation_model = observation_model - self._reward_model = reward_model + self._response_model = response_model self._blackbox_model = blackbox_model # For online planning @@ -399,8 +592,8 @@ cdef class Agent: return self._transition_model @property - def reward_model(self): - return self._reward_model + def response_model(self): + return self._response_model @property def policy_model(self): @@ -415,14 +608,14 @@ cdef class Agent: return self.blackbox_model def set_models(self, transition_model=None, observation_model=None, - reward_model=None, blackbox_model=None, policy_model=None): + response_model=None, blackbox_model=None, policy_model=None): """Re-assign the models to be the ones given.""" if transition_model is not None: self._transition_model = transition_model if observation_model is not None: self._observation_model = observation_model - if reward_model is not None: - self._reward_model = reward_model + if response_model is not None: + self._response_model = response_model if blackbox_model is not None: self._blackbox_model = blackbox_model if policy_model is not None: @@ -478,17 +671,17 @@ cdef class Environment: __init__(self, init_state, transition_model=None, - reward_model=None, + response_model=None, blackbox_model=None) """ def __init__(self, init_state, transition_model=None, - reward_model=None, + response_model=None, blackbox_model=None): self._init_state = init_state self._cur_state = init_state self._transition_model = transition_model - self._reward_model = reward_model + self._response_model = response_model self._blackbox_model = blackbox_model @property @@ -507,21 +700,21 @@ cdef class Environment: return self._transition_model @property - def reward_model(self): - """The :class:`RewardModel` underlying the environment""" - return self._reward_model + def response_model(self): + """The :class:`ResponseModel` underlying the environment""" + return self._response_model @property def blackbox_model(self): """The :class:`BlackboxModel` underlying the environment""" return self._blackbox_model - def set_models(self, transition_model=None, reward_model=None, blackbox_model=None): + def set_models(self, transition_model=None, response_model=None, blackbox_model=None): """Re-assign the models to be the ones given.""" if transition_model is not None: self._transition_model = transition_model - if reward_model is not None: - self._reward_model = reward_model + if response_model is not None: + self._response_model = response_model if blackbox_model is not None: self._blackbox_model = blackbox_model @@ -538,17 +731,17 @@ cdef class Environment: factor when executing actions following an option's policy until reaching terminal condition. Returns: - float or tuple: reward as a result of `action` and state transition, if `execute` is True - (next_state, reward) if `execute` is False. + Response or tuple: response as a result of `action` and state transition, if `execute` is True + (next_state, response) if `execute` is False. 
""" - next_state, reward, _ = sample_explict_models(self.transition_model, None, self.reward_model, + next_state, response, _ = sample_explict_models(self.transition_model, None, self.response_model, self.state, action, discount_factor=discount_factor) if execute: self.apply_transition(next_state) - return reward + return response else: - return next_state, reward + return next_state, response def apply_transition(self, next_state): """ @@ -558,9 +751,9 @@ cdef class Environment: self._cur_state = next_state def execute(self, action, observation_model): - reward = self.state_transition(action, execute=True) + response = self.state_transition(action, execute=True) observation = self.provide_observation(observation_model, action) - return (observation, reward) + return (observation, response) def provide_observation(self, observation_model, action): """ @@ -652,21 +845,21 @@ cpdef sample_generative_model(Agent agent, State state, Action action, float dis else: result = sample_explict_models(agent.transition_model, agent.observation_model, - agent.reward_model, + agent.response_model, state, action, discount_factor) return result -cpdef sample_explict_models(TransitionModel T, ObservationModel O, RewardModel R, +cpdef sample_explict_models(TransitionModel T, ObservationModel O, ResponseModel R, State state, Action action, float discount_factor=1.0): """ - sample_explict_models(TransitionModel T, ObservationModel O, RewardModel R, State state, Action action, float discount_factor=1.0) + sample_explict_models(TransitionModel T, ObservationModel O, ResponseModel R, State state, Action action, float discount_factor=1.0) """ cdef State next_state cdef Observation observation - cdef float reward + cdef Response response = Response() cdef Option option cdef int nsteps = 0 @@ -682,17 +875,17 @@ cpdef sample_explict_models(TransitionModel T, ObservationModel O, RewardModel R # action will lead to no state change, no observation, and 0 reward, # because nothing happened. if O is not None: - return state, None, 0, 0 + return state, None, 0, response else: - return state, 0, 0 + return state, 0, response - reward = 0 + # response = 0 step_discount_factor = 1.0 while not option.termination(state): action = option.sample(state) next_state = T.sample(state, action) # For now, we don't care about intermediate observations (future work?). - reward += step_discount_factor * R.sample(state, action, next_state) + response = response + step_discount_factor * R.sample(state, action, next_state) step_discount_factor *= discount_factor state = next_state nsteps += 1 @@ -700,10 +893,10 @@ cpdef sample_explict_models(TransitionModel T, ObservationModel O, RewardModel R # (doesn't quite make sense to just use option as the action at this point.) 
else: next_state = T.sample(state, action) - reward = R.sample(state, action, next_state) + response = R.sample(state, action, next_state) nsteps += 1 if O is not None: observation = O.sample(next_state, action) - return next_state, observation, reward, nsteps + return next_state, observation, response, nsteps else: - return next_state, reward, nsteps + return next_state, response, nsteps diff --git a/pomdp_py/problems/load_unload/load_unload.py b/pomdp_py/problems/load_unload/load_unload.py index 197ea5db..05c73823 100644 --- a/pomdp_py/problems/load_unload/load_unload.py +++ b/pomdp_py/problems/load_unload/load_unload.py @@ -215,15 +215,18 @@ def get_all_actions(self, **kwargs): class LoadUnloadProblem(pomdp_py.POMDP): def __init__(self, init_state, init_belief): """init_belief is a Distribution.""" + import copy + + response_model = pomdp_py.ResponseModel.generate_response_model({"reward": LURewardModel()}) agent = pomdp_py.Agent( init_belief, LUPolicyModel(), LUTransitionModel(), LUObservationModel(), - LURewardModel(), + copy.deepcopy(response_model), ) - env = pomdp_py.Environment(init_state, LUTransitionModel(), LURewardModel()) + env = pomdp_py.Environment(init_state, LUTransitionModel(), copy.deepcopy(response_model)) super().__init__(agent, env, name="LoadUnloadProblem") @@ -267,7 +270,8 @@ def update(t): print("==== Step %d ====" % (t + 1)) action = planner.plan(load_unload_problem.agent) - env_reward = load_unload_problem.env.state_transition(action, execute=True) + env_response = load_unload_problem.env.state_transition(action, execute=True) + env_reward = env_response.reward true_state = copy.deepcopy(load_unload_problem.env.state) real_observation = load_unload_problem.env.provide_observation( diff --git a/pomdp_py/problems/multi_object_search/agent/agent.py b/pomdp_py/problems/multi_object_search/agent/agent.py index b1525706..f2da2d3a 100644 --- a/pomdp_py/problems/multi_object_search/agent/agent.py +++ b/pomdp_py/problems/multi_object_search/agent/agent.py @@ -60,7 +60,7 @@ def __init__( policy_model, transition_model=transition_model, observation_model=observation_model, - reward_model=reward_model, + response_model=pomdp_py.ResponseModel.generate_response_model(dict(reward=reward_model)), ) def clear_history(self): diff --git a/pomdp_py/problems/multi_object_search/env/env.py b/pomdp_py/problems/multi_object_search/env/env.py index da4fce30..04ce9563 100644 --- a/pomdp_py/problems/multi_object_search/env/env.py +++ b/pomdp_py/problems/multi_object_search/env/env.py @@ -33,7 +33,7 @@ def __init__(self, dim, init_state, sensors, obstacles=set({})): if not isinstance(init_state.object_states[objid], RobotState) } reward_model = GoalRewardModel(self.target_objects) - super().__init__(init_state, transition_model, reward_model) + super().__init__(init_state, transition_model, pomdp_py.ResponseModel.generate_response_model(dict(reward=reward_model))) @property def robot_ids(self): @@ -52,8 +52,8 @@ def state_transition(self, action, execute=True, robot_id=None): become the current state. Returns: - float or tuple: reward as a result of `action` and state - transition, if `execute` is True (next_state, reward) if `execute` + Response or tuple: response as a result of `action` and state + transition, if `execute` is True (next_state, response) if `execute` is False. 
""" @@ -66,9 +66,10 @@ def state_transition(self, action, execute=True, robot_id=None): self.state, action ) - reward = self.reward_model.sample( + response = self.response_model.sample( self.state, action, next_state, robot_id=robot_id ) + reward = response.reward if execute: self.apply_transition(next_state) return reward diff --git a/pomdp_py/problems/rocksample/rocksample_problem.py b/pomdp_py/problems/rocksample/rocksample_problem.py index 2980af5a..5485c3a5 100644 --- a/pomdp_py/problems/rocksample/rocksample_problem.py +++ b/pomdp_py/problems/rocksample/rocksample_problem.py @@ -434,17 +434,18 @@ def __init__( self, n, k, init_state, rock_locs, init_belief, half_efficiency_dist=20 ): self._n, self._k = n, k + reponse_model = pomdp_py.ResponseModel.generate_response_model(dict(reward=RSRewardModel(rock_locs, self.in_exit_area))) agent = pomdp_py.Agent( init_belief, RSPolicyModel(n, k), RSTransitionModel(n, rock_locs, self.in_exit_area), RSObservationModel(rock_locs, half_efficiency_dist=half_efficiency_dist), - RSRewardModel(rock_locs, self.in_exit_area), + copy.deepcopy(reponse_model), ) env = pomdp_py.Environment( init_state, RSTransitionModel(n, rock_locs, self.in_exit_area), - RSRewardModel(rock_locs, self.in_exit_area), + copy.deepcopy(reponse_model), ) self._rock_locs = rock_locs super().__init__(agent, env, name="RockSampleProblem") @@ -461,7 +462,7 @@ def test_planner(rocksample, planner, nsteps=3, discount=0.95): # max_depth=5, anonymize=False) true_state = copy.deepcopy(rocksample.env.state) - env_reward = rocksample.env.state_transition(action, execute=True) + env_response = rocksample.env.state_transition(action, execute=True) true_next_state = copy.deepcopy(rocksample.env.state) real_observation = rocksample.env.provide_observation( @@ -469,20 +470,20 @@ def test_planner(rocksample, planner, nsteps=3, discount=0.95): ) rocksample.agent.update_history(action, real_observation) planner.update(rocksample.agent, action, real_observation) - total_reward += env_reward - total_discounted_reward += env_reward * gamma + total_reward += env_response.reward + total_discounted_reward += env_response.reward * gamma gamma *= discount print("True state: %s" % true_state) print("Action: %s" % str(action)) print("Observation: %s" % str(real_observation)) - print("Reward: %s" % str(env_reward)) + print("Reward: %s" % str(env_response.reward)) print("Reward (Cumulative): %s" % str(total_reward)) print("Reward (Cumulative Discounted): %s" % str(total_discounted_reward)) if isinstance(planner, pomdp_py.POUCT): print("__num_sims__: %d" % planner.last_num_sims) print("__plan_time__: %.5f" % planner.last_planning_time) if isinstance(planner, pomdp_py.PORollout): - print("__best_reward__: %d" % planner.last_best_reward) + print("__best_reward__: %d" % planner.last_best_response.reward) print("World:") rocksample.print_state() @@ -537,7 +538,7 @@ def create_instance(n, k, **kwargs): def main(): - rocksample = debug_instance() # create_instance(7, 8) + rocksample = create_instance(7, 8) rocksample.print_state() print("*** Testing POMCP ***") diff --git a/pomdp_py/problems/tag/agent/agent.py b/pomdp_py/problems/tag/agent/agent.py index 1a166d9b..47fbce70 100644 --- a/pomdp_py/problems/tag/agent/agent.py +++ b/pomdp_py/problems/tag/agent/agent.py @@ -118,7 +118,7 @@ def __init__(self, init_belief, grid_map, pr_stay=0.2, small=1, big=10): policy_model, transition_model=transition_model, observation_model=observation_model, - reward_model=reward_model, + 
response_model=pomdp_py.ResponseModel.generate_response_model({"reward": reward_model}), ) def clear_history(self): diff --git a/pomdp_py/problems/tag/env/env.py b/pomdp_py/problems/tag/env/env.py index 47211aff..f6a69e0b 100644 --- a/pomdp_py/problems/tag/env/env.py +++ b/pomdp_py/problems/tag/env/env.py @@ -14,7 +14,7 @@ def __init__(self, init_state, grid_map, pr_stay=0.2, small=1, big=10): target_motion_policy = TagTargetMotionPolicy(grid_map, pr_stay) transition_model = TagTransitionModel(grid_map, target_motion_policy) reward_model = TagRewardModel(small=small, big=big) - super().__init__(init_state, transition_model, reward_model) + super().__init__(init_state, transition_model, pomdp_py.ResponseModel.generate_response_model({"reward": reward_model})) @property def width(self): diff --git a/pomdp_py/problems/tag/problem.py b/pomdp_py/problems/tag/problem.py index a158af64..9172ffba 100644 --- a/pomdp_py/problems/tag/problem.py +++ b/pomdp_py/problems/tag/problem.py @@ -87,7 +87,7 @@ def solve( break # no more time to update. # Execute action - reward = problem.env.state_transition(real_action, execute=True) + response = problem.env.state_transition(real_action, execute=True) # Receive observation _start = time.time() @@ -104,13 +104,13 @@ def solve( _time_used += time.time() - _start # Info and render - _total_reward += reward - _total_discounted_reward += reward * _discount + _total_reward += response.reward + _total_discounted_reward += response.reward * _discount _discount = _discount * discount_factor print("==== Step %d ====" % (i + 1)) print("Action: %s" % str(real_action)) print("Observation: %s" % str(real_observation)) - print("Reward: %s" % str(reward)) + print("Reward: %s" % str(response.reward)) print("Reward (Cumulative): %s" % str(_total_reward)) print("Reward (Discounted): %s" % str(_total_discounted_reward)) print("Find Actions Count: %d" % _find_actions_count) diff --git a/pomdp_py/problems/tiger/tiger_problem.py b/pomdp_py/problems/tiger/tiger_problem.py index 67a378ba..777529f4 100644 --- a/pomdp_py/problems/tiger/tiger_problem.py +++ b/pomdp_py/problems/tiger/tiger_problem.py @@ -212,14 +212,15 @@ class TigerProblem(pomdp_py.POMDP): def __init__(self, obs_noise, init_true_state, init_belief): """init_belief is a Distribution.""" + response_model = pomdp_py.ResponseModel.generate_response_model(dict(reward=RewardModel())) agent = pomdp_py.Agent( init_belief, PolicyModel(), TransitionModel(), ObservationModel(obs_noise), - RewardModel(), + copy.deepcopy(response_model), ) - env = pomdp_py.Environment(init_true_state, TransitionModel(), RewardModel()) + env = pomdp_py.Environment(init_true_state, TransitionModel(), copy.deepcopy(response_model)) super().__init__(agent, env, name="TigerProblem") @staticmethod @@ -273,10 +274,10 @@ def test_planner(tiger_problem, planner, nsteps=3, debug_tree=False): # in real world); In that case, you could skip # the state transition and re-estimate the state # (e.g. through the perception stack on the robot). 
- reward = tiger_problem.env.reward_model.sample( + response = tiger_problem.env.response_model.sample( tiger_problem.env.state, action, None ) - print("Reward:", reward) + print("Reward:", response.reward) # Let's create some simulated real observation; # Here, we use observation based on true state for sanity diff --git a/tests/test_response.py b/tests/test_response.py new file mode 100644 index 00000000..fe4a5f1f --- /dev/null +++ b/tests/test_response.py @@ -0,0 +1,38 @@ +from pomdp_py.framework.basics import Response + +description = "testing framework basics response" + + +def test_assign(): + r = Response() + assert r.reward == 0.0 + + r = Response(34.0) + assert r.reward == 34.0 + + +def test_add(): + r = Response() + r = r + Response(42.0) + assert r.reward == 42.0 + + r = Response() + r = r + 61.0 + assert r.reward == 61.0 + + +def test_multiply(): + r = Response(1.0) + r = r * 1000.0 + assert r.reward == 1000.0 + + +def run(): + test_assign() + test_add() + test_multiply() + + +if __name__ == "__main__": + run() + \ No newline at end of file
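Usage sketch (illustrative, not part of the diff): the snippet below shows how the ResponseModel/Response API introduced above fits together — a reward model wrapped via ResponseModel.generate_response_model, sampled to obtain a Response, and accumulated with the overloaded arithmetic the planners rely on. DummyRewardModel and the string-valued states and actions are hypothetical stand-ins; the rest follows the interfaces defined in this diff.

import pomdp_py
from pomdp_py.framework.basics import Response

class DummyRewardModel(pomdp_py.RewardModel):
    # Hypothetical stand-in; only the sample(...) signature matters to ResponseModel.
    def sample(self, state, action, next_state, **kwargs):
        return 1.0 if next_state == "goal" else -0.1

# Wrap the reward model; the default Response carries a single scalar reward.
response_model = pomdp_py.ResponseModel.generate_response_model(
    {"reward": DummyRewardModel()}
)

# Sampling now yields a Response instead of a bare float.
response = response_model.sample("s0", "move", "goal")
print(response.reward)  # 1.0

# Responses are closed under + and scalar *, which is what lets the planners
# accumulate discounted responses the same way they previously accumulated rewards.
total = Response()
total = total + response * 0.95
print(total)  # reward=0.95

A Response subclass (for example one that adds a cost Vector for multi-valued responses) would follow the same pattern, provided it overrides the arithmetic and comparison operators as noted in the Response docstring.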