A Response and Response model #61

Closed · wants to merge 5 commits
4 changes: 2 additions & 2 deletions pomdp_py/algorithms/po_rollout.pxd
@@ -1,4 +1,4 @@
from pomdp_py.framework.basics cimport Action, State, Observation, Agent
from pomdp_py.framework.basics cimport Action, State, Observation, Agent, Response
from pomdp_py.framework.planner cimport Planner
from pomdp_py.algorithms.po_uct cimport RolloutPolicy, ActionPrior

@@ -11,7 +11,7 @@ cdef class PORollout(Planner):
cdef float _discount_factor
cdef bint _particles
cdef Agent _agent
cdef float _last_best_reward
cdef Response _last_best_response

cpdef _search(self)
cpdef _rollout(self, State state, int depth)
50 changes: 26 additions & 24 deletions pomdp_py/algorithms/po_rollout.pyx
@@ -15,7 +15,7 @@ it will do the rollouts and action selection as described.

from pomdp_py.framework.basics cimport Action, Agent, POMDP, State, Observation,\
ObservationModel, TransitionModel, GenerativeDistribution, PolicyModel,\
sample_generative_model
sample_generative_model, Response
from pomdp_py.framework.planner cimport Planner
from pomdp_py.representations.distribution.particles cimport Particles
from pomdp_py.representations.belief.particles cimport particle_reinvigoration
@@ -46,58 +46,60 @@ cdef class PORollout(Planner):
self._particles = particles

self._agent = None
self._last_best_reward = float('-inf')
self._last_best_response = None

@property
def last_best_reward(self):
return self._last_best_reward
def last_best_response(self):
return self._last_best_response

cpdef public plan(self, Agent agent):
self._agent = agent
best_action, best_reward = self._search()
self._last_best_reward = best_reward
best_action, best_response = self._search()
self._last_best_response = best_response
return best_action

cpdef _search(self):
cdef Action best_action
cdef float best_reward, reward_avg, total_discounted_reward
cdef Response best_response
cdef Response response_avg
cdef Response total_discounted_response
cdef set legal_actions
cdef list rewards
cdef list responses

best_action, best_reward = None, float("-inf")
best_action, best_response = None, Response(float("-inf"))
legal_actions = self._agent.valid_actions(history=self._agent.history)
for action in legal_actions:
rewards = []
responses = []
for i in range(self._num_sims // len(legal_actions)):
state = self._agent.belief.random()
total_discounted_reward = self._rollout(state, 0)
rewards.append(total_discounted_reward)
reward_avg = sum(rewards) / len(rewards)
if reward_avg > best_reward:
total_discounted_response = self._rollout(state, 0)
responses.append(total_discounted_response)
response_avg = sum(responses) / len(responses)
if response_avg > best_response:
best_action = action
best_reward = reward_avg
return best_action, best_reward
best_response = response_avg
return best_action, best_response

cpdef _rollout(self, State state, int depth):
# Rollout without a tree.
cdef Action action
cdef float discount = 1.0
cdef float total_discounted_reward = 0
cdef Response total_discounted_response = Response()
cdef State next_state
cdef Observation observation
cdef float reward
cdef Response response
cdef int nsteps
cdef tuple history = self._agent.history

while depth <= self._max_depth:
action = self._rollout_policy.rollout(state, history=history)
next_state, observation, reward, nsteps = sample_generative_model(self._agent, state, action)
next_state, observation, response, nsteps = sample_generative_model(self._agent, state, action)
history = history + ((action, observation),)
depth += 1
total_discounted_reward += reward * discount
total_discounted_response = total_discounted_response + response * discount
discount *= self._discount_factor
state = next_state
return total_discounted_reward
return total_discounted_response

cpdef update(self, Agent agent, Action real_action, Observation real_observation,
state_transform_func=None):
@@ -110,7 +112,7 @@ cdef class PORollout(Planner):
if not isinstance(cur_belief, Particles):
raise ValueError("Agent's belief is not in particles.")
for state in cur_belief.particles:
next_state, observation, reward, nsteps = sample_generative_model(agent, state,
next_state, observation, response, nsteps = sample_generative_model(agent, state,
real_action)
if observation == real_observation:
new_belief.add(next_state)
@@ -128,8 +130,8 @@
def clear_agent(self):
"""clear_agent(self)"""
self._agent = None # forget about current agent so that can plan for another agent.
self._last_best_reward = float('-inf')
self._last_best_response = Response(float('-inf'))

cpdef set_rollout_policy(self, RolloutPolicy rollout_policy):
"""
set_rollout_policy(self, RolloutPolicy rollout_policy)
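Worth noting about the hunks above: `_search` and `_rollout` now do arithmetic directly on `Response` objects — accumulating `total_discounted_response + response * discount`, averaging with `sum(responses) / len(responses)`, and comparing averages with `>`. The following is a minimal plain-Python sketch of the operator interface that code assumes; it is not the PR's implementation (the real `Response` is a `cdef` class declared in `basics.pxd`, presumably implemented in `basics.pyx`, which is not part of this diff):

```python
# Hypothetical plain-Python sketch of the operator interface that
# PORollout._search / _rollout rely on; the PR's real Response is a
# cdef class in pomdp_py/framework/basics (not shown in this diff).
class Response:
    def __init__(self, reward=0.0):
        self._reward = float(reward)

    @property
    def reward(self):
        return self._reward

    def __add__(self, other):
        # response + response, used when accumulating discounted responses
        return Response(self._reward + other.reward)

    def __radd__(self, other):
        # sum(responses) starts from the int 0, so 0 + Response must work
        if other == 0:
            return Response(self._reward)
        return self.__add__(other)

    def __mul__(self, scale):
        # response * discount, with a float discount factor
        return Response(self._reward * scale)

    def __truediv__(self, n):
        # sum(responses) / len(responses) for the per-action average
        return Response(self._reward / n)

    def __gt__(self, other):
        # response_avg > best_response in the action-selection loop
        return self._reward > other.reward
```

Any alternative `Response` implementation would need at least these operators, plus the `.reward` property used by `po_uct.pyx` and `value_iteration.pyx` below, for the planner changes in this PR to run.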
5 changes: 3 additions & 2 deletions pomdp_py/algorithms/po_uct.pxd
@@ -1,16 +1,17 @@
from pomdp_py.framework.planner cimport Planner
from pomdp_py.framework.basics cimport Agent, PolicyModel, Action, State, Observation
from pomdp_py.framework.basics cimport Agent, PolicyModel, Action, State, Observation, Response

cdef class TreeNode:
cdef public dict children
cdef public int num_visits
cdef public float value

cdef class QNode(TreeNode):
pass
cpdef void update(QNode self, Response response)

cdef class VNode(TreeNode):
cpdef argmax(VNode self)
cpdef void update(VNode self)

cdef class RootVNode(VNode):
cdef public tuple history
50 changes: 30 additions & 20 deletions pomdp_py/algorithms/po_uct.pyx
@@ -35,7 +35,7 @@ the prior knowledge.

from pomdp_py.framework.basics cimport Action, Agent, POMDP, State, Observation,\
ObservationModel, TransitionModel, GenerativeDistribution, PolicyModel,\
sample_generative_model
sample_generative_model, Response
from pomdp_py.framework.planner cimport Planner
from pomdp_py.representations.distribution.particles cimport Particles
from pomdp_py.utils import typ
@@ -64,13 +64,20 @@ cdef class QNode(TreeNode):
self.num_visits = num_visits
self.value = value
self.children = {} # o -> VNode

def __str__(self):
return typ.red("QNode") + "(%.3f, %.3f | %s)" % (self.num_visits,
self.value,
str(self.children.keys()))

def __repr__(self):
return self.__str__()

cpdef void update(QNode self, Response response):
self.num_visits += 1
self.value = self.value + (response.reward - self.value) / self.num_visits


cdef class VNode(TreeNode):
def __init__(self, num_visits, **kwargs):
self.num_visits = num_visits
@@ -98,6 +105,9 @@ cdef class VNode(TreeNode):
best_value = self[action].value
return best_action

cpdef void update(VNode self):
self.num_visits += 1

@property
def value(self):
best_action = max(self.children, key=lambda action: self.children[action].value)
@@ -361,7 +371,7 @@ cdef class POUCT(Planner):
State state, tuple history, VNode root, QNode parent,
Observation observation, int depth):
if depth > self._max_depth:
return 0
return self._agent.response_model.create_response()
if root is None:
if self._agent.tree is None:
root = self._VNode(root=True)
@@ -373,46 +383,46 @@
if parent is not None:
parent[observation] = root
self._expand_vnode(root, history, state=state)
rollout_reward = self._rollout(state, history, root, depth)
return rollout_reward
rollout_response = self._rollout(state, history, root, depth)
return rollout_response
cdef int nsteps
action = self._ucb(root)
next_state, observation, reward, nsteps = sample_generative_model(self._agent, state, action)
next_state, observation, response, nsteps = sample_generative_model(self._agent, state, action)
if nsteps == 0:
# This indicates the provided action didn't lead to transition
# Perhaps the action is not allowed to be performed for the given state
# (for example, the state is not in the initiation set of the option,
# or the state is a terminal state)
return reward
return response

total_reward = reward + (self._discount_factor**nsteps)*self._simulate(next_state,
total_response = response + (self._discount_factor**nsteps)*self._simulate(next_state,
history + ((action, observation),),
root[action][observation],
root[action],
observation,
depth+nsteps)
root.num_visits += 1
root[action].num_visits += 1
root[action].value = root[action].value + (total_reward - root[action].value) / (root[action].num_visits)
return total_reward
root.update()
root[action].update(total_response)
return total_response

cpdef _rollout(self, State state, tuple history, VNode root, int depth):
cdef Action action
cdef float discount = 1.0
cdef float total_discounted_reward = 0
cdef Response total_discounted_response = self._agent.response_model.create_response()
cdef State next_state
cdef Observation observation
cdef float reward
cdef Response response

while depth < self._max_depth:
action = self._rollout_policy.rollout(state, history)
next_state, observation, reward, nsteps = sample_generative_model(self._agent, state, action)
next_state, observation, response, nsteps = sample_generative_model(self._agent, state, action)
history = history + ((action, observation),)
depth += nsteps
total_discounted_reward += reward * discount
total_discounted_response = total_discounted_response + response * discount
discount *= (self._discount_factor**nsteps)
state = next_state
return total_discounted_reward
return total_discounted_response

cpdef Action _ucb(self, VNode root):
"""UCB1"""
@@ -436,15 +446,15 @@
'''
cdef State next_state
cdef Observation observation
cdef float reward
cdef Response response

if self._agent.transition_model is None:
next_state, observation, reward = self._agent.generative_model.sample(state, action)
next_state, observation, response = self._agent.generative_model.sample(state, action)
else:
next_state = self._agent.transition_model.sample(state, action)
observation = self._agent.observation_model.sample(next_state, action)
reward = self._agent.reward_model.sample(state, action, next_state)
return next_state, observation, reward
response = self._agent.response_model.sample(state, action, next_state)
return next_state, observation, response

def _VNode(self, root=False, **kwargs):
"""Returns a VNode with default values; The function naming makes it clear
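The new `QNode.update` keeps exactly what the old inline code in `_simulate` computed: starting from the node's initial value, each call moves the estimate toward the running mean of the scalar rewards carried by the backed-up responses, `value ← value + (response.reward − value) / num_visits`. A small standalone check of that arithmetic (hypothetical snippet, not part of the PR):

```python
# Standalone check that the incremental rule in QNode.update is the
# running mean of the backed-up rewards (with an initial value of 0).
num_visits, value = 0, 0.0
for r in [10.0, 0.0, 5.0]:   # response.reward of three simulated returns
    num_visits += 1
    value += (r - value) / num_visits
print(value)                 # 5.0, i.e. (10.0 + 0.0 + 5.0) / 3
```

As written, only `response.reward` enters the tree statistics, so any additional information a `Response` might carry does not influence UCB action selection.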
4 changes: 2 additions & 2 deletions pomdp_py/algorithms/pomcp.pyx
@@ -128,10 +128,10 @@ cdef class POMCP(POUCT):
cpdef _simulate(POMCP self,
State state, tuple history, VNode root, QNode parent,
Observation observation, int depth):
total_reward = POUCT._simulate(self, state, history, root, parent, observation, depth)
total_response = POUCT._simulate(self, state, history, root, parent, observation, depth)
if depth == 1 and root is not None:
root.belief.add(state) # belief update happens as simulation goes.
return total_reward
return total_response

def _VNode(self, root=False, **kwargs):
"""Returns a VNode with default values; The function naming makes it clear
4 changes: 2 additions & 2 deletions pomdp_py/algorithms/value_iteration.pyx
@@ -48,8 +48,8 @@ cdef class _PolicyTreeNode:
subtree_value = self.children[o].values[sp] # corresponds to V_{oi(p)} in paper
else:
subtree_value = 0.0
reward = self._agent.reward_model.sample(s, self.action, sp)
expected_future_value += trans_prob * obsrv_prob * (reward + discount_factor*subtree_value)
response = self._agent.response_model.sample(s, self.action, sp)
expected_future_value += trans_prob * obsrv_prob * (response.reward + discount_factor*subtree_value)
values[s] = expected_future_value
return values

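For reference, the edited line is the usual policy-tree backup with the scalar now read through `response.reward`. With `trans_prob` = T(s′ | s, a_p), `obsrv_prob` = O(o | s′, a_p), `discount_factor` = γ and `subtree_value` = V_{o(p)}(s′), the surrounding loops (not all shown in this diff) accumulate

$$V_p(s) = \sum_{s'} \sum_{o} T(s' \mid s, a_p)\, O(o \mid s', a_p)\,\bigl[\, R(s, a_p, s') + \gamma\, V_{o(p)}(s') \,\bigr]$$

where R(s, a_p, s′) is now `response.reward` rather than the value returned by `reward_model.sample`.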
16 changes: 14 additions & 2 deletions pomdp_py/framework/basics.pxd
@@ -8,6 +8,12 @@ cdef class TransitionModel:
pass
cdef class PolicyModel:
pass

cdef class ResponseModel:
cdef dict _model_dict
cdef Response _response
cdef dict __dict__

cdef class BlackboxModel:
pass
cdef class RewardModel:
@@ -27,6 +33,12 @@ cdef class State:
cdef class Observation:
pass

cdef class Vector(list):
pass

cdef class Response:
cdef float _reward

Collaborator review comment:
Thanks @troiwill. What is the significance of wrapping a float inside Response? Why does this change need to happen in basic.pyx? This affects all other programs currently using pomdp-py. This would not be acceptable.
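One thing the wrapper can buy — offered here purely as an illustration, since the thread does not answer the question — is that planner code written against `Response`'s operators and its `.reward` property keeps working unchanged if a subclass carries more than a single scalar:

```python
# Purely hypothetical illustration, not code from this PR.
# "Response" is this PR's pomdp_py.framework.basics.Response (or the
# operator sketch shown earlier on this page).
class CostAwareResponse(Response):
    """Carries an extra cost term but still exposes .reward, so the
    planner arithmetic introduced in this PR needs no changes."""
    def __init__(self, reward=0.0, cost=0.0):
        super().__init__(reward)
        self.cost = cost
    # Note: the base-class operators still combine only the reward; a
    # subclass would have to override them for cost to be accumulated.
```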

cdef class Agent:
cdef GenerativeDistribution _init_belief
cdef PolicyModel _policy_model
@@ -41,12 +53,12 @@
cdef class Environment:
cdef State _init_state
cdef TransitionModel _transition_model
cdef RewardModel _reward_model
cdef ResponseModel _response_model
cdef BlackboxModel _blackbox_model
cdef State _cur_state

cdef class Option(Action):
pass

cpdef sample_generative_model(Agent agent, State state, Action action, float discount_factor=*)
cpdef sample_explict_models(TransitionModel T, ObservationModel O, RewardModel R, State state, Action a, float discount_factor=*)
cpdef sample_explict_models(TransitionModel T, ObservationModel O, ResponseModel R, State state, Action a, float discount_factor=*)