REINFORCEbrain.py
# coding=utf-8
import gym
import tensorflow as tf

from blocAndTools.buildingbloc import ExperimentSpec, GymPlayground, build_MLP_computation_graph, \
    policy_theta_discrete_space, discrete_pseudo_loss
from blocAndTools.rl_vocabulary import rl_name

vocab = rl_name()
def REINFORCE_policy(observation_placeholder: tf.Tensor, action_placeholder: tf.Tensor,
                     Q_values_placeholder: tf.Tensor, experiment_spec: ExperimentSpec,
                     playground: GymPlayground) -> (tf.Tensor, tf.Tensor, tf.Tensor):
"""
The learning agent: REINFORCE (aka: Basic Policy Gradient)
Based on the paper by Williams, R. J.
Simple statistical gradient-following algorithms for connectionist reinforcement learning. (1992)
Policy gradient is a on-policy method which seek to directly optimize the policy π_θ by using sampled trajectories τ
as weight. Those weight will then be used to indicate how good the policy performed.
Based on that knowledge, the algorithm update the parameter θ of his policy to make action leading to similar good
trajectories more likely and similar bad trajectories less likely.
In the case of Deep Reinforcement Learning, the policy parameter θ is a neural net.
:type observation_placeholder: tf.Tensor
:type action_placeholder: tf.Tensor
:type Q_values_placeholder: tf.Tensor
:type playground: GymPlayground
:type experiment_spec: ExperimentSpec
:return: (sampled_action, theta_mlp, pseudo_loss)
:rtype: (tf.Tensor, tf.Tensor, tf.Tensor)
"""
    with tf.name_scope(vocab.REINFORCE) as scope:

        """ ---- Build parameter theta as a multilayer perceptron ---- """
        theta_mlp = build_MLP_computation_graph(observation_placeholder, playground.ACTION_CHOICES,
                                                experiment_spec.theta_nn_h_layer_topo,
                                                hidden_layers_activation=experiment_spec.theta_hidden_layers_activation,
                                                output_layers_activation=experiment_spec.theta_output_layers_activation,
                                                name=vocab.theta_NeuralNet)
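        # Note: `theta_mlp` maps observations to `playground.ACTION_CHOICES` outputs (assumed to be the
        # unnormalized action preferences); in the discrete case below, `policy_theta_discrete_space`
        # builds the categorical policy from those outputs.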
        # ::Discrete case
        if isinstance(playground.env.action_space, gym.spaces.Discrete):

            """ ---- Assess the input shape compatibility ---- """
            are_compatible = observation_placeholder.shape.as_list()[-1] == playground.OBSERVATION_SPACE.shape[0]
            assert are_compatible, ("the observation_placeholder is incompatible with the environment: "
                                    "{} != {}").format(observation_placeholder.shape.as_list()[-1],
                                                       playground.OBSERVATION_SPACE.shape[0])

            """ ---- Build the policy for discrete space ---- """
            sampled_action, log_p_all = policy_theta_discrete_space(theta_mlp, playground)

            """ ---- Build the pseudo loss function ---- """
            pseudo_loss = discrete_pseudo_loss(log_p_all, action_placeholder, Q_values_placeholder, playground,
                                               vocab.pseudo_loss)
        # ::Continuous case
        elif isinstance(playground.env.action_space, gym.spaces.Box):
            raise NotImplementedError   # (Ice-Boxed) todo: implement the policy for continuous space

        # ::Other gym environment
        else:
            print("\n>>> The agent implementation does not support the environment space "
                  "{} yet.\n\n".format(playground.env.action_space))
            raise NotImplementedError
    return sampled_action, theta_mlp, pseudo_loss
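

# ----------------------------------------------------------------------------------------------------------
# Minimal usage sketch (illustration only): how the three returned tensors might be wired into a training
# graph. The `experiment_spec` and `playground` instances are assumed to be built elsewhere (see
# blocAndTools.buildingbloc for their constructors) and `experiment_spec.learning_rate` is an assumed
# attribute name; only the TensorFlow 1.x calls themselves are standard.
def _reinforce_training_graph_sketch(experiment_spec: ExperimentSpec, playground: GymPlayground):
    obs_ph = tf.placeholder(tf.float32, shape=(None, playground.OBSERVATION_SPACE.shape[0]), name='obs_ph')
    act_ph = tf.placeholder(tf.int32, shape=(None,), name='act_ph')
    Q_values_ph = tf.placeholder(tf.float32, shape=(None,), name='Q_values_ph')

    sampled_action, _, pseudo_loss = REINFORCE_policy(obs_ph, act_ph, Q_values_ph, experiment_spec, playground)

    # Minimizing the pseudo loss performs gradient ascent on the expected return
    train_op = tf.train.AdamOptimizer(learning_rate=experiment_spec.learning_rate).minimize(pseudo_loss)
    return obs_ph, act_ph, Q_values_ph, sampled_action, train_op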