Remove Theano from Week 5 #348
Changes from 1 commit
@@ -1,153 +1,261 @@
""" | ||
A single-file module that makes your lasagne network into a bayesian neural net. | ||
Originally created by github.com/ferrine , rewritten by github.com/justheuristic for simplicity | ||
|
||
See example in the notebook | ||
""" | ||
|
||
import torch | ||
import numpy as np | ||
import torch.nn as nn | ||
import torch.nn.functional as F | ||
|
||
from theano import tensor as T | ||
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams | ||
|
||
import lasagne | ||
from lasagne import init | ||
from lasagne.random import get_rng | ||
class BayesianModule(nn.Module):
    """
    Base class for Bayesian layers, used to enable BNN-specific behavior.
    """
    def __init__(self):
        super().__init__()

from functools import wraps

__all__ = ['NormalApproximation', 'get_var_cost', 'bbpwrap']
class GaussianVariational(nn.Module):
    #Samples weights for variational inference as in Weight Uncertainty in Neural Networks (Bayes by Backprop paper)

    Review comment: Needs space after #

    #Calculates the variational posterior part of the complexity cost of the loss
    def __init__(self, mu, rho):
        super().__init__()

        self.mu = nn.Parameter(mu)
        self.rho = nn.Parameter(rho)
        self.w = None
        self.sigma = None
        self.pi = np.pi
        self.normal = torch.distributions.Normal(0, 1)
class NormalApproximation(object):
    def __init__(self, mu=0, std=np.exp(-3), seed=None):
    def sample(self):
        """
        Approximation that samples network weights from factorized normal distribution.

        :param mu: prior mean for gaussian weights
        :param std: prior std for gaussian weights
        :param seed: random seed
        Samples weights by drawing from a standard Normal distribution, multiplying by sigma (a
        function of the trainable parameter rho) and adding the mean, then
        sets those weights as the current ones.
        returns:
            torch.tensor with the same shape as self.mu and self.rho
        """
        self.prior_mu = mu
        self.prior_std = std
        self.srng = RandomStreams(seed or get_rng().randint(1, 2147462579))
        device = self.mu.device
        epsilon = self.normal.sample(self.mu.size()).to(device)
        self.sigma = torch.log(1 + torch.exp(self.rho)).to(device)
        self.w = self.mu + self.sigma * epsilon
        return self.w

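For orientation (not part of this diff): a minimal, self-contained sketch of the reparameterization trick that sample() implements, using hypothetical toy shapes.

import torch

# w = mu + softplus(rho) * eps,  eps ~ N(0, 1)  -- gradients flow through mu and rho
mu = torch.zeros(3, 2, requires_grad=True)
rho = torch.full((3, 2), -4.0, requires_grad=True)    # softplus(-4) ~ 0.018, a small initial sigma
eps = torch.randn(3, 2)                               # the noise is sampled outside the graph
sigma = torch.log1p(torch.exp(rho))                   # softplus keeps sigma positive
w = mu + sigma * eps
w.sum().backward()                                    # mu.grad and rho.grad are now populated
print(w.shape)                                        # torch.Size([3, 2])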
    def log_normal(self, x, mean, std, eps=0.0):
        """computes log-proba of normal distribution"""
        std += eps
        return - 0.5 * np.log(2 * np.pi) - T.log(T.abs_(std)) - \
            (x - mean) ** 2 / (2 * std ** 2)

    def log_posterior(self):

    def log_prior(self, weights):
        """
        Logarithm of prior probabilities for weights:
        log P(weights) aka log P(theta)
        Calculates the log-likelihood of each of the sampled weights as a part of the complexity cost
        returns:
            torch.tensor with shape []
        """
        (Qwasser marked this conversation as resolved.)

        return self.log_normal(weights, self.prior_mu, self.prior_std)

    def log_posterior_approx(self, weights, mean, rho):
        assert (self.w is not None), "You can only have a log posterior for W if you've already sampled it"

        log_sqrt2pi = np.log(np.sqrt(2 * self.pi))
        log_posteriors = -log_sqrt2pi - torch.log(self.sigma) - (((self.w - self.mu) ** 2) / (2 * self.sigma ** 2))
        return log_posteriors.mean()

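Side note (not part of the diff): the closed-form expression used for the variational posterior above is just the Gaussian log-density, which can be sanity-checked against torch.distributions:

import numpy as np
import torch

mu, sigma, w = torch.tensor(0.3), torch.tensor(0.8), torch.tensor(0.1)
by_hand = -np.log(np.sqrt(2 * np.pi)) - torch.log(sigma) - (w - mu) ** 2 / (2 * sigma ** 2)
by_torch = torch.distributions.Normal(mu, sigma).log_prob(w)
print(torch.allclose(by_hand, by_torch))   # True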
class ScaleMixturePrior(nn.Module):
    #Calculates a Scale Mixture Prior distribution for the prior part of the complexity cost on the Bayes by Backprop paper

    Review comment: Use ''' doc strings. Add link to paper.

    def __init__(self, pi, sigma1, sigma2):
        super().__init__()

        self.pi = pi
        self.sigma1 = sigma1
        self.sigma2 = sigma2
        self.normal1 = torch.distributions.Normal(0, sigma1)
        self.normal2 = torch.distributions.Normal(0, sigma2)

    def log_prior(self, w):
        """
        Logarithm of ELBO on posterior probabilities:
        log q(weights|learned mu and rho) aka log q(theta|x)
        Calculates the log-likelihood of each of the sampled weights relative to the prior distribution, as a part of the complexity cost
        returns:
            torch.tensor with shape []

        Review comment: what shape?

        """
        std = T.log1p(T.exp(rho))  # rho to std
        return self.log_normal(weights, mean, std)

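For context (not part of the diff): the scale mixture prior density evaluated by this class is pi * N(0, sigma1) + (1 - pi) * N(0, sigma2). A minimal standalone check with torch.distributions, using hypothetical values:

import torch

pi, sigma1, sigma2 = 0.5, 1.0, 0.002
w = torch.randn(5)

n1 = torch.distributions.Normal(0.0, sigma1)
n2 = torch.distributions.Normal(0.0, sigma2)
mixture_pdf = pi * torch.exp(n1.log_prob(w)) + (1 - pi) * torch.exp(n2.log_prob(w))
print(torch.log(mixture_pdf).mean())   # same quantity ScaleMixturePrior.log_prior(w) returns later in this diff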
    def __call__(self, layer, spec, shape, name=None, **tags):
        # case when user uses default init specs
        assert tags.get(
            'variational', False), "Please declare param as variational to avoid confusion"

        if not isinstance(spec, dict):
            initial_rho = np.log(np.expm1(self.prior_std))  # std to rho
            assert np.isfinite(initial_rho), "too small std to initialize correctly. Please pass explicit"\
                " initializer (dict with {'mu':mu_init, 'rho':rho_init})."
            spec = {'mu': spec, 'rho': init.Constant(initial_rho)}

        mu_spec, rho_spec = spec['mu'], spec['rho']

        rho = layer.add_param(
            rho_spec, shape, name=(
                name or 'unk') + '.rho', **tags)
        mean = layer.add_param(
            mu_spec, shape, name=(
                name or 'unk') + '.mu', **tags)

        # Reparameterization trick
        e = self.srng.normal(shape, std=1)
        W = mean + T.log1p(T.exp(rho)) * e

        # KL divergence KL(q,p) = E_(w~q(w|x)) [log q(w|x) - log P(w)] aka
        # variational cost
        q_p = T.sum(
            self.log_posterior_approx(W, mean, rho) -
            self.log_prior(W)
        )

        # accumulate variational cost
        layer._bbwrap_var_cost += q_p
        return W

def get_var_cost(layer_or_layers, treat_as_input=None):
    """
    Returns total variational cost aka KL(q(theta|x)||p(theta)) for all layers in the network
        prob_n1 = torch.exp(self.normal1.log_prob(w))
        prob_n2 = torch.exp(self.normal2.log_prob(w))
        prior_pdf = (self.pi * prob_n1 + (1 - self.pi) * prob_n2)

        return (torch.log(prior_pdf)).mean()

    :param layer_or_layers: top layer(s) of your network, just like with lasagne.layers.get_output
    :param treat_as_input: don't accumulate over layers below these layers. See same param for lasagne.layers.get_all_layers

    Alternatively, one can manually get weights for one layer via layer.get_var_cost()
class BayesianLinear(BayesianModule):
    """
    cost = 0
    for layer in lasagne.layers.get_all_layers(
            layer_or_layers, treat_as_input):
        if hasattr(layer, 'get_var_cost'):
            # if layer is bayesian or pretends so
            cost += layer.get_var_cost()
    return cost
    Bayesian Linear layer, implements the linear layer proposed in Weight Uncertainty in Neural Networks

    Review comment: link to paper

    (Bayes by Backprop paper).
    It is meant to interoperate with the torch nn.Module API, and can even be chained in nn.Sequential models with layers from outside this module.

    parameters:
        in_features: int -> incoming features for the layer
        out_features: int -> output features for the layer
        bias: bool -> whether the bias will exist (True) or be set to zero (False)
        prior_sigma_1: float -> prior sigma of mixture-prior component 1
        prior_sigma_2: float -> prior sigma of mixture-prior component 2
        prior_pi: float -> pi of the scale mixture prior
        freeze: bool -> whether the model will start with frozen (deterministic) weights, or not

    """
    def __init__(self,
                 in_features,
                 out_features,
                 bias=True,
                 prior_sigma_1=1,
                 prior_sigma_2=0.002,
                 prior_pi=0.5,
                 freeze=False):
        super().__init__()

        #our main parameters

        Review comment: Spaces after #. Comment is redundant.

        self.in_features = in_features
        self.out_features = out_features
        self.bias = bias
        self.freeze = freeze

        #parameters for the scale mixture prior

        Review comment: Space

        self.prior_sigma_1 = prior_sigma_1
        self.prior_sigma_2 = prior_sigma_2
        self.prior_pi = prior_pi

        # Variational weight parameters and sample
        self.weight_mu = nn.Parameter(torch.Tensor(out_features, in_features).uniform_(-0.2, 0.2))
        self.weight_rho = nn.Parameter(torch.Tensor(out_features, in_features).uniform_(-5, -4))
        self.weight_sampler = GaussianVariational(self.weight_mu, self.weight_rho)

        # Variational bias parameters and sample
        self.bias_mu = nn.Parameter(torch.Tensor(out_features).uniform_(-0.2, 0.2))
        self.bias_rho = nn.Parameter(torch.Tensor(out_features).uniform_(-5, -4))
        self.bias_sampler = GaussianVariational(self.bias_mu, self.bias_rho)

        # Priors (as in the BBP paper)
        self.weight_prior_dist = ScaleMixturePrior(self.prior_pi, self.prior_sigma_1, self.prior_sigma_2)
        self.bias_prior_dist = ScaleMixturePrior(self.prior_pi, self.prior_sigma_1, self.prior_sigma_2)
        self.log_prior = 0
        self.log_variational_posterior = 0

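A small aside on the initialization above (an observation, not part of the diff): rho ~ U(-5, -4) corresponds to a tiny initial posterior sigma, since sigma = softplus(rho):

import torch

rho = torch.tensor([-5.0, -4.0])
print(torch.log1p(torch.exp(rho)))   # tensor([0.0067, 0.0181]) -> initial sigma roughly in [0.007, 0.018]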
    def forward(self, x):
        # Sample the weights and forward it

        (Qwasser marked this conversation as resolved.)

        #if the model is frozen, return frozen

        Review comment: Comment repeats code. Remove it

        if self.freeze:

            Review comment: It is boolean variable. Name it is_frozen.

            return self.forward_frozen(x)


        Review comment: redundant new line

        w = self.weight_sampler.sample()

        if self.bias:
            b = self.bias_sampler.sample()
            b_log_posterior = self.bias_sampler.log_posterior()
            b_log_prior = self.bias_prior_dist.log_prior(b)

        else:
            b = torch.zeros((self.out_features))
            b_log_posterior = 0
            b_log_prior = 0

        # Get the complexity cost
        self.log_variational_posterior = self.weight_sampler.log_posterior() + b_log_posterior
        self.log_prior = self.weight_prior_dist.log_prior(w) + b_log_prior

        return F.linear(x, w, b)

def bbpwrap(approximation=NormalApproximation()):
    def forward_frozen(self, x):
        """
        Computes the feedforward operation with the expected values for weights and biases
        """
        if self.bias:
            return F.linear(x, self.weight_mu, self.bias_mu)
        else:
            return F.linear(x, self.weight_mu, torch.zeros(self.out_features))


    Review comment: redundant new line

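For context (not part of this diff), a hypothetical usage sketch of the BayesianLinear class above: repeated forward calls resample weights, while the frozen path is deterministic.

import torch

layer = BayesianLinear(4, 2)                 # class from this diff
x = torch.randn(8, 4)

y1, y2 = layer(x), layer(x)                  # weights are resampled on every call
print(torch.allclose(y1, y2))                # False (almost surely)

layer.freeze = True                          # use the posterior means only
print(torch.allclose(layer(x), layer(x)))    # True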
def kl_divergence_from_nn(model):

    """
    Gathers the KL Divergence from a nn.Module object
    Works by gathering each Bayesian layer's KL divergence and summing it, doing nothing with the non-Bayesian ones
    """
    A decorator that makes an arbitrary lasagne layer into a bayesian network layer:
    BayesDenseLayer = bbwrap()(DenseLayer)
    or more verbosely,
    @bbpwrap(NormalApproximation(pstd=0.01))
    BayesDenseLayer(DenseLayer):
        pass
    kl_divergence = 0
    for module in model.modules():
        if isinstance(module, (BayesianModule)):
            kl_divergence += module.log_variational_posterior - module.log_prior
    return kl_divergence

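A hypothetical usage sketch (not part of the diff), assuming the BayesianLinear and kl_divergence_from_nn defined in this file are importable; non-Bayesian layers are simply skipped:

import torch
import torch.nn as nn

model = nn.Sequential(
    BayesianLinear(4, 16),   # Bayesian layer: contributes log q(w) - log p(w)
    nn.ReLU(),
    nn.Linear(16, 2),        # plain layer: ignored by kl_divergence_from_nn
)

_ = model(torch.randn(8, 4))            # forward pass samples weights and fills the per-layer log-probs
print(kl_divergence_from_nn(model))     # 0-dim tensor: summed KL contribution of the Bayesian layers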
def variational_estimator(nn_class):
    """
    This decorator adds some utility methods to a nn.Module, in order to facilitate handling of Bayesian Deep Learning features
    Parameters:
        nn_class: torch.nn.Module -> Torch neural network module
    Returns a nn.Module with methods for:
        (1) Gathering the KL Divergence along its BayesianModules;
        (2) Sampling the ELBO Loss along its variational inferences (helps training)
        (3) Freezing the model, in order to predict using only the means of its weight distributions
    """

    def decorator(cls):
        def add_param_wrap(add_param):
            @wraps(add_param)
            def wrapped(self, spec, shape, name=None, **tags):
                # we should take care about some user specification
                # to avoid bbp hook just set tags['variational'] = True
                if not tags.get('trainable', True) or \
                        tags.get('variational', False):
                    return add_param(self, spec, shape, name, **tags)
                else:
                    # we declare that params we add next
                    # are the ones we need to fit the distribution
                    # they don't need to be regularized, strictly
                    tags['variational'] = True
                    tags['regularizable'] = False
                    param = self.approximation(self, spec, shape, name, **tags)
                    return param
            return wrapped

        def get_var_cost(self):
            """
            Returns total variational cost aka KL(q(theta|x)||p(theta)) for this layer.
            Alternatively, use function get_var_cost(layer) to get total cost for all layers below this one.
            """
            return self._bbwrap_var_cost

        cls.approximation = approximation
        cls._bbwrap_var_cost = 0
        cls.add_param = add_param_wrap(cls.add_param)
        cls.get_var_cost = get_var_cost
        return cls

    return decorator
    def nn_kl_divergence(self):
        """Returns the sum of the KL divergences of the model's BayesianModules, measured from
        their current posterior distribution of weights relative to a simpler, scale-mixture prior distribution of weights
        Parameters:
            N/a
        Returns torch.tensor with 0 dim.

        """
        return kl_divergence_from_nn(self)

    setattr(nn_class, "nn_kl_divergence", nn_kl_divergence)

    def sample_elbo(self,
                    inputs,
                    labels,
                    criterion,
                    sample_nbr,
                    complexity_cost_weight=1):

        """ Samples the ELBO Loss for a batch of data, consisting of inputs and their corresponding (by index) labels.
            The ELBO Loss is the sum of the KL Divergence of the model
            (explained above, interpreted as the "complexity part" of the loss)
            and the actual criterion (loss function) used to optimize the model
            (the performance part of the loss).
            As we are using variational inference, it takes several (quantified by the parameter sample_nbr) Monte Carlo
            samples of the weights in order to gather a better approximation of the loss.
            Parameters:
                inputs: torch.tensor -> the input data for the model
                labels: torch.tensor -> label data for the performance part of the loss calculation
                    The shape of the labels must match the label-parameter shape of the criterion (one-hot encoded or as index, if needed)
                criterion: torch.nn.Module, custom criterion (loss) function, or torch.nn.functional function -> criterion used to gather
                    the performance cost for the model
                sample_nbr: int -> the number of weight samples and predictions used in our Monte Carlo approach to
                    estimate the loss that will be backpropagated during optimization of the model.

        """

        loss = 0
        for _ in range(sample_nbr):
            outputs = self(inputs)
            loss += criterion(outputs, labels)
            loss += self.nn_kl_divergence() * complexity_cost_weight
        return loss / sample_nbr

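Editorial note (not part of the diff): the loop above computes a Monte Carlo estimate of the weighted ELBO objective, with N = sample_nbr, beta = complexity_cost_weight and l the criterion:

\mathcal{L}(\theta) \approx \frac{1}{N}\sum_{i=1}^{N}\Big[\, \ell\big(f_{w_i}(\text{inputs}),\ \text{labels}\big) \;+\; \beta\,\big(\log q(w_i \mid \theta) - \log p(w_i)\big) \Big], \qquad w_i \sim q(w \mid \theta)

where the log-ratio term is exactly what nn_kl_divergence() returns for one weight sample.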
    setattr(nn_class, "sample_elbo", sample_elbo)


    def freeze_model(self):
        """
        Freezes the model, making it predict using only the expected values of its BayesianModules' weight distributions
        """
        for module in self.modules():
            if isinstance(module, (BayesianModule)):
                module.freeze = True

    setattr(nn_class, "freeze", freeze_model)

    def unfreeze_model(self):
        """
        Unfreezes the model, letting it draw its weights, with uncertainty, from their corresponding distributions
        """

        for module in self.modules():
            if isinstance(module, (BayesianModule)):
                module.freeze = False

    setattr(nn_class, "unfreeze", unfreeze_model)
    return nn_class
Review comment: Since this file is converted from someone's module, I believe credit should be given to the original author.
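Finally, a hypothetical end-to-end sketch (not part of this PR) of how the decorator is meant to be used, assuming the classes and functions from this diff are importable:

import torch
import torch.nn as nn

@variational_estimator
class BayesianRegressor(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(BayesianLinear(4, 32), nn.ReLU(), BayesianLinear(32, 1))

    def forward(self, x):
        return self.net(x)

model = BayesianRegressor()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
x, y = torch.randn(64, 4), torch.randn(64, 1)

for step in range(100):
    optimizer.zero_grad()
    loss = model.sample_elbo(inputs=x, labels=y,
                             criterion=nn.MSELoss(),
                             sample_nbr=3,
                             complexity_cost_weight=1 / 64)   # KL term down-weighted per batch
    loss.backward()
    optimizer.step()

model.freeze()        # predict with posterior means only (deterministic)
pred = model(x)
model.unfreeze()      # back to stochastic weights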