Remove Theano from Week 5 #348

Merged: 3 commits, Feb 8, 2021
Changes from 1 commit
364 changes: 236 additions & 128 deletions week05_explore/bayes.py
@@ -1,153 +1,261 @@
"""
A single-file module that makes your lasagne network into a bayesian neural net.
Originally created by github.com/ferrine, rewritten by github.com/justheuristic for simplicity

See example in the notebook
"""

import torch
Collaborator: Since this file is converted from someone's module, I believe credit should be given to the original author.
import numpy as np
import torch.nn as nn
import torch.nn.functional as F


class BayesianModule(nn.Module):
    """
    Base class for BNN layers, used to enable Bayesian-specific behavior.
    """
    def __init__(self):
        super().__init__()


class GaussianVariational(nn.Module):
    #Samples weights for variational inference as in Weight Uncertainty in Neural Networks (the Bayes by Backprop paper)
Contributor: Needs space after #. Add link to the referred paper.

    #Calculates the variational posterior part of the complexity part of the loss
    def __init__(self, mu, rho):
        super().__init__()

        self.mu = nn.Parameter(mu)
        self.rho = nn.Parameter(rho)
        self.w = None
        self.sigma = None
        self.pi = np.pi
        self.normal = torch.distributions.Normal(0, 1)
    def sample(self):
        """
        Samples weights by sampling from a Normal distribution, multiplying by a sigma, which is
        a function of a trainable parameter, and adding the mean;
        sets those weights as the current ones.
        returns:
            torch.tensor with same shape as self.mu and self.rho
        """
        device = self.mu.device
        epsilon = self.normal.sample(self.mu.size()).to(device)
        self.sigma = torch.log(1 + torch.exp(self.rho)).to(device)
        self.w = self.mu + self.sigma * epsilon
        return self.w

    def log_posterior(self):
        """
        Calculates the log-likelihood of each of the sampled weights as a part of the complexity cost.
        returns:
            torch.tensor with shape []
        """
        assert (self.w is not None), "You can only have a log posterior for W if you've already sampled it"

        # log N(w | mu, sigma) = -log(sqrt(2*pi)) - log(sigma) - (w - mu)^2 / (2 * sigma^2)
        log_sqrt2pi = np.log(np.sqrt(2 * self.pi))
        log_posteriors = -log_sqrt2pi - torch.log(self.sigma) - (((self.w - self.mu) ** 2) / (2 * self.sigma ** 2))
        return log_posteriors.mean()
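
As a sanity check on the reparameterization above, here is a small usage sketch (the shapes and the rho value are made up for illustration; it assumes it is run below the definitions in this file, so torch and GaussianVariational are in scope):

# Minimal sketch: sigma = log(1 + exp(rho)) stays positive, and sample()
# returns a tensor shaped like mu via w = mu + sigma * epsilon.
mu = torch.zeros(2, 3)                    # hypothetical 2x3 weight matrix
rho = torch.full((2, 3), -5.0)            # softplus(-5) ~ 0.0067, so samples start close to mu
sampler = GaussianVariational(mu, rho)

w = sampler.sample()
print(w.shape)                            # torch.Size([2, 3])
print(sampler.log_posterior().shape)      # torch.Size([]) -- a scalar complexity term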


class ScaleMixturePrior(nn.Module):
    #Calculates a Scale Mixture Prior distribution for the prior part of the complexity cost, as in the Bayes by Backprop paper
Contributor: Use ''' doc strings. Add link to paper.

    def __init__(self, pi, sigma1, sigma2):
        super().__init__()

        self.pi = pi
        self.sigma1 = sigma1
        self.sigma2 = sigma2
        self.normal1 = torch.distributions.Normal(0, sigma1)
        self.normal2 = torch.distributions.Normal(0, sigma2)

    def log_prior(self, w):
        """
        Calculates the log-likelihood of each of the sampled weights relative to the prior distribution, as a part of the complexity cost.
        returns:
            torch.tensor with shape []
Contributor: what shape?

"""
std = T.log1p(T.exp(rho)) # rho to std
return self.log_normal(weights, mean, std)

def __call__(self, layer, spec, shape, name=None, **tags):
# case when user uses default init specs
assert tags.get(
'variational', False), "Please declare param as variational to avoid confusion"

if not isinstance(spec, dict):
initial_rho = np.log(np.expm1(self.prior_std)) # std to rho
assert np.isfinite(initial_rho), "too small std to initialize correctly. Please pass explicit"\
" initializer (dict with {'mu':mu_init, 'rho':rho_init})."
spec = {'mu': spec, 'rho': init.Constant(initial_rho)}

mu_spec, rho_spec = spec['mu'], spec['rho']

rho = layer.add_param(
rho_spec, shape, name=(
name or 'unk') + '.rho', **tags)
mean = layer.add_param(
mu_spec, shape, name=(
name or 'unk') + '.mu', **tags)

# Reparameterization trick
e = self.srng.normal(shape, std=1)
W = mean + T.log1p(T.exp(rho)) * e

# KL divergence KL(q,p) = E_(w~q(w|x)) [log q(w|x) - log P(w)] aka
# variational cost
q_p = T.sum(
self.log_posterior_approx(W, mean, rho) -
self.log_prior(W)
)

# accumulate variational cost
layer._bbwrap_var_cost += q_p
return W


def get_var_cost(layer_or_layers, treat_as_input=None):
"""
Returns total variational cost aka KL(q(theta|x)||p(theta)) for all layers in the network
prob_n1 = torch.exp(self.normal1.log_prob(w))
prob_n2 = torch.exp(self.normal2.log_prob(w))
prior_pdf = (self.pi * prob_n1 + (1 - self.pi) * prob_n2)

return (torch.log(prior_pdf)).mean()
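
For intuition, a short sketch of what the mixture prior evaluates to (the values are illustrative only; it assumes ScaleMixturePrior above is in scope): the prior density is pi * N(0, sigma1) + (1 - pi) * N(0, sigma2), and log_prior returns the mean log-density over all entries of w.

# Sketch: evaluate the scale-mixture prior at a few weight values.
prior = ScaleMixturePrior(pi=0.5, sigma1=1.0, sigma2=0.002)
w = torch.tensor([0.0, 0.1, 1.0])
print(prior.log_prior(w))   # 0-dim tensor: mean of log(0.5 * N(w; 0, 1) + 0.5 * N(w; 0, 0.002))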


class BayesianLinear(BayesianModule):
    """
    Bayesian Linear layer, implements the linear layer proposed in Weight Uncertainty in Neural Networks
Contributor: link to paper

    (Bayes by Backprop paper).
    Its objective is to be interoperable with the torch nn.Module API, so that it can even be chained in nn.Sequential models with other non-Bayesian layers.

    parameters:
        in_features: int -> incoming features for the layer
        out_features: int -> output features for the layer
        bias: bool -> whether the bias will exist (True) or be set to zero (False)
        prior_sigma_1: float -> prior sigma of mixture prior distribution 1
        prior_sigma_2: float -> prior sigma of mixture prior distribution 2
        prior_pi: float -> pi of the scale mixture prior
        freeze: bool -> whether the model will start with frozen (deterministic) weights, or not

    """
    def __init__(self,
                 in_features,
                 out_features,
                 bias=True,
                 prior_sigma_1=1,
                 prior_sigma_2=0.002,
                 prior_pi=0.5,
                 freeze=False):
        super().__init__()

        #our main parameters
Contributor: Spaces after #. Comment is redundant.

        self.in_features = in_features
        self.out_features = out_features
        self.bias = bias
        self.freeze = freeze

        #parameters for the scale mixture prior
Contributor: Space

        self.prior_sigma_1 = prior_sigma_1
        self.prior_sigma_2 = prior_sigma_2
        self.prior_pi = prior_pi

        # Variational weight parameters and sample
        self.weight_mu = nn.Parameter(torch.Tensor(out_features, in_features).uniform_(-0.2, 0.2))
        self.weight_rho = nn.Parameter(torch.Tensor(out_features, in_features).uniform_(-5, -4))
        self.weight_sampler = GaussianVariational(self.weight_mu, self.weight_rho)

        # Variational bias parameters and sample
        self.bias_mu = nn.Parameter(torch.Tensor(out_features).uniform_(-0.2, 0.2))
        self.bias_rho = nn.Parameter(torch.Tensor(out_features).uniform_(-5, -4))
        self.bias_sampler = GaussianVariational(self.bias_mu, self.bias_rho)

        # Priors (as BBP paper)
        self.weight_prior_dist = ScaleMixturePrior(self.prior_pi, self.prior_sigma_1, self.prior_sigma_2)
        self.bias_prior_dist = ScaleMixturePrior(self.prior_pi, self.prior_sigma_1, self.prior_sigma_2)
        self.log_prior = 0
        self.log_variational_posterior = 0

    def forward(self, x):
        # Sample the weights and forward it

        #if the model is frozen, return frozen
Contributor: Comment repeats code. Remove it.

        if self.freeze:
Contributor: It is a boolean variable. Name it is_frozen.

            return self.forward_frozen(x)

Collaborator: redundant new line

        w = self.weight_sampler.sample()

        if self.bias:
            b = self.bias_sampler.sample()
            b_log_posterior = self.bias_sampler.log_posterior()
            b_log_prior = self.bias_prior_dist.log_prior(b)
        else:
            b = torch.zeros((self.out_features))
            b_log_posterior = 0
            b_log_prior = 0

        # Get the complexity cost
        self.log_variational_posterior = self.weight_sampler.log_posterior() + b_log_posterior
        self.log_prior = self.weight_prior_dist.log_prior(w) + b_log_prior

        return F.linear(x, w, b)

    def forward_frozen(self, x):
        """
        Computes the feedforward operation with the expected values for weights and biases
        """
        if self.bias:
            return F.linear(x, self.weight_mu, self.bias_mu)
        else:
            return F.linear(x, self.weight_mu, torch.zeros(self.out_features))
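
A brief usage sketch (layer sizes are arbitrary, chosen only for illustration): each forward pass through a BayesianLinear layer re-samples the weights, so repeated calls on the same input give different outputs, while forward_frozen uses only the means and is deterministic.

layer = BayesianLinear(in_features=4, out_features=2)
x = torch.randn(8, 4)                            # a batch of 8 hypothetical inputs

y1, y2 = layer(x), layer(x)                      # stochastic: weights are re-sampled each call
print(torch.allclose(y1, y2))                    # almost surely False

print(torch.allclose(layer.forward_frozen(x),
                     layer.forward_frozen(x)))   # True: uses weight_mu / bias_mu only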

Collaborator: redundant new line


def kl_divergence_from_nn(model):
    """
    Gathers the KL Divergence from a nn.Module object.
    Works by gathering the KL divergence of each Bayesian layer and summing it, doing nothing with the non-Bayesian ones.
    """
    kl_divergence = 0
    for module in model.modules():
        if isinstance(module, (BayesianModule)):
            kl_divergence += module.log_variational_posterior - module.log_prior
    return kl_divergence
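
Note that log_variational_posterior and log_prior are populated inside BayesianLinear.forward, so the gathered KL term is only meaningful after at least one forward pass; a small sketch (the model shape is arbitrary and for illustration only):

model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), BayesianLinear(8, 2))
print(kl_divergence_from_nn(model))        # 0 -- no forward pass has been made yet

out = model(torch.randn(16, 4))            # forward pass samples weights and stores the log-probs
print(kl_divergence_from_nn(model))        # 0-dim tensor: log q(w) - log p(w) summed over Bayesian layers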



def variational_estimator(nn_class):
    """
    This decorator adds some utility methods to a nn.Module, in order to facilitate handling Bayesian Deep Learning features.
    Parameters:
        nn_class: torch.nn.Module -> Torch neural network module
    Returns a nn.Module with methods for:
        (1) Gathering the KL Divergence along its BayesianModules;
        (2) Sampling the ELBO Loss along its variational inferences (helps training);
        (3) Freezing the model, in order to predict using only the means of its weight distributions.
    """

    def nn_kl_divergence(self):
        """
        Returns the sum of the KL divergences of each of the BayesianModules of the model, i.e. of
        their current posterior distribution of weights relative to a (simpler) scale-mixture prior distribution of weights.
        Parameters:
            N/a
        Returns torch.tensor with 0 dim.
        """
        return kl_divergence_from_nn(self)

setattr(nn_class, "nn_kl_divergence", nn_kl_divergence)

    def sample_elbo(self,
                    inputs,
                    labels,
                    criterion,
                    sample_nbr,
                    complexity_cost_weight=1):
        """
        Samples the ELBO Loss for a batch of data, consisting of inputs and corresponding-by-index labels.
        The ELBO Loss is the sum of the KL Divergence of the model
        (explained above, interpreted as the "complexity part" of the loss)
        and the actual criterion (loss function) used to optimize the model
        (the performance part of the loss).
        As we are using variational inference, it takes several (quantified by the parameter sample_nbr) Monte-Carlo
        samples of the weights in order to gather a better approximation of the loss.
        Parameters:
            inputs: torch.tensor -> the input data for the model
            labels: torch.tensor -> label data for the performance part of the loss calculation
                The shape of the labels must match the label-parameter shape of the criterion (one-hot encoded or as indices, as needed)
            criterion: torch.nn.Module, custom criterion (loss) function, or torch.nn.functional function -> criterion used to gather
                the performance cost of the model
            sample_nbr: int -> the number of weight samplings and predictions done in our Monte-Carlo approach to
                gather the loss to be .backward()-ed in the optimization of the model
        """
        loss = 0
        for _ in range(sample_nbr):
            outputs = self(inputs)
            # accumulate performance cost and KL term; both are averaged over sample_nbr below
            loss += criterion(outputs, labels)
            loss += self.nn_kl_divergence() * complexity_cost_weight
        return loss / sample_nbr

setattr(nn_class, "sample_elbo", sample_elbo)


    def freeze_model(self):
        """
        Freezes the model by making it predict using only the expected values of its BayesianModules' weight distributions
        """
        for module in self.modules():
            if isinstance(module, (BayesianModule)):
                module.freeze = True

    setattr(nn_class, "freeze", freeze_model)

    def unfreeze_model(self):
        """
        Unfreezes the model by letting it draw its weights with uncertainty from their corresponding distributions
        """
        for module in self.modules():
            if isinstance(module, (BayesianModule)):
                module.freeze = False

    setattr(nn_class, "unfreeze", unfreeze_model)
    return nn_class
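
To tie the pieces together, a hedged end-to-end sketch (the class name, layer sizes, fake data, and optimizer settings below are illustrative, not part of this file): decorate an nn.Module with @variational_estimator, train it with sample_elbo, then freeze it for deterministic predictions.

@variational_estimator
class TinyBNN(nn.Module):                       # hypothetical example network
    def __init__(self):
        super().__init__()
        self.fc1 = BayesianLinear(4, 16)
        self.fc2 = BayesianLinear(16, 2)

    def forward(self, x):
        return self.fc2(F.relu(self.fc1(x)))


if __name__ == "__main__":
    model = TinyBNN()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    x, y = torch.randn(32, 4), torch.randint(0, 2, (32,))   # fake data, for illustration only

    for step in range(100):
        optimizer.zero_grad()
        # ELBO = average over 3 weight samples of [cross-entropy + weighted KL complexity cost]
        loss = model.sample_elbo(inputs=x, labels=y,
                                 criterion=F.cross_entropy,
                                 sample_nbr=3,
                                 complexity_cost_weight=1. / 32)
        loss.backward()
        optimizer.step()

    model.freeze()                              # predictions now use only the weight means
    deterministic_logits = model(x)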