Remove Theano from Week 5 #348
Changes from 1 commit
@@ -1,153 +1,261 @@
""" | ||
A single-file module that makes your lasagne network into a bayesian neural net. | ||
Originally created by github.com/ferrine , rewritten by github.com/justheuristic for simplicity | ||
|
||
See example in the notebook | ||
""" | ||
|
||
import torch | ||
import numpy as np | ||
import torch.nn as nn | ||
import torch.nn.functional as F | ||
|
||
from theano import tensor as T | ||
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams | ||
|
||
import lasagne | ||
from lasagne import init | ||
from lasagne.random import get_rng | ||
class BayesianModule(nn.Module):
    """
    Base class for Bayesian layers, used to enable BNN-specific behavior.
    """
    def __init__(self):
        super().__init__()

from functools import wraps

__all__ = ['NormalApproximation', 'get_var_cost', 'bbpwrap']
class GaussianVariational(nn.Module):
    #Samples weights for variational inference as in Weight Uncertainty in Neural Networks (Bayes by Backprop paper)

    Review comment: Needs space after #

    #Calculates the variational posterior part of the complexity cost of the loss
    def __init__(self, mu, rho):
        super().__init__()

        self.mu = nn.Parameter(mu)
        self.rho = nn.Parameter(rho)
        self.w = None
        self.sigma = None
        self.pi = np.pi
        self.normal = torch.distributions.Normal(0, 1)
class NormalApproximation(object):
    def __init__(self, mu=0, std=np.exp(-3), seed=None):
    def sample(self):
        """
        Approximation that samples network weights from factorized normal distribution.

        :param mu: prior mean for gaussian weights
        :param std: prior std for gaussian weights
        :param seed: random seed
        Samples weights by drawing from a standard Normal distribution, multiplying by sigma (a
        function of the trainable parameter rho) and adding the mean, then
        sets those weights as the current ones.
        returns:
            torch.tensor with the same shape as self.mu and self.rho
        """
        self.prior_mu = mu
        self.prior_std = std
        self.srng = RandomStreams(seed or get_rng().randint(1, 2147462579))
        device = self.mu.device
        epsilon = self.normal.sample(self.mu.size()).to(device)
        self.sigma = torch.log(1 + torch.exp(self.rho)).to(device)
        self.w = self.mu + self.sigma * epsilon
        return self.w

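For orientation (not part of this diff): a minimal, self-contained sketch of the reparameterization trick that sample() implements, using hypothetical toy shapes.

import torch

# w = mu + softplus(rho) * eps,  eps ~ N(0, 1)  -- gradients flow through mu and rho
mu = torch.zeros(3, 2, requires_grad=True)
rho = torch.full((3, 2), -4.0, requires_grad=True)    # softplus(-4) ~ 0.018, a small initial sigma
eps = torch.randn(3, 2)                               # the noise is sampled outside the graph
sigma = torch.log1p(torch.exp(rho))                   # softplus keeps sigma positive
w = mu + sigma * eps
w.sum().backward()                                    # mu.grad and rho.grad are now populated
print(w.shape)                                        # torch.Size([3, 2])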
    def log_normal(self, x, mean, std, eps=0.0):
        """computes log-proba of normal distribution"""
        std += eps
        return - 0.5 * np.log(2 * np.pi) - T.log(T.abs_(std)) - \
            (x - mean) ** 2 / (2 * std ** 2)

    def log_posterior(self):

    def log_prior(self, weights):
        """
        Logarithm of prior probabilities for weights:
        log P(weights) aka log P(theta)
        Calculates the log-likelihood of each of the sampled weights as a part of the complexity cost
        returns:
            torch.tensor with shape []
        """
        (Qwasser marked this conversation as resolved.)

        return self.log_normal(weights, self.prior_mu, self.prior_std)

    def log_posterior_approx(self, weights, mean, rho):
        assert (self.w is not None), "You can only have a log posterior for W if you've already sampled it"

        log_sqrt2pi = np.log(np.sqrt(2 * self.pi))
        log_posteriors = -log_sqrt2pi - torch.log(self.sigma) - (((self.w - self.mu) ** 2) / (2 * self.sigma ** 2))
        return log_posteriors.mean()

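Side note (not part of the diff): the closed-form expression used for the variational posterior above is just the Gaussian log-density, which can be sanity-checked against torch.distributions:

import numpy as np
import torch

mu, sigma, w = torch.tensor(0.3), torch.tensor(0.8), torch.tensor(0.1)
by_hand = -np.log(np.sqrt(2 * np.pi)) - torch.log(sigma) - (w - mu) ** 2 / (2 * sigma ** 2)
by_torch = torch.distributions.Normal(mu, sigma).log_prob(w)
print(torch.allclose(by_hand, by_torch))   # True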
class ScaleMixturePrior(nn.Module):
    #Calculates a Scale Mixture Prior distribution for the prior part of the complexity cost on the Bayes by Backprop paper

    Review comment: Use ''' doc strings. Add link to paper.

    def __init__(self, pi, sigma1, sigma2):
        super().__init__()

        self.pi = pi
        self.sigma1 = sigma1
        self.sigma2 = sigma2
        self.normal1 = torch.distributions.Normal(0, sigma1)
        self.normal2 = torch.distributions.Normal(0, sigma2)

    def log_prior(self, w):
        """
        Logarithm of ELBO on posterior probabilities:
        log q(weights|learned mu and rho) aka log q(theta|x)
        Calculates the log-likelihood of each of the sampled weights relative to the prior distribution, as a part of the complexity cost
        returns:
            torch.tensor with shape []

        Review comment: what shape?

        """
        std = T.log1p(T.exp(rho))  # rho to std
        return self.log_normal(weights, mean, std)

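For context (not part of the diff): the scale mixture prior density evaluated by this class is pi * N(0, sigma1) + (1 - pi) * N(0, sigma2). A minimal standalone check with torch.distributions, using hypothetical values:

import torch

pi, sigma1, sigma2 = 0.5, 1.0, 0.002
w = torch.randn(5)

n1 = torch.distributions.Normal(0.0, sigma1)
n2 = torch.distributions.Normal(0.0, sigma2)
mixture_pdf = pi * torch.exp(n1.log_prob(w)) + (1 - pi) * torch.exp(n2.log_prob(w))
print(torch.log(mixture_pdf).mean())   # same quantity ScaleMixturePrior.log_prior(w) returns later in this diff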
    def __call__(self, layer, spec, shape, name=None, **tags):
        # case when user uses default init specs
        assert tags.get(
            'variational', False), "Please declare param as variational to avoid confusion"

        if not isinstance(spec, dict):
            initial_rho = np.log(np.expm1(self.prior_std))  # std to rho
            assert np.isfinite(initial_rho), "too small std to initialize correctly. Please pass explicit"\
                " initializer (dict with {'mu':mu_init, 'rho':rho_init})."
            spec = {'mu': spec, 'rho': init.Constant(initial_rho)}

        mu_spec, rho_spec = spec['mu'], spec['rho']

        rho = layer.add_param(
            rho_spec, shape, name=(
                name or 'unk') + '.rho', **tags)
        mean = layer.add_param(
            mu_spec, shape, name=(
                name or 'unk') + '.mu', **tags)

        # Reparameterization trick
        e = self.srng.normal(shape, std=1)
        W = mean + T.log1p(T.exp(rho)) * e

        # KL divergence KL(q,p) = E_(w~q(w|x)) [log q(w|x) - log P(w)] aka
        # variational cost
        q_p = T.sum(
            self.log_posterior_approx(W, mean, rho) -
            self.log_prior(W)
        )

        # accumulate variational cost
        layer._bbwrap_var_cost += q_p
        return W

def get_var_cost(layer_or_layers, treat_as_input=None):
    """
    Returns total variational cost aka KL(q(theta|x)||p(theta)) for all layers in the network
        prob_n1 = torch.exp(self.normal1.log_prob(w))
        prob_n2 = torch.exp(self.normal2.log_prob(w))
        prior_pdf = (self.pi * prob_n1 + (1 - self.pi) * prob_n2)

        return (torch.log(prior_pdf)).mean()

    :param layer_or_layers: top layer(s) of your network, just like with lasagne.layers.get_output
    :param treat_as_input: don't accumulate over layers below these layers. See same param for lasagne.layers.get_all_layers

    Alternatively, one can manually get weights for one layer via layer.get_var_cost()
class BayesianLinear(BayesianModule):
    """
    cost = 0
    for layer in lasagne.layers.get_all_layers(
            layer_or_layers, treat_as_input):
        if hasattr(layer, 'get_var_cost'):
            # if layer is bayesian or pretends so
            cost += layer.get_var_cost()
    return cost
    Bayesian Linear layer, implements the linear layer proposed in Weight Uncertainty in Neural Networks

    Review comment: link to paper

    (Bayes by Backprop paper).
    It is meant to interoperate with the torch nn.Module API, and can even be chained in nn.Sequential models with layers from outside this module.

    parameters:
        in_features: int -> incoming features for the layer
        out_features: int -> output features for the layer
        bias: bool -> whether the bias will exist (True) or be set to zero (False)
        prior_sigma_1: float -> prior sigma of mixture-prior component 1
        prior_sigma_2: float -> prior sigma of mixture-prior component 2
        prior_pi: float -> pi of the scale mixture prior
        freeze: bool -> whether the model will start with frozen (deterministic) weights, or not

    """
    def __init__(self,
                 in_features,
                 out_features,
                 bias=True,
                 prior_sigma_1=1,
                 prior_sigma_2=0.002,
                 prior_pi=0.5,
                 freeze=False):
        super().__init__()

        #our main parameters

        Review comment: Spaces after #. Comment is redundant.

        self.in_features = in_features
        self.out_features = out_features
        self.bias = bias
        self.freeze = freeze

        #parameters for the scale mixture prior

        Review comment: Space

        self.prior_sigma_1 = prior_sigma_1
        self.prior_sigma_2 = prior_sigma_2
        self.prior_pi = prior_pi

        # Variational weight parameters and sample
        self.weight_mu = nn.Parameter(torch.Tensor(out_features, in_features).uniform_(-0.2, 0.2))
        self.weight_rho = nn.Parameter(torch.Tensor(out_features, in_features).uniform_(-5, -4))
        self.weight_sampler = GaussianVariational(self.weight_mu, self.weight_rho)

        # Variational bias parameters and sample
        self.bias_mu = nn.Parameter(torch.Tensor(out_features).uniform_(-0.2, 0.2))
        self.bias_rho = nn.Parameter(torch.Tensor(out_features).uniform_(-5, -4))
        self.bias_sampler = GaussianVariational(self.bias_mu, self.bias_rho)

        # Priors (as in the BBP paper)
        self.weight_prior_dist = ScaleMixturePrior(self.prior_pi, self.prior_sigma_1, self.prior_sigma_2)
        self.bias_prior_dist = ScaleMixturePrior(self.prior_pi, self.prior_sigma_1, self.prior_sigma_2)
        self.log_prior = 0
        self.log_variational_posterior = 0

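A small aside on the initialization above (an observation, not part of the diff): rho ~ U(-5, -4) corresponds to a tiny initial posterior sigma, since sigma = softplus(rho):

import torch

rho = torch.tensor([-5.0, -4.0])
print(torch.log1p(torch.exp(rho)))   # tensor([0.0067, 0.0181]) -> initial sigma roughly in [0.007, 0.018]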
    def forward(self, x):
        # Sample the weights and forward it

        (Qwasser marked this conversation as resolved.)

        #if the model is frozen, return frozen

        Review comment: Comment repeats code. Remove it

        if self.freeze:

            Review comment: It is boolean variable. Name it is_frozen.

            return self.forward_frozen(x)


        Review comment: redundant new line

        w = self.weight_sampler.sample()

        if self.bias:
            b = self.bias_sampler.sample()
            b_log_posterior = self.bias_sampler.log_posterior()
            b_log_prior = self.bias_prior_dist.log_prior(b)

        else:
            b = torch.zeros((self.out_features))
            b_log_posterior = 0
            b_log_prior = 0

        # Get the complexity cost
        self.log_variational_posterior = self.weight_sampler.log_posterior() + b_log_posterior
        self.log_prior = self.weight_prior_dist.log_prior(w) + b_log_prior

        return F.linear(x, w, b)

def bbpwrap(approximation=NormalApproximation()):
    def forward_frozen(self, x):
        """
        Computes the feedforward operation with the expected values for weights and biases
        """
        if self.bias:
            return F.linear(x, self.weight_mu, self.bias_mu)
        else:
            return F.linear(x, self.weight_mu, torch.zeros(self.out_features))


    Review comment: redundant new line

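For context (not part of this diff), a hypothetical usage sketch of the BayesianLinear class above: repeated forward calls resample weights, while the frozen path is deterministic.

import torch

layer = BayesianLinear(4, 2)                 # class from this diff
x = torch.randn(8, 4)

y1, y2 = layer(x), layer(x)                  # weights are resampled on every call
print(torch.allclose(y1, y2))                # False (almost surely)

layer.freeze = True                          # use the posterior means only
print(torch.allclose(layer(x), layer(x)))    # True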
def kl_divergence_from_nn(model):

    """
    Gathers the KL Divergence from a nn.Module object
    Works by gathering each Bayesian layer's KL divergence and summing it, doing nothing with the non-Bayesian ones
    """
    A decorator that makes an arbitrary lasagne layer into a bayesian network layer:
    BayesDenseLayer = bbwrap()(DenseLayer)
    or more verbosely,
    @bbpwrap(NormalApproximation(pstd=0.01))
    BayesDenseLayer(DenseLayer):
        pass
    kl_divergence = 0
    for module in model.modules():
        if isinstance(module, (BayesianModule)):
            kl_divergence += module.log_variational_posterior - module.log_prior
    return kl_divergence

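A hypothetical usage sketch (not part of the diff), assuming the BayesianLinear and kl_divergence_from_nn defined in this file are importable; non-Bayesian layers are simply skipped:

import torch
import torch.nn as nn

model = nn.Sequential(
    BayesianLinear(4, 16),   # Bayesian layer: contributes log q(w) - log p(w)
    nn.ReLU(),
    nn.Linear(16, 2),        # plain layer: ignored by kl_divergence_from_nn
)

_ = model(torch.randn(8, 4))            # forward pass samples weights and fills the per-layer log-probs
print(kl_divergence_from_nn(model))     # 0-dim tensor: summed KL contribution of the Bayesian layers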
def variational_estimator(nn_class):
    """
    This decorator adds some utility methods to a nn.Module, in order to facilitate handling of Bayesian Deep Learning features
    Parameters:
        nn_class: torch.nn.Module -> Torch neural network module
    Returns a nn.Module with methods for:
        (1) Gathering the KL Divergence along its BayesianModules;
        (2) Sampling the ELBO Loss along its variational inferences (helps training)
        (3) Freezing the model, in order to predict using only the means of its weight distributions
    """

    def decorator(cls):
        def add_param_wrap(add_param):
            @wraps(add_param)
            def wrapped(self, spec, shape, name=None, **tags):
                # we should take care about some user specification
                # to avoid bbp hook just set tags['variational'] = True
                if not tags.get('trainable', True) or \
                        tags.get('variational', False):
                    return add_param(self, spec, shape, name, **tags)
                else:
                    # we declare that params we add next
                    # are the ones we need to fit the distribution
                    # they don't need to be regularized, strictly
                    tags['variational'] = True
                    tags['regularizable'] = False
                    param = self.approximation(self, spec, shape, name, **tags)
                    return param
            return wrapped

        def get_var_cost(self):
            """
            Returns total variational cost aka KL(q(theta|x)||p(theta)) for this layer.
            Alternatively, use function get_var_cost(layer) to get total cost for all layers below this one.
            """
            return self._bbwrap_var_cost

        cls.approximation = approximation
        cls._bbwrap_var_cost = 0
        cls.add_param = add_param_wrap(cls.add_param)
        cls.get_var_cost = get_var_cost
        return cls

    return decorator
    def nn_kl_divergence(self):
        """Returns the sum of the KL divergences of the model's BayesianModules, measured from
        their current posterior distribution of weights relative to a simpler, scale-mixture prior distribution of weights
        Parameters:
            N/a
        Returns torch.tensor with 0 dim.

        """
        return kl_divergence_from_nn(self)

    setattr(nn_class, "nn_kl_divergence", nn_kl_divergence)

    def sample_elbo(self,
                    inputs,
                    labels,
                    criterion,
                    sample_nbr,
                    complexity_cost_weight=1):

        """ Samples the ELBO Loss for a batch of data, consisting of inputs and their corresponding (by index) labels.
            The ELBO Loss is the sum of the KL Divergence of the model
            (explained above, interpreted as the "complexity part" of the loss)
            and the actual criterion (loss function) used to optimize the model
            (the performance part of the loss).
            As we are using variational inference, it takes several (quantified by the parameter sample_nbr) Monte Carlo
            samples of the weights in order to gather a better approximation of the loss.
            Parameters:
                inputs: torch.tensor -> the input data for the model
                labels: torch.tensor -> label data for the performance part of the loss calculation
                    The shape of the labels must match the label-parameter shape of the criterion (one-hot encoded or as index, if needed)
                criterion: torch.nn.Module, custom criterion (loss) function, or torch.nn.functional function -> criterion used to gather
                    the performance cost for the model
                sample_nbr: int -> the number of weight samples and predictions used in our Monte Carlo approach to
                    estimate the loss that will be backpropagated during optimization of the model.

        """

        loss = 0
        for _ in range(sample_nbr):
            outputs = self(inputs)
            loss += criterion(outputs, labels)
            loss += self.nn_kl_divergence() * complexity_cost_weight
        return loss / sample_nbr

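Editorial note (not part of the diff): the loop above computes a Monte Carlo estimate of the weighted ELBO objective, with N = sample_nbr, beta = complexity_cost_weight and l the criterion:

\mathcal{L}(\theta) \approx \frac{1}{N}\sum_{i=1}^{N}\Big[\, \ell\big(f_{w_i}(\text{inputs}),\ \text{labels}\big) \;+\; \beta\,\big(\log q(w_i \mid \theta) - \log p(w_i)\big) \Big], \qquad w_i \sim q(w \mid \theta)

where the log-ratio term is exactly what nn_kl_divergence() returns for one weight sample.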
    setattr(nn_class, "sample_elbo", sample_elbo)


    def freeze_model(self):
        """
        Freezes the model, making it predict using only the expected values of its BayesianModules' weight distributions
        """
        for module in self.modules():
            if isinstance(module, (BayesianModule)):
                module.freeze = True

    setattr(nn_class, "freeze", freeze_model)

    def unfreeze_model(self):
        """
        Unfreezes the model, letting it draw its weights, with uncertainty, from their corresponding distributions
        """

        for module in self.modules():
            if isinstance(module, (BayesianModule)):
                module.freeze = False

    setattr(nn_class, "unfreeze", unfreeze_model)
    return nn_class
Review comment: Since this file is converted from someone's module, I believe credit should be given to the original author.
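Finally, a hypothetical end-to-end sketch (not part of this PR) of how the decorator is meant to be used, assuming the classes and functions from this diff are importable:

import torch
import torch.nn as nn

@variational_estimator
class BayesianRegressor(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(BayesianLinear(4, 32), nn.ReLU(), BayesianLinear(32, 1))

    def forward(self, x):
        return self.net(x)

model = BayesianRegressor()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
x, y = torch.randn(64, 4), torch.randn(64, 1)

for step in range(100):
    optimizer.zero_grad()
    loss = model.sample_elbo(inputs=x, labels=y,
                             criterion=nn.MSELoss(),
                             sample_nbr=3,
                             complexity_cost_weight=1 / 64)   # KL term down-weighted per batch
    loss.backward()
    optimizer.step()

model.freeze()        # predict with posterior means only (deterministic)
pred = model(x)
model.unfreeze()      # back to stochastic weights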