Refactor mygrad.nnet.layers #399

Draft: wants to merge 8 commits into base: master
Changes from all commits
2 changes: 1 addition & 1 deletion src/mygrad/__init__.py
@@ -24,7 +24,7 @@
 from mygrad.math.sequential.funcs import *
 from mygrad.math.sequential.funcs import max, min
 from mygrad.math.trigonometric.funcs import *
-from mygrad.nnet.layers.utils import sliding_window_view
+from mygrad.nnet.layers.operations.utils import sliding_window_view
 from mygrad.no_grad_funcs import *
 from mygrad.tensor_creation.funcs import *
 from mygrad.tensor_manip.array_shape.funcs import *
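Because `mygrad/__init__.py` re-exports the function from its new home, the public name `mygrad.sliding_window_view` should be unaffected by this move; only the internal module path changes. A minimal sanity check, assuming this branch is installed (the check itself is not part of the PR):

# Hypothetical check (not part of this PR): the top-level name and the new
# internal module should expose the same function object.
import mygrad as mg
from mygrad.nnet.layers.operations.utils import sliding_window_view

assert mg.sliding_window_view is sliding_window_view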
13 changes: 0 additions & 13 deletions src/mygrad/nnet/layers/__init__.py
@@ -1,13 +0,0 @@
-from .batchnorm import batchnorm
-from .conv import conv_nd
-from .pooling import max_pool
-
-__all__ = ["conv_nd", "max_pool", "batchnorm"]
-
-
-try:
-    from .gru import gru
-
-    __all__ += ["gru"]
-except ImportError:  # pragma: no cover
-    pass
197 changes: 42 additions & 155 deletions src/mygrad/nnet/layers/batchnorm.py
@@ -1,173 +1,60 @@
-from typing import Optional, Tuple, Union
-
 import numpy as np

 from mygrad import Tensor
-from mygrad.operation_base import Operation
-from mygrad.typing import ArrayLike
-
-__all__ = ["batchnorm"]
+from mygrad.tensor_creation.funcs import ones, zeros
+from mygrad.nnet.layers.operations.batchnorm import batchnorm as batchnorm_op


-# TODO: Remove affine parameters from Operation
-class BatchNorm(Operation):
-    """
-    Attributes
-    ----------
-    mean : numpy.ndarray
-    var : numpy.ndarray
+class BatchNorm:
+    """ A batch normalization layer.

-    Notes
-    -----
-    `mean` and `var` are bound as instance-attributes upon
-    calling the batch-norm instance.
+    This class will perform an n-dimensional batch normalization operation on an
+    (N, D, ...)-shaped tensor scaled by γ of shape (D, ...) and shifted by β of shape (D, ...).
     """

-    def __call__(self, x, gamma, beta, *, eps):
-        """
-        y(x) = (x - E[x]) / sqrt(Var[x} + eps)
-        batchnorm(x) = gamma * y(x) + beta
+    def __init__(self, input_channels: int, momentum: float = 0.1):
+        """ Initialize a batch normalization layer.

         Parameters
         ----------
-        x : mygrad.Tensor
-        gamma : Optional[mygrad.Tensor]
-        beta : Optional[mygrad.Tensor]
-        eps : Real
-            A small non-negative number.
-
-        Returns
-        -------
-        numpy.ndarray
+        input_channels : int
+            The number of channels of the data to be batch-normalized.
+        momentum : float, optional (default=0.1)
+            The momentum value used to maintain moving averages.
         """
-        normed_dims = tuple(i for i in range(x.ndim) if i != 1)
-        keepdims_shape = tuple(1 if n != 1 else d for n, d in enumerate(x.shape))
-
-        self.variables = tuple(i for i in (x, gamma, beta))
-
-        if gamma.size == 0:
-            gamma = None
-        if beta.size == 0:
-            beta = None
+        self.gamma = ones((1, input_channels), dtype=np.float32)
+        self.beta = zeros((1, input_channels), dtype=np.float32)
+        self.moving_mean = np.zeros((1, input_channels), dtype=np.float32)
+        self.moving_variance = np.zeros((1, input_channels), dtype=np.float32)
+        self.momentum = momentum
+        self.input_channels = input_channels

-        self.gamma = gamma
-        self.beta = beta
+    def __call__(self, x: Tensor, test: bool = False) -> Tensor:
+        """ Perform the forward-pass of n-dimensional batch normalization over axis 1 on `x`.

-        x = x.data
-        self.x_norm = None # required for backprop through gamma
-        self.mean = x.mean(axis=normed_dims)
-        self.var = x.var(axis=normed_dims)
-
-        if eps:
-            self.var += eps
-
-        y = x - self.mean.reshape(keepdims_shape)
-        self._std = np.sqrt(self.var).reshape(keepdims_shape) # sqrt(var + eps)
-        y /= self._std
-        self.x_norm = y
-        # optional affine transformation
-        if gamma is not None:
-            gamma = gamma.data
-            # must copy `y` to prevent mutation of `self.x_norm`
-            y = y * gamma.reshape(keepdims_shape)
-
-        if beta is not None:
-            beta = beta.data
-            y = y + beta.reshape(keepdims_shape)
-        return y
-
-    def backward_var(self, grad, index, **kwargs):
-        x = self.variables[0].data
-        if index == 0: # backprop through x
-            normed_dims = tuple(i for i in range(x.ndim) if i != 1)
+        Parameters
+        ----------
+        x : Union[numpy.ndarray, mygrad.Tensor], shape=(N, D, ...)
+            The data to normalize.
+        test : boolean, optional (default=False)
+            Determines whether the layer is being used at training time. The mean and variance
+            will be computed for the batch during training, while averaged batch statistics will
+            be used at test time.
+        """
+        if test:
+            # use the averaged batch statistics from training rather than computing them on a test batch
             keepdims_shape = tuple(1 if n != 1 else d for n, d in enumerate(x.shape))
-            N = x.size / x.shape[1]
+            x = x - self.moving_mean.reshape(keepdims_shape)
+            x /= np.sqrt(self.moving_variance.reshape(keepdims_shape) + 1e-08)
+            return self.gamma * x + self.beta

-            # all sums carried over non-channel dims
-            # (1/sqrt(var + eps)) * [dL - dL.mean() - (1/N)*x_norm*(x_norm @ dL)]
-            grad_ = grad - np.mean(grad, axis=normed_dims, keepdims=True)
+        x_norm = batchnorm_op(x, gamma=self.gamma, beta=self.beta, eps=1e-08)

-            rterm = self.x_norm * np.reshape(
-                np.einsum(grad, range(x.ndim), self.x_norm, range(x.ndim), [1]),
-                keepdims_shape,
-            )
-            rterm /= N
-            grad_ -= rterm
-            grad_ /= self._std
-            if (
-                self.gamma is not None
-            ): # backprop through optional affine transformation
-                gamma = self.gamma.data
-                grad_ *= gamma.reshape(keepdims_shape)
-            return grad_
+        batch_mean = x_norm.creator.mean
+        batch_variance = x_norm.creator.var

-        elif index == 1 and self.gamma is not None: # backprop through gamma
-            return np.einsum(grad, range(x.ndim), self.x_norm, range(x.ndim), [1])
-
-        elif (index == 1 and self.gamma is None) or index == 2:
-            normed_dims = tuple(i for i in range(x.ndim) if i != 1)
-            return grad.sum(axis=normed_dims)
-        else: # pragma: no cover
-            raise IndexError
-
-
-def batchnorm(
-    x: ArrayLike,
-    *,
-    gamma: Optional[ArrayLike] = None,
-    beta: Optional[ArrayLike] = None,
-    eps: float,
-    constant: Optional[bool] = None
-) -> Tensor:
-    """
-    Performs batch normalization on ``x``::
-
-        y(x) = (x - E[x]) / sqrt(Var[x] + eps)
-        batchnorm(x) = gamma * y(x) + beta
-
-    Where :math:`E[x]` and :math:`Var[x]` represent the mean and variance, respectively,
-    over axis-1 of ``x``. The subsequent affine transformation on ``y``
-    is optional.
-
-    Parameters
-    ----------
-    x : array_like, shape=(N, C, ...)
-        The batch to be normalized within each entry of C
-
-    gamma : Optional[array_like], shape=(C,)
-        Optional per-channel scaling factors to be applied after the
-        normalization step.
-
-    beta : Optional[array_like], shape=(C,)
-        Optional per-channel scaling bias factors to be applied after the
-        normalization step.
-
-    eps : Real
-        A small non-negative number.
-
-    constant : bool, optional (default=False)
-        If True, the resulting Tensor is a constant.
-
-    Returns
-    -------
-    mygrad.Tensor
-        The batch-normalized data.
-
-    Examples
-    --------
-    >>> import mygrad as mg
-    >>> from mygrad.nnet import batchnorm
-    >>> x = mg.Tensor([1., 4., 1.]).reshape(3, 1)
-    >>> batchnorm(x, eps=0)
-    Tensor([[-0.70710678],
-            [ 1.41421356],
-            [-0.70710678]])
-    """
-    # pass gamma and beta as empty arrays if they are not supplied
-    if gamma is None:
-        gamma = np.array([])
-    if beta is None:
-        beta = np.array([])
-    return Tensor._op(
-        BatchNorm, x, gamma, beta, op_kwargs=dict(eps=eps), constant=constant
-    )
+        self.moving_mean *= 1 - self.momentum
+        self.moving_mean += self.momentum * batch_mean
+        self.moving_variance *= 1 - self.momentum
+        self.moving_variance += self.momentum * batch_variance
+        return x_norm
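For reviewers, a usage sketch of the new layer class defined above. Only the `BatchNorm(input_channels, momentum)` constructor and the `__call__(x, test=...)` signature come from this diff; the batch shape and the toy loss are illustrative assumptions:

# Illustrative sketch of the layer API introduced in this diff (shapes and loss are assumptions).
import numpy as np
import mygrad as mg
from mygrad.nnet.layers.batchnorm import BatchNorm

bn = BatchNorm(input_channels=4, momentum=0.1)

x_train = mg.Tensor(np.random.rand(32, 4))  # assumed shape-(N, C) training batch
out = bn(x_train)        # training mode: normalizes with batch statistics and
                         # updates bn.moving_mean / bn.moving_variance
loss = out.sum()
loss.backward()          # gradients can then flow back to bn.gamma and bn.beta

x_test = mg.Tensor(np.random.rand(8, 4))
out_test = bn(x_test, test=True)  # inference mode: uses the stored moving statistics

Note that the moving statistics are exponential moving averages: each training call scales the stored value by `1 - momentum` and adds `momentum` times the current batch statistic.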
13 changes: 13 additions & 0 deletions src/mygrad/nnet/layers/operations/__init__.py
@@ -0,0 +1,13 @@
+from .batchnorm import batchnorm
+from .conv import conv_nd
+from .pooling import max_pool
+
+__all__ = ["conv_nd", "max_pool", "batchnorm"]
+
+
+try:
+    from .gru import gru
+
+    __all__ += ["gru"]
+except ImportError:  # pragma: no cover
+    pass
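Taken together with the deletion of src/mygrad/nnet/layers/__init__.py above, the functional operations now live in a `mygrad.nnet.layers.operations` subpackage, leaving `mygrad.nnet.layers` for layer classes such as `BatchNorm`. A sketch of the resulting import paths; whether `mygrad.nnet` keeps re-exporting these names is not shown in this diff:

# New import locations after this refactor (per the diffs above).
from mygrad.nnet.layers.operations import batchnorm, conv_nd, max_pool
from mygrad.nnet.layers.batchnorm import BatchNorm

# `gru` is exposed only when its optional dependencies import cleanly,
# mirroring the try/except in operations/__init__.py.
try:
    from mygrad.nnet.layers.operations import gru
except ImportError:
    gru = None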