diff --git a/src/mygrad/__init__.py b/src/mygrad/__init__.py index fec278c2..dcda5549 100644 --- a/src/mygrad/__init__.py +++ b/src/mygrad/__init__.py @@ -24,7 +24,7 @@ from mygrad.math.sequential.funcs import * from mygrad.math.sequential.funcs import max, min from mygrad.math.trigonometric.funcs import * -from mygrad.nnet.layers.utils import sliding_window_view +from mygrad.nnet.layers.operations.utils import sliding_window_view from mygrad.no_grad_funcs import * from mygrad.tensor_creation.funcs import * from mygrad.tensor_manip.array_shape.funcs import * diff --git a/src/mygrad/nnet/layers/__init__.py b/src/mygrad/nnet/layers/__init__.py index c8e0b485..e69de29b 100644 --- a/src/mygrad/nnet/layers/__init__.py +++ b/src/mygrad/nnet/layers/__init__.py @@ -1,13 +0,0 @@ -from .batchnorm import batchnorm -from .conv import conv_nd -from .pooling import max_pool - -__all__ = ["conv_nd", "max_pool", "batchnorm"] - - -try: - from .gru import gru - - __all__ += ["gru"] -except ImportError: # pragma: no cover - pass diff --git a/src/mygrad/nnet/layers/batchnorm.py b/src/mygrad/nnet/layers/batchnorm.py index f98d35bf..08044dc3 100644 --- a/src/mygrad/nnet/layers/batchnorm.py +++ b/src/mygrad/nnet/layers/batchnorm.py @@ -1,173 +1,60 @@ -from typing import Optional, Tuple, Union - import numpy as np from mygrad import Tensor -from mygrad.operation_base import Operation -from mygrad.typing import ArrayLike - -__all__ = ["batchnorm"] +from mygrad.tensor_creation.funcs import ones, zeros +from mygrad.nnet.layers.operations.batchnorm import batchnorm as batchnorm_op -# TODO: Remove affine parameters from Operation -class BatchNorm(Operation): - """ - Attributes - ---------- - mean : numpy.ndarray - var : numpy.ndarray +class BatchNorm: + """ A batch normalization layer. - Notes - ----- - `mean` and `var` are bound as instance-attributes upon - calling the batch-norm instance. + This class will perform an n-dimensional batch normalization operation on an + (N, D, ...)-shaped tensor scaled by γ of shape (D, ...) and shifted by β of shape (D, ...). """ - def __call__(self, x, gamma, beta, *, eps): - """ - y(x) = (x - E[x]) / sqrt(Var[x} + eps) - batchnorm(x) = gamma * y(x) + beta + def __init__(self, input_channels: int, momentum: float = 0.1): + """ Initialize a batch normalization layer. Parameters ---------- - x : mygrad.Tensor - gamma : Optional[mygrad.Tensor] - beta : Optional[mygrad.Tensor] - eps : Real - A small non-negative number. - - Returns - ------- - numpy.ndarray + input_channels : int + The number of channels of the data to be batch-normalized. + momentum : float, optional (default=0.1) + The momentum value used to maintain moving averages. """ - normed_dims = tuple(i for i in range(x.ndim) if i != 1) - keepdims_shape = tuple(1 if n != 1 else d for n, d in enumerate(x.shape)) - - self.variables = tuple(i for i in (x, gamma, beta)) - - if gamma.size == 0: - gamma = None - if beta.size == 0: - beta = None + self.gamma = ones((1, input_channels), dtype=np.float32) + self.beta = zeros((1, input_channels), dtype=np.float32) + self.moving_mean = np.zeros((1, input_channels), dtype=np.float32) + self.moving_variance = np.zeros((1, input_channels), dtype=np.float32) + self.momentum = momentum + self.input_channels = input_channels - self.gamma = gamma - self.beta = beta + def __call__(self, x: Tensor, test: bool = False) -> Tensor: + """ Perform the forward-pass of n-dimensional batch normalization over axis 1 on `x`. 
- x = x.data - self.x_norm = None # required for backprop through gamma - self.mean = x.mean(axis=normed_dims) - self.var = x.var(axis=normed_dims) - - if eps: - self.var += eps - - y = x - self.mean.reshape(keepdims_shape) - self._std = np.sqrt(self.var).reshape(keepdims_shape) # sqrt(var + eps) - y /= self._std - self.x_norm = y - # optional affine transformation - if gamma is not None: - gamma = gamma.data - # must copy `y` to prevent mutation of `self.x_norm` - y = y * gamma.reshape(keepdims_shape) - - if beta is not None: - beta = beta.data - y = y + beta.reshape(keepdims_shape) - return y - - def backward_var(self, grad, index, **kwargs): - x = self.variables[0].data - if index == 0: # backprop through x - normed_dims = tuple(i for i in range(x.ndim) if i != 1) + Parameters + ---------- + x : Union[numpy.ndarray, mygrad.Tensor], shape=(N, D, ...) + The data to normalize. + test : boolean, optional (default=False) + Determines whether the layer is being used at training time. The mean and variance + will be computed for the batch during training, while averaged batch statistics will + be used at test time. + """ + if test: + # use the averaged batch statistics from training rather than computing them on a test batch keepdims_shape = tuple(1 if n != 1 else d for n, d in enumerate(x.shape)) - N = x.size / x.shape[1] + x = x - self.moving_mean.reshape(keepdims_shape) + x /= np.sqrt(self.moving_variance.reshape(keepdims_shape) + 1e-08) + return self.gamma * x + self.beta - # all sums carried over non-channel dims - # (1/sqrt(var + eps)) * [dL - dL.mean() - (1/N)*x_norm*(x_norm @ dL)] - grad_ = grad - np.mean(grad, axis=normed_dims, keepdims=True) + x_norm = batchnorm_op(x, gamma=self.gamma, beta=self.beta, eps=1e-08) - rterm = self.x_norm * np.reshape( - np.einsum(grad, range(x.ndim), self.x_norm, range(x.ndim), [1]), - keepdims_shape, - ) - rterm /= N - grad_ -= rterm - grad_ /= self._std - if ( - self.gamma is not None - ): # backprop through optional affine transformation - gamma = self.gamma.data - grad_ *= gamma.reshape(keepdims_shape) - return grad_ + batch_mean = x_norm.creator.mean + batch_variance = x_norm.creator.var - elif index == 1 and self.gamma is not None: # backprop through gamma - return np.einsum(grad, range(x.ndim), self.x_norm, range(x.ndim), [1]) - - elif (index == 1 and self.gamma is None) or index == 2: - normed_dims = tuple(i for i in range(x.ndim) if i != 1) - return grad.sum(axis=normed_dims) - else: # pragma: no cover - raise IndexError - - -def batchnorm( - x: ArrayLike, - *, - gamma: Optional[ArrayLike] = None, - beta: Optional[ArrayLike] = None, - eps: float, - constant: Optional[bool] = None -) -> Tensor: - """ - Performs batch normalization on ``x``:: - - y(x) = (x - E[x]) / sqrt(Var[x] + eps) - batchnorm(x) = gamma * y(x) + beta - - Where :math:`E[x]` and :math:`Var[x]` represent the mean and variance, respectively, - over axis-1 of ``x``. The subsequent affine transformation on ``y`` - is optional. - - Parameters - ---------- - x : array_like, shape=(N, C, ...) - The batch to be normalized within each entry of C - - gamma : Optional[array_like], shape=(C,) - Optional per-channel scaling factors to be applied after the - normalization step. - - beta : Optional[array_like], shape=(C,) - Optional per-channel scaling bias factors to be applied after the - normalization step. - - eps : Real - A small non-negative number. - - constant : bool, optional (default=False) - If True, the resulting Tensor is a constant. 
- - Returns - ------- - mygrad.Tensor - The batch-normalized data. - - Examples - -------- - >>> import mygrad as mg - >>> from mygrad.nnet import batchnorm - >>> x = mg.Tensor([1., 4., 1.]).reshape(3, 1) - >>> batchnorm(x, eps=0) - Tensor([[-0.70710678], - [ 1.41421356], - [-0.70710678]]) - """ - # pass gamma and beta as empty arrays if they are not supplied - if gamma is None: - gamma = np.array([]) - if beta is None: - beta = np.array([]) - return Tensor._op( - BatchNorm, x, gamma, beta, op_kwargs=dict(eps=eps), constant=constant - ) + self.moving_mean *= 1 - self.momentum + self.moving_mean += self.momentum * batch_mean + self.moving_variance *= 1 - self.momentum + self.moving_variance += self.momentum * batch_variance + return x_norm diff --git a/src/mygrad/nnet/layers/operations/__init__.py b/src/mygrad/nnet/layers/operations/__init__.py new file mode 100644 index 00000000..c8e0b485 --- /dev/null +++ b/src/mygrad/nnet/layers/operations/__init__.py @@ -0,0 +1,13 @@ +from .batchnorm import batchnorm +from .conv import conv_nd +from .pooling import max_pool + +__all__ = ["conv_nd", "max_pool", "batchnorm"] + + +try: + from .gru import gru + + __all__ += ["gru"] +except ImportError: # pragma: no cover + pass diff --git a/src/mygrad/nnet/layers/operations/batchnorm.py b/src/mygrad/nnet/layers/operations/batchnorm.py new file mode 100644 index 00000000..f98d35bf --- /dev/null +++ b/src/mygrad/nnet/layers/operations/batchnorm.py @@ -0,0 +1,173 @@ +from typing import Optional, Tuple, Union + +import numpy as np + +from mygrad import Tensor +from mygrad.operation_base import Operation +from mygrad.typing import ArrayLike + +__all__ = ["batchnorm"] + + +# TODO: Remove affine parameters from Operation +class BatchNorm(Operation): + """ + Attributes + ---------- + mean : numpy.ndarray + var : numpy.ndarray + + Notes + ----- + `mean` and `var` are bound as instance-attributes upon + calling the batch-norm instance. + """ + + def __call__(self, x, gamma, beta, *, eps): + """ + y(x) = (x - E[x]) / sqrt(Var[x} + eps) + batchnorm(x) = gamma * y(x) + beta + + Parameters + ---------- + x : mygrad.Tensor + gamma : Optional[mygrad.Tensor] + beta : Optional[mygrad.Tensor] + eps : Real + A small non-negative number. 
+ + Returns + ------- + numpy.ndarray + """ + normed_dims = tuple(i for i in range(x.ndim) if i != 1) + keepdims_shape = tuple(1 if n != 1 else d for n, d in enumerate(x.shape)) + + self.variables = tuple(i for i in (x, gamma, beta)) + + if gamma.size == 0: + gamma = None + if beta.size == 0: + beta = None + + self.gamma = gamma + self.beta = beta + + x = x.data + self.x_norm = None # required for backprop through gamma + self.mean = x.mean(axis=normed_dims) + self.var = x.var(axis=normed_dims) + + if eps: + self.var += eps + + y = x - self.mean.reshape(keepdims_shape) + self._std = np.sqrt(self.var).reshape(keepdims_shape) # sqrt(var + eps) + y /= self._std + self.x_norm = y + # optional affine transformation + if gamma is not None: + gamma = gamma.data + # must copy `y` to prevent mutation of `self.x_norm` + y = y * gamma.reshape(keepdims_shape) + + if beta is not None: + beta = beta.data + y = y + beta.reshape(keepdims_shape) + return y + + def backward_var(self, grad, index, **kwargs): + x = self.variables[0].data + if index == 0: # backprop through x + normed_dims = tuple(i for i in range(x.ndim) if i != 1) + keepdims_shape = tuple(1 if n != 1 else d for n, d in enumerate(x.shape)) + N = x.size / x.shape[1] + + # all sums carried over non-channel dims + # (1/sqrt(var + eps)) * [dL - dL.mean() - (1/N)*x_norm*(x_norm @ dL)] + grad_ = grad - np.mean(grad, axis=normed_dims, keepdims=True) + + rterm = self.x_norm * np.reshape( + np.einsum(grad, range(x.ndim), self.x_norm, range(x.ndim), [1]), + keepdims_shape, + ) + rterm /= N + grad_ -= rterm + grad_ /= self._std + if ( + self.gamma is not None + ): # backprop through optional affine transformation + gamma = self.gamma.data + grad_ *= gamma.reshape(keepdims_shape) + return grad_ + + elif index == 1 and self.gamma is not None: # backprop through gamma + return np.einsum(grad, range(x.ndim), self.x_norm, range(x.ndim), [1]) + + elif (index == 1 and self.gamma is None) or index == 2: + normed_dims = tuple(i for i in range(x.ndim) if i != 1) + return grad.sum(axis=normed_dims) + else: # pragma: no cover + raise IndexError + + +def batchnorm( + x: ArrayLike, + *, + gamma: Optional[ArrayLike] = None, + beta: Optional[ArrayLike] = None, + eps: float, + constant: Optional[bool] = None +) -> Tensor: + """ + Performs batch normalization on ``x``:: + + y(x) = (x - E[x]) / sqrt(Var[x] + eps) + batchnorm(x) = gamma * y(x) + beta + + Where :math:`E[x]` and :math:`Var[x]` represent the mean and variance, respectively, + over axis-1 of ``x``. The subsequent affine transformation on ``y`` + is optional. + + Parameters + ---------- + x : array_like, shape=(N, C, ...) + The batch to be normalized within each entry of C + + gamma : Optional[array_like], shape=(C,) + Optional per-channel scaling factors to be applied after the + normalization step. + + beta : Optional[array_like], shape=(C,) + Optional per-channel scaling bias factors to be applied after the + normalization step. + + eps : Real + A small non-negative number. + + constant : bool, optional (default=False) + If True, the resulting Tensor is a constant. + + Returns + ------- + mygrad.Tensor + The batch-normalized data. 
+ + Examples + -------- + >>> import mygrad as mg + >>> from mygrad.nnet import batchnorm + >>> x = mg.Tensor([1., 4., 1.]).reshape(3, 1) + >>> batchnorm(x, eps=0) + Tensor([[-0.70710678], + [ 1.41421356], + [-0.70710678]]) + """ + # pass gamma and beta as empty arrays if they are not supplied + if gamma is None: + gamma = np.array([]) + if beta is None: + beta = np.array([]) + return Tensor._op( + BatchNorm, x, gamma, beta, op_kwargs=dict(eps=eps), constant=constant + ) diff --git a/src/mygrad/nnet/layers/conv.py b/src/mygrad/nnet/layers/operations/conv.py similarity index 99% rename from src/mygrad/nnet/layers/conv.py rename to src/mygrad/nnet/layers/operations/conv.py index aa668a9f..c83bf8b4 100644 --- a/src/mygrad/nnet/layers/conv.py +++ b/src/mygrad/nnet/layers/operations/conv.py @@ -3,7 +3,7 @@ import numpy as np -from mygrad.nnet.layers.utils import sliding_window_view +from mygrad.nnet.layers.operations.utils import sliding_window_view from mygrad.operation_base import Operation from mygrad.tensor_base import Tensor from mygrad.typing import ArrayLike diff --git a/src/mygrad/nnet/layers/gru.py b/src/mygrad/nnet/layers/operations/gru.py similarity index 100% rename from src/mygrad/nnet/layers/gru.py rename to src/mygrad/nnet/layers/operations/gru.py diff --git a/src/mygrad/nnet/layers/pooling.py b/src/mygrad/nnet/layers/operations/pooling.py similarity index 99% rename from src/mygrad/nnet/layers/pooling.py rename to src/mygrad/nnet/layers/operations/pooling.py index 439f9df3..3d1f07ab 100644 --- a/src/mygrad/nnet/layers/pooling.py +++ b/src/mygrad/nnet/layers/operations/pooling.py @@ -3,7 +3,7 @@ import numpy as np -from mygrad.nnet.layers.utils import sliding_window_view +from mygrad.nnet.layers.operations.utils import sliding_window_view from mygrad.operation_base import Operation from mygrad.tensor_base import Tensor from mygrad.typing import ArrayLike diff --git a/src/mygrad/nnet/layers/utils.py b/src/mygrad/nnet/layers/operations/utils.py similarity index 100% rename from src/mygrad/nnet/layers/utils.py rename to src/mygrad/nnet/layers/operations/utils.py diff --git a/src/mygrad/tensor_manip/tiling/ops.py b/src/mygrad/tensor_manip/tiling/ops.py index 3b27d91a..82ce749b 100644 --- a/src/mygrad/tensor_manip/tiling/ops.py +++ b/src/mygrad/tensor_manip/tiling/ops.py @@ -2,7 +2,7 @@ import numpy as np -from mygrad.nnet.layers.utils import sliding_window_view +from mygrad.nnet.layers.operations.utils import sliding_window_view from mygrad.operation_base import Operation from mygrad.tensor_base import Tensor diff --git a/tests/nnet/initializers/test_dirac.py b/tests/nnet/initializers/test_dirac.py index c8ce8df7..270ee960 100644 --- a/tests/nnet/initializers/test_dirac.py +++ b/tests/nnet/initializers/test_dirac.py @@ -6,7 +6,7 @@ from mygrad import Tensor from mygrad.nnet.initializers import dirac -from mygrad.nnet.layers.conv import conv_nd +from mygrad.nnet.layers.operations.conv import conv_nd @given(shape=hnp.array_shapes(max_dims=1)) diff --git a/tests/nnet/layers/test_batchnorm.py b/tests/nnet/layers/test_batchnorm.py index 5953acaf..d7987912 100644 --- a/tests/nnet/layers/test_batchnorm.py +++ b/tests/nnet/layers/test_batchnorm.py @@ -6,7 +6,7 @@ import mygrad as mg from mygrad import Tensor -from mygrad.nnet.layers.batchnorm import batchnorm +from mygrad.nnet.layers.operations.batchnorm import batchnorm from tests.wrappers.uber import backprop_test_factory, fwdprop_test_factory diff --git a/tests/nnet/layers/test_conv.py b/tests/nnet/layers/test_conv.py index 
2b1d01b2..8173e9cf 100644 --- a/tests/nnet/layers/test_conv.py +++ b/tests/nnet/layers/test_conv.py @@ -11,7 +11,7 @@ import mygrad as mg from mygrad import Tensor -from mygrad.nnet.layers import conv_nd +from mygrad.nnet.layers.operations import conv_nd from ...utils.numerical_gradient import numerical_gradient_full from ...wrappers.uber import backprop_test_factory, fwdprop_test_factory diff --git a/tests/nnet/layers/test_gru.py b/tests/nnet/layers/test_gru.py index e34fb555..c2be41f2 100644 --- a/tests/nnet/layers/test_gru.py +++ b/tests/nnet/layers/test_gru.py @@ -13,7 +13,7 @@ from mygrad.tensor_base import Tensor from tests.utils.errors import does_not_raise -gru_module = pytest.importorskip("mygrad.nnet.layers.gru") +gru_module = pytest.importorskip("mygrad.nnet.layers.operations.gru") gru = gru_module.gru diff --git a/tests/nnet/layers/test_maxpool.py b/tests/nnet/layers/test_maxpool.py index b106f601..2df7c7d5 100644 --- a/tests/nnet/layers/test_maxpool.py +++ b/tests/nnet/layers/test_maxpool.py @@ -2,7 +2,7 @@ from numpy.testing import assert_allclose from pytest import raises -from mygrad.nnet.layers import max_pool +from mygrad.nnet.layers.operations import max_pool from mygrad.tensor_base import Tensor diff --git a/tests/nnet/test_sliding_window.py b/tests/nnet/test_sliding_window.py index 219b298b..5928da37 100644 --- a/tests/nnet/test_sliding_window.py +++ b/tests/nnet/test_sliding_window.py @@ -5,7 +5,7 @@ from hypothesis import given, settings from numpy.testing import assert_allclose -from mygrad.nnet.layers.utils import sliding_window_view +from mygrad.nnet.layers.operations.utils import sliding_window_view dtype_strat_numpy = st.sampled_from( (np.int8, np.int16, np.int32, np.int64, np.float16, np.float32, np.float64)
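
A minimal usage sketch of the new class-based `BatchNorm` layer added in src/mygrad/nnet/layers/batchnorm.py, which wraps the relocated functional op and maintains moving statistics via its momentum term. The class name, its `__call__(x, test=...)` signature, and the `mygrad.nnet.layers.operations` import path come from the diff above; the variable names, channel count, and batch shapes are arbitrary illustrations, not part of the change.

    import numpy as np
    import mygrad as mg
    from mygrad.nnet.layers.operations import batchnorm  # functional op, relocated
    from mygrad.nnet.layers.batchnorm import BatchNorm   # new stateful layer

    bn = BatchNorm(input_channels=10, momentum=0.1)

    x = mg.Tensor(np.random.rand(32, 10))  # shape-(N, D) training batch
    out = bn(x)      # training-time call: normalizes with batch statistics and
                     # updates bn.moving_mean / bn.moving_variance
    out.backward()   # backpropagate through the normalization

    x_test = np.random.rand(8, 10)
    test_out = bn(x_test, test=True)  # test-time call: uses the stored moving averages

    # the functional form remains available at its new path
    y = batchnorm(mg.Tensor(np.random.rand(4, 10)), eps=1e-8)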